--list-extractors List all supported extractors and the URLs they
would handle
--extractor-descriptions Output descriptions of all supported extractors
- --proxy URL Use the specified HTTP/HTTPS proxy
+ --proxy URL Use the specified HTTP/HTTPS proxy. Pass in an
+ empty string (--proxy "") for direct connection
--no-check-certificate Suppress HTTPS certificate validation.
--cache-dir DIR Location in the filesystem where youtube-dl can
store downloaded information permanently. By
--dateafter DATE download only videos uploaded after this date
--no-playlist download only the currently playing video
--age-limit YEARS download only videos suitable for the given age
- --download-archive FILE Download only videos not present in the archive
- file. Record all downloaded videos in it.
+ --download-archive FILE Download only videos not listed in the archive
+ file. Record the IDs of all downloaded videos in
+ it.
## Download Options:
-r, --rate-limit LIMIT maximum download rate in bytes per second (e.g.
--get-description simulate, quiet but print video description
--get-filename simulate, quiet but print output filename
--get-format simulate, quiet but print output format
+ -j, --dump-json simulate, quiet but print JSON information
--newline output progress bar as new lines
--no-progress do not print progress bar
--console-title display progress in console titlebar
-v, --verbose print various debugging information
--dump-intermediate-pages print downloaded pages to debug problems(very
verbose)
- --write-pages Write downloaded pages to files in the current
- directory
+ --write-pages Write downloaded intermediary pages to files in
+ the current directory to debug problems
## Video Format Options:
- -f, --format FORMAT video format code, specifiy the order of
+ -f, --format FORMAT video format code, specify the order of
preference using slashes: "-f 22/17/18". "-f mp4"
and "-f flv" are also supported
--all-formats download all available video formats
# CONFIGURATION
-You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl.conf`.
+You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl.conf`. On Windows, the configuration file locations are `%APPDATA%\youtube-dl\config.txt` and `C:\Users\<Yourname>\youtube-dl.conf`.
# OUTPUT TEMPLATE
--list-extractors List all supported extractors and the URLs they
would handle
--extractor-descriptions Output descriptions of all supported extractors
- --proxy URL Use the specified HTTP/HTTPS proxy
+ --proxy URL Use the specified HTTP/HTTPS proxy. Pass in an
+ empty string (--proxy "") for direct connection
--no-check-certificate Suppress HTTPS certificate validation.
--cache-dir DIR Location in the filesystem where youtube-dl can
store downloaded information permanently. By
--dateafter DATE download only videos uploaded after this date
--no-playlist download only the currently playing video
--age-limit YEARS download only videos suitable for the given age
- --download-archive FILE Download only videos not present in the archive
- file. Record all downloaded videos in it.
+ --download-archive FILE Download only videos not listed in the archive
+ file. Record the IDs of all downloaded videos in
+ it.
Download Options:
-----------------
--get-description simulate, quiet but print video description
--get-filename simulate, quiet but print output filename
--get-format simulate, quiet but print output format
+ -j, --dump-json simulate, quiet but print JSON information
--newline output progress bar as new lines
--no-progress do not print progress bar
--console-title display progress in console titlebar
-v, --verbose print various debugging information
--dump-intermediate-pages print downloaded pages to debug problems(very
verbose)
- --write-pages Write downloaded pages to files in the current
- directory
+ --write-pages Write downloaded intermediary pages to files in
+ the current directory to debug problems
Video Format Options:
---------------------
- -f, --format FORMAT video format code, specifiy the order of
+ -f, --format FORMAT video format code, specify the order of
preference using slashes: "-f 22/17/18". "-f mp4"
and "-f flv" are also supported
--all-formats download all available video formats
You can configure youtube-dl by placing default arguments (such as
--extract-audio --no-mtime to always extract the audio and not copy the
-mtime) into /etc/youtube-dl.conf and/or ~/.config/youtube-dl.conf.
+mtime) into /etc/youtube-dl.conf and/or ~/.config/youtube-dl.conf. On
+Windows, the configuration file locations are
+%APPDATA%\youtube-dl\config.txt and C:\Users\<Yourname>\youtube-dl.conf.
OUTPUT TEMPLATE
===============
__youtube_dl()
{
- local cur prev opts
+ local cur prev opts fileopts diropts keywords
COMPREPLY=()
cur="${COMP_WORDS[COMP_CWORD]}"
+ prev="${COMP_WORDS[COMP_CWORD-1]}"
opts="{{flags}}"
- keywords=":ytfavorites :ytrecommended :ytsubscriptions :ytwatchlater"
+ keywords=":ytfavorites :ytrecommended :ytsubscriptions :ytwatchlater :ythistory"
+ fileopts="-a|--batch-file|--download-archive|--cookies"
+ diropts="--cache-dir"
+
+    # Anchor the regex so that only an exact option name matches; an unanchored
+    # "-a" would also match e.g. "--abort-on-error" or "--audio-format".
+    if [[ ${prev} =~ ^(${fileopts})$ ]]; then
+        COMPREPLY=( $(compgen -f -- "${cur}") )
+        return 0
+    elif [[ ${prev} =~ ^(${diropts})$ ]]; then
+        COMPREPLY=( $(compgen -d -- "${cur}") )
+        return 0
+    fi
if [[ ${cur} =~ : ]]; then
COMPREPLY=( $(compgen -W "${keywords}" -- ${cur}) )
'data_files': [ # Installing system-wide would require sudo...
('etc/bash_completion.d', ['youtube-dl.bash-completion']),
('share/doc/youtube_dl', ['README.txt']),
- ('share/man/man1/', ['youtube-dl.1'])
+ ('share/man/man1', ['youtube-dl.1'])
]
}
if setuptools_available:
from youtube_dl.utils import preferredencoding
-def global_setup():
- youtube_dl._setup_opener(timeout=10)
-
-
def get_params(override=None):
PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)),
"parameters.json")
"writeinfojson": true,
"writesubtitles": false,
"allsubtitles": false,
- "listssubtitles": false
+ "listssubtitles": false,
+ "socket_timeout": 20
}
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from test.helper import global_setup, try_rm
-global_setup()
+from test.helper import try_rm
from youtube_dl import YoutubeDL
}
ydl = YoutubeDL(params)
ydl.add_default_info_extractors()
- json_filename = filename + '.info.json'
+ json_filename = os.path.splitext(filename)[0] + '.info.json'
try_rm(json_filename)
ydl.download([url])
res = os.path.exists(json_filename)
def test_keywords(self):
self.assertMatch(':ytsubs', ['youtube:subscriptions'])
self.assertMatch(':ytsubscriptions', ['youtube:subscriptions'])
- self.assertMatch(':thedailyshow', ['ComedyCentral'])
- self.assertMatch(':tds', ['ComedyCentral'])
- self.assertMatch(':colbertreport', ['ComedyCentral'])
- self.assertMatch(':cr', ['ComedyCentral'])
+ self.assertMatch(':ythistory', ['youtube:history'])
+ self.assertMatch(':thedailyshow', ['ComedyCentralShows'])
+ self.assertMatch(':tds', ['ComedyCentralShows'])
+ self.assertMatch(':colbertreport', ['ComedyCentralShows'])
+ self.assertMatch(':cr', ['ComedyCentralShows'])
+
+ def test_vimeo_matching(self):
+ # A Vimeo channel URL is expected to map to 'vimeo:channel' and a Vimeo
+ # user URL to 'vimeo:user' (checked through the assertMatch helper).
+ self.assertMatch('http://vimeo.com/channels/tributes', ['vimeo:channel'])
+ self.assertMatch('http://vimeo.com/user7108434', ['vimeo:user'])
if __name__ == '__main__':
from test.helper import (
get_params,
get_testcases,
- global_setup,
try_rm,
md5,
report_warning
)
-global_setup()
import hashlib
tc_filename = get_tc_filename(tc)
try_rm(tc_filename)
try_rm(tc_filename + '.part')
- try_rm(tc_filename + '.info.json')
+ try_rm(os.path.splitext(tc_filename)[0] + '.info.json')
try_rm_tcs_files()
try:
try_num = 1
if not test_case.get('params', {}).get('skip_download', False):
self.assertTrue(os.path.exists(tc_filename), msg='Missing file ' + tc_filename)
self.assertTrue(tc_filename in finished_hook_called)
- self.assertTrue(os.path.exists(tc_filename + '.info.json'))
+ info_json_fn = os.path.splitext(tc_filename)[0] + '.info.json'
+ self.assertTrue(os.path.exists(info_json_fn))
if 'md5' in tc:
md5_for_file = _file_md5(tc_filename)
self.assertEqual(md5_for_file, tc['md5'])
- with io.open(tc_filename + '.info.json', encoding='utf-8') as infof:
+ with io.open(info_json_fn, encoding='utf-8') as infof:
info_dict = json.load(infof)
for (info_field, expected) in tc.get('info_dict', {}).items():
if isinstance(expected, compat_str) and expected.startswith('md5:'):
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from test.helper import FakeYDL, global_setup
-global_setup()
+from test.helper import FakeYDL
from youtube_dl.extractor import (
DailymotionPlaylistIE,
DailymotionUserIE,
VimeoChannelIE,
+ VimeoUserIE,
UstreamChannelIE,
SoundcloudSetIE,
SoundcloudUserIE,
LivestreamIE,
NHLVideocenterIE,
BambuserChannelIE,
+ BandcampAlbumIE,
+ SmotriCommunityIE,
+ SmotriUserIE
)
self.assertEqual(result['title'], u'Vimeo Tributes')
self.assertTrue(len(result['entries']) > 24)
+ def test_vimeo_user(self):
+ # Run VimeoUserIE on the 'nkistudio' videos page and expect a playlist
+ # titled 'Nki' that contains more than 65 entries.
+ dl = FakeYDL()
+ ie = VimeoUserIE(dl)
+ result = ie.extract('http://vimeo.com/nkistudio/videos')
+ self.assertIsPlaylist(result)
+ self.assertEqual(result['title'], u'Nki')
+ self.assertTrue(len(result['entries']) > 65)
+
def test_ustream_channel(self):
dl = FakeYDL()
ie = UstreamChannelIE(dl)
result = ie.extract('http://bambuser.com/channel/pixelversity')
self.assertIsPlaylist(result)
self.assertEqual(result['title'], u'pixelversity')
- self.assertTrue(len(result['entries']) >= 66)
+ self.assertTrue(len(result['entries']) >= 60)
+
+ def test_bandcamp_album(self):
+ # Run BandcampAlbumIE on an album URL and expect a playlist titled
+ # 'Nightmare Night EP' with at least 4 entries.
+ dl = FakeYDL()
+ ie = BandcampAlbumIE(dl)
+ result = ie.extract('http://mpallante.bandcamp.com/album/nightmare-night-ep')
+ self.assertIsPlaylist(result)
+ self.assertEqual(result['title'], u'Nightmare Night EP')
+ self.assertTrue(len(result['entries']) >= 4)
+
+ def test_smotri_community(self):
+ # Run SmotriCommunityIE on the 'kommuna' community page and expect a
+ # playlist whose id is 'kommuna', whose title is u'КПРФ' and which has
+ # at least 4 entries.
+ dl = FakeYDL()
+ ie = SmotriCommunityIE(dl)
+ result = ie.extract('http://smotri.com/community/video/kommuna')
+ self.assertIsPlaylist(result)
+ self.assertEqual(result['id'], u'kommuna')
+ self.assertEqual(result['title'], u'КПРФ')
+ self.assertTrue(len(result['entries']) >= 4)
+
+ def test_smotri_user(self):
+ # Run SmotriUserIE on the 'inspector' user page and expect a playlist
+ # whose id is 'inspector', whose title is 'Inspector' and which has at
+ # least 9 entries.
+ dl = FakeYDL()
+ ie = SmotriUserIE(dl)
+ result = ie.extract('http://smotri.com/user/inspector')
+ self.assertIsPlaylist(result)
+ self.assertEqual(result['id'], u'inspector')
+ self.assertEqual(result['title'], u'Inspector')
+ self.assertTrue(len(result['entries']) >= 9)
if __name__ == '__main__':
unittest.main()
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from test.helper import FakeYDL, global_setup, md5
-global_setup()
+from test.helper import FakeYDL, md5
from youtube_dl.extractor import (
self.DL.params['writesubtitles'] = True
self.DL.params['subtitlesformat'] = 'vtt'
subtitles = self.getSubtitles()
- self.assertEqual(md5(subtitles['en']), '356cdc577fde0c6783b9b822e7206ff7')
+ self.assertEqual(md5(subtitles['en']), '3cb210999d3e021bd6c7f0ea751eab06')
def test_youtube_list_subtitles(self):
self.DL.expect_warning(u'Video doesn\'t have automatic captions')
xpath_with_ns,
smuggle_url,
unsmuggle_url,
+ shell_quote,
+ encodeFilename,
)
if sys.version_info < (3, 0):
self.assertEqual(res_url, url)
self.assertEqual(res_data, None)
+ def test_shell_quote(self):
+ # Plain arguments must stay unquoted, while the non-ASCII filename is
+ # wrapped in single quotes with its embedded single quote written as '"'"'.
+ args = ['ffmpeg', '-i', encodeFilename(u'ñ€ß\'.mp4')]
+ self.assertEqual(shell_quote(args), u"""ffmpeg -i 'ñ€ß'"'"'.mp4'""")
+
if __name__ == '__main__':
unittest.main()
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from test.helper import get_params, global_setup, try_rm
-global_setup()
+from test.helper import get_params, try_rm
import io
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from test.helper import get_params, global_setup
-global_setup()
+from test.helper import get_params
import io
TEST_ID = 'BaW_jenozKc'
-INFO_JSON_FILE = TEST_ID + '.mp4.info.json'
+INFO_JSON_FILE = TEST_ID + '.info.json'
DESCRIPTION_FILE = TEST_ID + '.mp4.description'
EXPECTED_DESCRIPTION = u'''test chars: "'/\ä↭𝕐
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from test.helper import FakeYDL, global_setup
-global_setup()
+from test.helper import FakeYDL
from youtube_dl.extractor import (
def test_youtube_playlist(self):
dl = FakeYDL()
ie = YoutubePlaylistIE(dl)
- result = ie.extract('https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re')[0]
+ result = ie.extract('https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re')
self.assertIsPlaylist(result)
self.assertEqual(result['title'], 'ytdl test PL')
ytie_results = [YoutubeIE()._extract_id(url['url']) for url in result['entries']]
def test_issue_673(self):
dl = FakeYDL()
ie = YoutubePlaylistIE(dl)
- result = ie.extract('PLBB231211A4F62143')[0]
+ result = ie.extract('PLBB231211A4F62143')
self.assertTrue(len(result['entries']) > 25)
def test_youtube_playlist_long(self):
dl = FakeYDL()
ie = YoutubePlaylistIE(dl)
- result = ie.extract('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')[0]
+ result = ie.extract('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')
self.assertIsPlaylist(result)
self.assertTrue(len(result['entries']) >= 799)
#651
dl = FakeYDL()
ie = YoutubePlaylistIE(dl)
- result = ie.extract('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')[0]
+ result = ie.extract('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
ytie_results = [YoutubeIE()._extract_id(url['url']) for url in result['entries']]
self.assertFalse('pElCt5oNDuI' in ytie_results)
self.assertFalse('KdPEApIVdWM' in ytie_results)
def test_youtube_playlist_empty(self):
dl = FakeYDL()
ie = YoutubePlaylistIE(dl)
- result = ie.extract('https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx')[0]
+ result = ie.extract('https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx')
self.assertIsPlaylist(result)
self.assertEqual(len(result['entries']), 0)
dl = FakeYDL()
ie = YoutubePlaylistIE(dl)
# TODO find a > 100 (paginating?) videos course
- result = ie.extract('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')[0]
+ result = ie.extract('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
entries = result['entries']
self.assertEqual(YoutubeIE()._extract_id(entries[0]['url']), 'j9WZyLZCBzs')
self.assertEqual(len(entries), 25)
dl = FakeYDL()
ie = YoutubeChannelIE(dl)
#test paginated channel
- result = ie.extract('https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w')[0]
+ result = ie.extract('https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w')
self.assertTrue(len(result['entries']) > 90)
#test autogenerated channel
- result = ie.extract('https://www.youtube.com/channel/HCtnHdj3df7iM/videos')[0]
+ result = ie.extract('https://www.youtube.com/channel/HCtnHdj3df7iM/videos')
self.assertTrue(len(result['entries']) >= 18)
def test_youtube_user(self):
dl = FakeYDL()
ie = YoutubeUserIE(dl)
- result = ie.extract('https://www.youtube.com/user/TheLinuxFoundation')[0]
+ result = ie.extract('https://www.youtube.com/user/TheLinuxFoundation')
self.assertTrue(len(result['entries']) >= 320)
def test_youtube_safe_search(self):
dl = FakeYDL()
ie = YoutubePlaylistIE(dl)
- result = ie.extract('PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl')[0]
+ result = ie.extract('PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl')
self.assertEqual(len(result['entries']), 2)
def test_youtube_show(self):
result = ie.extract('http://www.youtube.com/show/airdisasters')
self.assertTrue(len(result) >= 3)
+ def test_youtube_mix(self):
+ # Run YoutubePlaylistIE on a watch URL carrying a mix list ('RD' + video id)
+ # and expect at least 20 entries, the first one being the video whose id
+ # follows the 'RD' prefix in the list parameter.
+ dl = FakeYDL()
+ ie = YoutubePlaylistIE(dl)
+ result = ie.extract('http://www.youtube.com/watch?v=lLJf9qJHR3E&list=RDrjFaenf1T-Y')
+ entries = result['entries']
+ self.assertTrue(len(entries) >= 20)
+ original_video = entries[0]
+ self.assertEqual(original_video['id'], 'rjFaenf1T-Y')
+
if __name__ == '__main__':
unittest.main()
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from test.helper import global_setup
-global_setup()
-
import io
import re
\-\-list\-extractors\ \ \ \ \ \ \ \ \ \ List\ all\ supported\ extractors\ and\ the\ URLs\ they
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ would\ handle
\-\-extractor\-descriptions\ \ \ Output\ descriptions\ of\ all\ supported\ extractors
-\-\-proxy\ URL\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ Use\ the\ specified\ HTTP/HTTPS\ proxy
+\-\-proxy\ URL\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ Use\ the\ specified\ HTTP/HTTPS\ proxy.\ Pass\ in\ an
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ empty\ string\ (\-\-proxy\ "")\ for\ direct\ connection
\-\-no\-check\-certificate\ \ \ \ \ Suppress\ HTTPS\ certificate\ validation.
\-\-cache\-dir\ DIR\ \ \ \ \ \ \ \ \ \ \ \ Location\ in\ the\ filesystem\ where\ youtube\-dl\ can
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ store\ downloaded\ information\ permanently.\ By
\-\-dateafter\ DATE\ \ \ \ \ \ \ \ \ \ \ download\ only\ videos\ uploaded\ after\ this\ date
\-\-no\-playlist\ \ \ \ \ \ \ \ \ \ \ \ \ \ download\ only\ the\ currently\ playing\ video
\-\-age\-limit\ YEARS\ \ \ \ \ \ \ \ \ \ download\ only\ videos\ suitable\ for\ the\ given\ age
-\-\-download\-archive\ FILE\ \ \ \ Download\ only\ videos\ not\ present\ in\ the\ archive
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ file.\ Record\ all\ downloaded\ videos\ in\ it.
+\-\-download\-archive\ FILE\ \ \ \ Download\ only\ videos\ not\ listed\ in\ the\ archive
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ file.\ Record\ the\ IDs\ of\ all\ downloaded\ videos\ in
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ it.
\f[]
.fi
.SS Download Options:
\-\-get\-description\ \ \ \ \ \ \ \ \ \ simulate,\ quiet\ but\ print\ video\ description
\-\-get\-filename\ \ \ \ \ \ \ \ \ \ \ \ \ simulate,\ quiet\ but\ print\ output\ filename
\-\-get\-format\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ simulate,\ quiet\ but\ print\ output\ format
+\-j,\ \-\-dump\-json\ \ \ \ \ \ \ \ \ \ \ \ simulate,\ quiet\ but\ print\ JSON\ information
\-\-newline\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ output\ progress\ bar\ as\ new\ lines
\-\-no\-progress\ \ \ \ \ \ \ \ \ \ \ \ \ \ do\ not\ print\ progress\ bar
\-\-console\-title\ \ \ \ \ \ \ \ \ \ \ \ display\ progress\ in\ console\ titlebar
\-v,\ \-\-verbose\ \ \ \ \ \ \ \ \ \ \ \ \ \ print\ various\ debugging\ information
\-\-dump\-intermediate\-pages\ \ print\ downloaded\ pages\ to\ debug\ problems(very
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ verbose)
-\-\-write\-pages\ \ \ \ \ \ \ \ \ \ \ \ \ \ Write\ downloaded\ pages\ to\ files\ in\ the\ current
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ directory
+\-\-write\-pages\ \ \ \ \ \ \ \ \ \ \ \ \ \ Write\ downloaded\ intermediary\ pages\ to\ files\ in
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ the\ current\ directory\ to\ debug\ problems
\f[]
.fi
.SS Video Format Options:
.IP
.nf
\f[C]
-\-f,\ \-\-format\ FORMAT\ \ \ \ \ \ \ \ video\ format\ code,\ specifiy\ the\ order\ of
+\-f,\ \-\-format\ FORMAT\ \ \ \ \ \ \ \ video\ format\ code,\ specify\ the\ order\ of
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ preference\ using\ slashes:\ "\-f\ 22/17/18".\ "\-f\ mp4"
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ and\ "\-f\ flv"\ are\ also\ supported
\-\-all\-formats\ \ \ \ \ \ \ \ \ \ \ \ \ \ download\ all\ available\ video\ formats
\f[C]\-\-extract\-audio\ \-\-no\-mtime\f[] to always extract the audio
and not copy the mtime) into \f[C]/etc/youtube\-dl.conf\f[] and/or
\f[C]~/.config/youtube\-dl.conf\f[].
+On Windows, the configuration file locations are
+\f[C]%APPDATA%\\youtube\-dl\\config.txt\f[] and
+\f[C]C:\\Users\\<Yourname>\\youtube\-dl.conf\f[].
.SH OUTPUT TEMPLATE
.PP
The \f[C]\-o\f[] option allows users to indicate a template for the
__youtube_dl()
{
- local cur prev opts
+ local cur prev opts fileopts diropts keywords
COMPREPLY=()
cur="${COMP_WORDS[COMP_CWORD]}"
- opts="--help --version --update --ignore-errors --abort-on-error --dump-user-agent --user-agent --referer --list-extractors --extractor-descriptions --proxy --no-check-certificate --cache-dir --no-cache-dir --playlist-start --playlist-end --match-title --reject-title --max-downloads --min-filesize --max-filesize --date --datebefore --dateafter --no-playlist --age-limit --download-archive --rate-limit --retries --buffer-size --no-resize-buffer --test --title --id --literal --auto-number --output --autonumber-size --restrict-filenames --batch-file --no-overwrites --continue --no-continue --cookies --no-part --no-mtime --write-description --write-info-json --write-annotations --write-thumbnail --quiet --simulate --skip-download --get-url --get-title --get-id --get-thumbnail --get-description --get-filename --get-format --newline --no-progress --console-title --verbose --dump-intermediate-pages --write-pages --youtube-print-sig-code --format --all-formats --prefer-free-formats --max-quality --list-formats --write-sub --write-auto-sub --all-subs --list-subs --sub-format --sub-lang --username --password --netrc --video-password --extract-audio --audio-format --audio-quality --recode-video --keep-video --no-post-overwrites --embed-subs --add-metadata"
- keywords=":ytfavorites :ytrecommended :ytsubscriptions :ytwatchlater"
+ prev="${COMP_WORDS[COMP_CWORD-1]}"
+ opts="--help --version --update --ignore-errors --abort-on-error --dump-user-agent --user-agent --referer --list-extractors --extractor-descriptions --proxy --no-check-certificate --cache-dir --no-cache-dir --socket-timeout --playlist-start --playlist-end --match-title --reject-title --max-downloads --min-filesize --max-filesize --date --datebefore --dateafter --no-playlist --age-limit --download-archive --rate-limit --retries --buffer-size --no-resize-buffer --test --title --id --literal --auto-number --output --autonumber-size --restrict-filenames --batch-file --no-overwrites --continue --no-continue --cookies --no-part --no-mtime --write-description --write-info-json --write-annotations --write-thumbnail --quiet --simulate --skip-download --get-url --get-title --get-id --get-thumbnail --get-description --get-filename --get-format --dump-json --newline --no-progress --console-title --verbose --dump-intermediate-pages --write-pages --youtube-print-sig-code --format --all-formats --prefer-free-formats --max-quality --list-formats --write-sub --write-auto-sub --all-subs --list-subs --sub-format --sub-lang --username --password --netrc --video-password --extract-audio --audio-format --audio-quality --recode-video --keep-video --no-post-overwrites --embed-subs --add-metadata"
+ keywords=":ytfavorites :ytrecommended :ytsubscriptions :ytwatchlater :ythistory"
+ fileopts="-a|--batch-file|--download-archive|--cookies"
+ diropts="--cache-dir"
+
+    # Anchor the regex so that only an exact option name matches; an unanchored
+    # "-a" would also match e.g. "--abort-on-error" or "--audio-format".
+    if [[ ${prev} =~ ^(${fileopts})$ ]]; then
+        COMPREPLY=( $(compgen -f -- "${cur}") )
+        return 0
+    elif [[ ${prev} =~ ^(${diropts})$ ]]; then
+        COMPREPLY=( $(compgen -d -- "${cur}") )
+        return 0
+    fi
if [[ ${cur} =~ : ]]; then
COMPREPLY=( $(compgen -W "${keywords}" -- ${cur}) )
-import math
import os
import re
import subprocess
import sys
import time
-if os.name == 'nt':
- import ctypes
-
from .utils import (
compat_urllib_error,
compat_urllib_request,
ContentTooShortError,
determine_ext,
encodeFilename,
+ format_bytes,
sanitize_open,
timeconvert,
)
self._progress_hooks = []
self.params = params
- @staticmethod
- def format_bytes(bytes):
- if bytes is None:
- return 'N/A'
- if type(bytes) is str:
- bytes = float(bytes)
- if bytes == 0.0:
- exponent = 0
- else:
- exponent = int(math.log(bytes, 1024.0))
- suffix = ['B','KiB','MiB','GiB','TiB','PiB','EiB','ZiB','YiB'][exponent]
- converted = float(bytes) / float(1024 ** exponent)
- return '%.2f%s' % (converted, suffix)
-
@staticmethod
def format_seconds(seconds):
(mins, secs) = divmod(seconds, 60)
def format_speed(speed):
if speed is None:
return '%10s' % '---b/s'
- return '%10s' % ('%s/s' % FileDownloader.format_bytes(speed))
+ return '%10s' % ('%s/s' % format_bytes(speed))
@staticmethod
def best_block_size(elapsed_time, bytes):
def to_stderr(self, message):
self.ydl.to_screen(message)
- def to_cons_title(self, message):
- """Set console/terminal window title to message."""
- if not self.params.get('consoletitle', False):
- return
- if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
- # c_wchar_p() might not be necessary if `message` is
- # already of type unicode()
- ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
- elif 'TERM' in os.environ:
- self.to_screen('\033]0;%s\007' % message, skip_eol=True)
+ def to_console_title(self, message):
+ # Forward the message unchanged to self.ydl.to_console_title.
+ self.ydl.to_console_title(message)
def trouble(self, *args, **kargs):
self.ydl.trouble(*args, **kargs)
else:
self.to_screen(u'\r%s[download] %s of %s at %s ETA %s' %
(clear_line, percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
- self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
+ self.to_console_title(u'youtube-dl - %s of %s at %s ETA %s' %
(percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
def report_resuming_byte(self, resume_len):
(clear_line, data_len_str, self.format_seconds(tot_time)))
def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url, live):
+        def run_rtmpdump(args):
+            """Run rtmpdump with ``args`` and translate its stderr into progress reports.
+
+            stderr is read one byte at a time and split on both '\\r' and '\\n'.
+            Lines matching the progress pattern are passed to report_progress
+            and the progress hooks; any other line is echoed with a
+            '[rtmpdump] ' prefix when the 'verbose' param is set.
+
+            Returns the exit code of the rtmpdump process.
+            """
+            start = time.time()
+            # Baseline taken from the first progress line, so that speed and
+            # ETA only account for what this invocation has transferred.
+            resume_percent = None
+            resume_downloaded_data_len = None
+            proc = subprocess.Popen(args, stderr=subprocess.PIPE)
+            cursor_in_new_line = True
+            proc_stderr_closed = False
+            while not proc_stderr_closed:
+                # read line from stderr
+                line = u''
+                while True:
+                    char = proc.stderr.read(1)
+                    if not char:
+                        proc_stderr_closed = True
+                        break
+                    if char in [b'\r', b'\n']:
+                        break
+                    line += char.decode('ascii', 'replace')
+                if not line:
+                    # Nothing to report: either stderr reached EOF (the outer
+                    # loop then ends) or two terminators were adjacent.
+                    continue
+                mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec \(([0-9]{1,2}\.[0-9])%\)', line)
+                if mobj:
+                    downloaded_data_len = int(float(mobj.group(1))*1024)
+                    percent = float(mobj.group(2))
+                    # Compare against None explicitly: a first reading of 0.0%
+                    # is a valid baseline and must not be overwritten later.
+                    if resume_percent is None:
+                        resume_percent = percent
+                        resume_downloaded_data_len = downloaded_data_len
+                    eta = self.calc_eta(start, time.time(), 100-resume_percent, percent-resume_percent)
+                    speed = self.calc_speed(start, time.time(), downloaded_data_len-resume_downloaded_data_len)
+                    # The total size is extrapolated from the percentage, so it
+                    # stays unknown (None) while the percentage is still 0.
+                    data_len = None
+                    if percent > 0:
+                        data_len = int(downloaded_data_len * 100 / percent)
+                    data_len_str = u'~' + format_bytes(data_len)
+                    self.report_progress(percent, data_len_str, speed, eta)
+                    cursor_in_new_line = False
+                    self._hook_progress({
+                        'downloaded_bytes': downloaded_data_len,
+                        'total_bytes': data_len,
+                        'tmpfilename': tmpfilename,
+                        'filename': filename,
+                        'status': 'downloading',
+                        'eta': eta,
+                        'speed': speed,
+                    })
+                elif self.params.get('verbose', False):
+                    # Finish the in-place progress line before printing a
+                    # regular message.
+                    if not cursor_in_new_line:
+                        self.to_screen(u'')
+                    cursor_in_new_line = True
+                    self.to_screen(u'[rtmpdump] '+line)
+            proc.wait()
+            if not cursor_in_new_line:
+                self.to_screen(u'')
+            return proc.returncode
+
self.report_destination(filename)
tmpfilename = self.temp_name(filename)
test = self.params.get('test', False)
except (OSError, IOError):
self.report_error(u'RTMP download detected but "rtmpdump" could not be run')
return False
- verbosity_option = '--verbose' if self.params.get('verbose', False) else '--quiet'
# Download using rtmpdump. rtmpdump returns exit code 2 when
# the connection was interrumpted and resuming appears to be
# possible. This is part of rtmpdump's normal usage, AFAIK.
- basic_args = ['rtmpdump', verbosity_option, '-r', url, '-o', tmpfilename]
+ basic_args = ['rtmpdump', '--verbose', '-r', url, '-o', tmpfilename]
if player_url is not None:
basic_args += ['--swfVfy', player_url]
if page_url is not None:
if live:
basic_args += ['--live']
args = basic_args + [[], ['--resume', '--skip', '1']][self.params.get('continuedl', False)]
+
+ if sys.platform == 'win32' and sys.version_info < (3, 0):
+ # Windows subprocess module does not actually support Unicode
+ # on Python 2.x
+ # See http://stackoverflow.com/a/9951851/35070
+ subprocess_encoding = sys.getfilesystemencoding()
+ args = [a.encode(subprocess_encoding, 'ignore') for a in args]
+ else:
+ subprocess_encoding = None
+
if self.params.get('verbose', False):
+ if subprocess_encoding:
+ str_args = [
+ a.decode(subprocess_encoding) if isinstance(a, bytes) else a
+ for a in args]
+ else:
+ str_args = args
try:
import pipes
+                # Quote the list that is passed in; the lambda must not silently
+                # ignore its parameter in favour of an outer variable.
                shell_quote = lambda args: ' '.join(map(pipes.quote, args))
except ImportError:
shell_quote = repr
- self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
- retval = subprocess.call(args)
+ self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(str_args))
+
+ retval = run_rtmpdump(args)
+
while (retval == 2 or retval == 1) and not test:
prevsize = os.path.getsize(encodeFilename(tmpfilename))
- self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
+ self.to_screen(u'[rtmpdump] %s bytes' % prevsize)
time.sleep(5.0) # This seems to be needed
- retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
+ retval = run_rtmpdump(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
cursize = os.path.getsize(encodeFilename(tmpfilename))
if prevsize == cursize and retval == 1:
break
# Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
if prevsize == cursize and retval == 2 and cursize > 1024:
- self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
+ self.to_screen(u'[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
retval = 0
break
if retval == 0 or (test and retval == 2):
fsize = os.path.getsize(encodeFilename(tmpfilename))
- self.to_screen(u'\r[rtmpdump] %s bytes' % fsize)
+ self.to_screen(u'[rtmpdump] %s bytes' % fsize)
self.try_rename(tmpfilename, filename)
self._hook_progress({
'downloaded_bytes': fsize,
self.to_screen(u'\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len))
return False
- data_len_str = self.format_bytes(data_len)
+ data_len_str = format_bytes(data_len)
byte_counter = 0 + resume_len
block_size = self.params.get('buffersize', 1024)
start = time.time()
options = ['-c', 'copy']
for (name, value) in metadata.items():
- options.extend(['-metadata', '%s="%s"' % (name, value)])
+ options.extend(['-metadata', '%s=%s' % (name, value)])
options.extend(['-f', ext])
self._downloader.to_screen(u'[ffmpeg] Adding metadata to \'%s\'' % filename)
import errno
import io
+import json
import os
+import platform
import re
import shutil
+import subprocess
import socket
import sys
import time
import traceback
-from .utils import *
+if os.name == 'nt':
+ import ctypes
+
+from .utils import (
+ compat_cookiejar,
+ compat_http_client,
+ compat_print,
+ compat_str,
+ compat_urllib_error,
+ compat_urllib_request,
+ ContentTooShortError,
+ date_from_str,
+ DateRange,
+ determine_ext,
+ DownloadError,
+ encodeFilename,
+ ExtractorError,
+ format_bytes,
+ locked_file,
+ make_HTTPS_handler,
+ MaxDownloadsReached,
+ PostProcessingError,
+ platform_name,
+ preferredencoding,
+ SameFileError,
+ sanitize_filename,
+ subtitles_filename,
+ takewhile_inclusive,
+ UnavailableVideoError,
+ write_json_file,
+ write_string,
+ YoutubeDLHandler,
+)
from .extractor import get_info_extractor, gen_extractors
from .FileDownloader import FileDownloader
+from .version import __version__
class YoutubeDL(object):
forcethumbnail: Force printing thumbnail URL.
forcedescription: Force printing description.
forcefilename: Force printing final filename.
+ forcejson: Force printing info_dict as JSON.
simulate: Do not download the video files.
format: Video format code.
format_limit: Highest quality format to try.
playlistend: Playlist item to end at.
matchtitle: Download only matching titles.
rejecttitle: Reject downloads for matching titles.
+ logger: Log messages to a logging.Logger instance.
logtostderr: Log messages to stderr instead of stdout.
writedescription: Write the video description to a .description file
writeinfojson: Write the video description to a .info.json file
noplaylist: Download single video instead of a playlist if in doubt.
age_limit: An integer representing the user's age in years.
Unsuitable videos for the given age are skipped.
- downloadarchive: File name of a file where all downloads are recorded.
+ download_archive: File name of a file where all downloads are recorded.
Videos already present in the file are not downloaded
again.
+ cookiefile: File name where cookies should be read from and dumped to.
+ nocheckcertificate:Do not verify SSL certificates
+ proxy: URL of the proxy server to use
+ socket_timeout: Time to wait for unresponsive hosts, in seconds
The following parameters are not used by YoutubeDL itself, they are used by
the FileDownloader:
_num_downloads = None
_screen_file = None
- def __init__(self, params):
+ def __init__(self, params=None):
"""Create a FileDownloader object with the given options."""
self._ies = []
self._ies_instances = {}
self._download_retcode = 0
self._num_downloads = 0
self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
+ self.params = {} if params is None else params
if (sys.version_info >= (3,) and sys.platform != 'win32' and
sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
u'Assuming --restrict-filenames since file system encoding '
u'cannot encode all charactes. '
u'Set the LC_ALL environment variable to fix this.')
- params['restrictfilenames'] = True
+ self.params['restrictfilenames'] = True
- self.params = params
self.fd = FileDownloader(self, self.params)
- if '%(stitle)s' in self.params['outtmpl']:
+ if '%(stitle)s' in self.params.get('outtmpl', ''):
self.report_warning(u'%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.')
+ self._setup_opener()
+
def add_info_extractor(self, ie):
"""Add an InfoExtractor object to the end of the list."""
self._ies.append(ie)
def to_screen(self, message, skip_eol=False):
"""Print message to stdout if not in quiet mode."""
- if not self.params.get('quiet', False):
+ if self.params.get('logger'):
+ self.params['logger'].debug(message)
+ elif not self.params.get('quiet', False):
terminator = [u'\n', u''][skip_eol]
output = message + terminator
write_string(output, self._screen_file)
def to_stderr(self, message):
"""Print message to stderr."""
assert type(message) == type(u'')
- output = message + u'\n'
- if 'b' in getattr(self._screen_file, 'mode', '') or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
- output = output.encode(preferredencoding())
- sys.stderr.write(output)
+ if self.params.get('logger'):
+ self.params['logger'].error(message)
+ else:
+ output = message + u'\n'
+ if 'b' in getattr(self._screen_file, 'mode', '') or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
+ output = output.encode(preferredencoding())
+ sys.stderr.write(output)
+
+    def to_console_title(self, message):
+        """Show `message` as the console title when 'consoletitle' is enabled.
+
+        Does nothing unless the 'consoletitle' parameter is truthy. On
+        Windows with an attached console window the title is set through
+        kernel32; otherwise, when TERM is present in the environment, an
+        escape sequence carrying the message is written to the screen file.
+        """
+        if self.params.get('consoletitle', False):
+            if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
+                # c_wchar_p() might not be necessary if `message` is
+                # already of type unicode()
+                kernel32 = ctypes.windll.kernel32
+                kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
+            elif 'TERM' in os.environ:
+                # ESC ] 0 ; <message> BEL
+                title_sequence = u'\033]0;%s\007' % message
+                write_string(title_sequence, self._screen_file)
+
+    def save_console_title(self):
+        """Ask the terminal to remember its current title.
+
+        Only acts when the 'consoletitle' parameter is truthy and TERM is
+        present in the environment.
+        """
+        title_handling_enabled = self.params.get('consoletitle', False)
+        if title_handling_enabled and 'TERM' in os.environ:
+            # Save the title on stack
+            write_string(u'\033[22;0t', self._screen_file)
+
+    def restore_console_title(self):
+        """Ask the terminal to bring back the previously saved title.
+
+        Only acts when the 'consoletitle' parameter is truthy and TERM is
+        present in the environment.
+        """
+        if not self.params.get('consoletitle', False) or 'TERM' not in os.environ:
+            return
+        # Restore the title from stack
+        write_string(u'\033[23;0t', self._screen_file)
- def fixed_template(self):
- """Checks if the output template is fixed."""
- return (re.search(u'(?u)%\\(.+?\\)s', self.params['outtmpl']) is None)
+    def __enter__(self):
+        """Enter the context: save the console title and hand back this object."""
+        # The title saved here is restored again in __exit__
+        self.save_console_title()
+        return self
+
+    def __exit__(self, *args):
+        """Leave the context: restore the console title and save cookies.
+
+        The cookie jar is only saved when a 'cookiefile' parameter was given.
+        Exception details passed in `args` are ignored and nothing truthy is
+        returned, so an exception raised inside the with-block propagates.
+        """
+        self.restore_console_title()
+
+        cookie_path = self.params.get('cookiefile')
+        if cookie_path is None:
+            return
+        self.cookiejar.save()
def trouble(self, message=None, tb=None):
"""Determine action to take when a download problem appears.
"""Report file has already been fully downloaded."""
try:
self.to_screen(u'[download] %s has already been downloaded' % file_name)
- except (UnicodeEncodeError) as err:
+ except UnicodeEncodeError:
self.to_screen(u'[download] The file has already been downloaded')
def increment_downloads(self):
def _match_entry(self, info_dict):
""" Returns None iff the file should be downloaded """
- title = info_dict['title']
- matchtitle = self.params.get('matchtitle', False)
- if matchtitle:
- if not re.search(matchtitle, title, re.IGNORECASE):
- return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
- rejecttitle = self.params.get('rejecttitle', False)
- if rejecttitle:
- if re.search(rejecttitle, title, re.IGNORECASE):
- return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
+ if 'title' in info_dict:
+ # This can happen when we're just evaluating the playlist
+ title = info_dict['title']
+ matchtitle = self.params.get('matchtitle', False)
+ if matchtitle:
+ if not re.search(matchtitle, title, re.IGNORECASE):
+ return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
+ rejecttitle = self.params.get('rejecttitle', False)
+ if rejecttitle:
+ if re.search(rejecttitle, title, re.IGNORECASE):
+ return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
date = info_dict.get('upload_date', None)
if date is not None:
dateRange = self.params.get('daterange', DateRange())
if age_limit < info_dict.get('age_limit', 0):
return u'Skipping "' + title + '" because it is age restricted'
if self.in_download_archive(info_dict):
- return (u'%(title)s has already been recorded in archive'
- % info_dict)
+ return (u'%s has already been recorded in archive'
+ % info_dict.get('title', info_dict.get('id', u'video')))
return None
@staticmethod
result_type = ie_result.get('_type', 'video') # If not given we suppose it's a video, support the default old system
if result_type == 'video':
self.add_extra_info(ie_result, extra_info)
- return self.process_video_result(ie_result)
+ return self.process_video_result(ie_result, download=download)
elif result_type == 'url':
# We have to add extra_info to the results because it may be
# contained in a playlist
ie_key=ie_result.get('ie_key'),
extra_info=extra_info)
elif result_type == 'playlist':
- self.add_extra_info(ie_result, extra_info)
+
# We process each entry in the playlist
playlist = ie_result.get('title', None) or ie_result.get('id', None)
self.to_screen(u'[download] Downloading playlist: %s' % playlist)
'webpage_url': ie_result['webpage_url'],
'extractor_key': ie_result['extractor_key'],
}
+
+ reason = self._match_entry(entry)
+ if reason is not None:
+ self.to_screen(u'[download] ' + reason)
+ continue
+
entry_result = self.process_ie_result(entry,
download=download,
extra_info=extra)
# Forced printings
if self.params.get('forcetitle', False):
- compat_print(info_dict['title'])
+ compat_print(info_dict['fulltitle'])
if self.params.get('forceid', False):
compat_print(info_dict['id'])
if self.params.get('forceurl', False):
compat_print(filename)
if self.params.get('forceformat', False):
compat_print(info_dict['format'])
+ if self.params.get('forcejson', False):
+ compat_print(json.dumps(info_dict))
# Do nothing else if in simulate mode
if self.params.get('simulate', False):
# subtitles download errors are already managed as troubles in relevant IE
# that way it will silently go on when used with unsupporting IE
subtitles = info_dict['subtitles']
- sub_format = self.params.get('subtitlesformat')
+ sub_format = self.params.get('subtitlesformat', 'srt')
for sub_lang in subtitles.keys():
sub = subtitles[sub_lang]
if sub is None:
return
if self.params.get('writeinfojson', False):
- infofn = filename + u'.info.json'
+ infofn = os.path.splitext(filename)[0] + u'.info.json'
self.report_writeinfojson(infofn)
try:
json_info_dict = dict((k, v) for k, v in info_dict.items() if not k in ['urlhandle'])
def download(self, url_list):
"""Download a given list of URLs."""
- if len(url_list) > 1 and self.fixed_template():
+ if (len(url_list) > 1 and
+ '%' not in self.params['outtmpl']
+ and self.params.get('max_downloads') != 1):
raise SameFileError(self.params['outtmpl'])
for url in url_list:
try:
#It also downloads the videos
- videos = self.extract_info(url)
+ self.extract_info(url)
except UnavailableVideoError:
self.report_error(u'unable to download video')
except MaxDownloadsReached:
except (IOError, OSError):
self.report_warning(u'Unable to remove downloaded video file')
+    def _make_archive_id(self, info_dict):
+        """Return the download-archive key for `info_dict`, or None.
+
+        The key is the lower-cased extractor key, a space, and the video id.
+        'extractor_key' is preferred; 'ie_key' (the key used for playlist
+        entries) is the fallback. None is returned whenever the information
+        is incomplete, i.e. when no extractor key or no id is available, so
+        callers never see a KeyError/TypeError for a missing or None id.
+        """
+        video_id = info_dict.get('id')
+        if video_id is None:
+            return None  # Incomplete video information
+        # Future-proof against any change in case
+        # and backwards compatibility with prior versions
+        extractor = info_dict.get('extractor_key')
+        if extractor is None:
+            extractor = info_dict.get('ie_key')  # key in a playlist
+        if extractor is None:
+            return None  # Incomplete video information
+        return extractor.lower() + u' ' + video_id
+
def in_download_archive(self, info_dict):
fn = self.params.get('download_archive')
if fn is None:
return False
- vid_id = info_dict['extractor'] + u' ' + info_dict['id']
+
+ vid_id = self._make_archive_id(info_dict)
+ if vid_id is None:
+ return False # Incomplete video information
+
try:
with locked_file(fn, 'r', encoding='utf-8') as archive_file:
for line in archive_file:
fn = self.params.get('download_archive')
if fn is None:
return
- vid_id = info_dict['extractor'] + u' ' + info_dict['id']
+ vid_id = self._make_archive_id(info_dict)
+ assert vid_id
with locked_file(fn, 'a', encoding='utf-8') as archive_file:
archive_file.write(vid_id + u'\n')
@staticmethod
def format_resolution(format, default='unknown'):
+ if format.get('vcodec') == 'none':
+ return 'audio only'
if format.get('_resolution') is not None:
return format['_resolution']
if format.get('height') is not None:
return res
def list_formats(self, info_dict):
- def line(format):
- return (u'%-20s%-10s%-12s%s' % (
+ def format_note(fdict):
+ res = u''
+ if fdict.get('format_note') is not None:
+ res += fdict['format_note'] + u' '
+ if (fdict.get('vcodec') is not None and
+ fdict.get('vcodec') != 'none'):
+ res += u'%-5s' % fdict['vcodec']
+ elif fdict.get('vbr') is not None:
+ res += u'video'
+ if fdict.get('vbr') is not None:
+ res += u'@%4dk' % fdict['vbr']
+ if fdict.get('acodec') is not None:
+ if res:
+ res += u', '
+ res += u'%-5s' % fdict['acodec']
+ elif fdict.get('abr') is not None:
+ if res:
+ res += u', '
+ res += 'audio'
+ if fdict.get('abr') is not None:
+ res += u'@%3dk' % fdict['abr']
+ if fdict.get('filesize') is not None:
+ if res:
+ res += u', '
+ res += format_bytes(fdict['filesize'])
+ return res
+
+ def line(format, idlen=20):
+ return ((u'%-' + compat_str(idlen + 1) + u's%-10s%-12s%s') % (
format['format_id'],
format['ext'],
self.format_resolution(format),
- format.get('format_note', ''),
- )
- )
+ format_note(format),
+ ))
formats = info_dict.get('formats', [info_dict])
- formats_s = list(map(line, formats))
+ idlen = max(len(u'format code'),
+ max(len(f['format_id']) for f in formats))
+ formats_s = [line(f, idlen) for f in formats]
if len(formats) > 1:
- formats_s[0] += (' ' if formats[0].get('format_note') else '') + '(worst)'
- formats_s[-1] += (' ' if formats[-1].get('format_note') else '') + '(best)'
+ formats_s[0] += (' ' if format_note(formats[0]) else '') + '(worst)'
+ formats_s[-1] += (' ' if format_note(formats[-1]) else '') + '(best)'
header_line = line({
'format_id': u'format code', 'ext': u'extension',
- '_resolution': u'resolution', 'format_note': u'note'})
+ '_resolution': u'resolution', 'format_note': u'note'}, idlen=idlen)
self.to_screen(u'[info] Available formats for %s:\n%s\n%s' %
(info_dict['id'], header_line, u"\n".join(formats_s)))
+
+    def urlopen(self, req):
+        """Open `req` through this object's opener and return the result."""
+        opener = self._opener
+        return opener.open(req)
+
+    def print_debug_header(self):
+        """Write debugging details about the running program.
+
+        Does nothing unless the 'verbose' parameter is truthy. Otherwise it
+        writes the youtube-dl version, the short git HEAD revision (when it
+        can be determined), the Python version and platform name, and the
+        proxies configured on the opener's handlers.
+        """
+        if not self.params.get('verbose'):
+            return
+        write_string(u'[debug] youtube-dl version ' + __version__ + u'\n')
+        try:
+            sp = subprocess.Popen(
+                ['git', 'rev-parse', '--short', 'HEAD'],
+                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                cwd=os.path.dirname(os.path.abspath(__file__)))
+            out = sp.communicate()[0].decode().strip()
+            if re.match('[0-9a-f]+', out):
+                write_string(u'[debug] Git HEAD: ' + out + u'\n')
+        except Exception:
+            # The git lookup is best-effort (git may be missing, or this may
+            # not be a checkout). Only ordinary errors are ignored;
+            # KeyboardInterrupt and SystemExit still propagate.
+            try:
+                sys.exc_clear()
+            except AttributeError:
+                # sys.exc_clear() is not available on this interpreter
+                pass
+        write_string(u'[debug] Python version %s - %s' %
+                     (platform.python_version(), platform_name()) + u'\n')
+
+        proxy_map = {}
+        for handler in self._opener.handlers:
+            if hasattr(handler, 'proxies'):
+                proxy_map.update(handler.proxies)
+        write_string(u'[debug] Proxy map: ' + compat_str(proxy_map) + u'\n')
+
+ def _setup_opener(self):
+ """Create the cookie jar and URL opener from self.params.
+
+ Reads the 'socket_timeout', 'cookiefile', 'proxy' and
+ 'nocheckcertificate' parameters. Stores the cookie jar in
+ self.cookiejar and the opener in self._opener. As a side effect the
+ opener is also installed globally and the default socket timeout is
+ changed.
+ """
+ timeout_val = self.params.get('socket_timeout')
+ # Fall back to 600 seconds when no 'socket_timeout' is configured
+ timeout = 600 if timeout_val is None else float(timeout_val)
+
+ opts_cookiefile = self.params.get('cookiefile')
+ opts_proxy = self.params.get('proxy')
+
+ # Without a cookie file use a plain CookieJar; otherwise use a
+ # MozillaCookieJar bound to that file, loaded only if it is readable
+ if opts_cookiefile is None:
+ self.cookiejar = compat_cookiejar.CookieJar()
+ else:
+ self.cookiejar = compat_cookiejar.MozillaCookieJar(
+ opts_cookiefile)
+ if os.access(opts_cookiefile, os.R_OK):
+ self.cookiejar.load()
+
+ cookie_processor = compat_urllib_request.HTTPCookieProcessor(
+ self.cookiejar)
+ # An explicit 'proxy' value wins; the empty string yields an empty
+ # proxy map. With no 'proxy' value the map comes from getproxies()
+ if opts_proxy is not None:
+ if opts_proxy == '':
+ proxies = {}
+ else:
+ proxies = {'http': opts_proxy, 'https': opts_proxy}
+ else:
+ proxies = compat_urllib_request.getproxies()
+ # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
+ if 'http' in proxies and 'https' not in proxies:
+ proxies['https'] = proxies['http']
+ proxy_handler = compat_urllib_request.ProxyHandler(proxies)
+ https_handler = make_HTTPS_handler(
+ self.params.get('nocheckcertificate', False))
+ opener = compat_urllib_request.build_opener(
+ https_handler, proxy_handler, cookie_processor, YoutubeDLHandler())
+ # Delete the default user-agent header, which would otherwise apply in
+ # cases where our custom HTTP handler doesn't come into play
+ # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
+ opener.addheaders = []
+ self._opener = opener
+
+ # TODO remove this global modification
+ compat_urllib_request.install_opener(opener)
+ socket.setdefaulttimeout(timeout)
'Ismael Mejía',
'Steffan \'Ruirize\' James',
'Andras Elso',
+ 'Jelle van der Waa',
+ 'Marcin Cieślak',
+ 'Anton Larionov',
+ 'Takuya Tsuchida',
+ 'Sergey M.',
)
__license__ = 'Public Domain'
import codecs
-import collections
import getpass
import optparse
import os
import random
import re
import shlex
-import socket
import subprocess
import sys
-import traceback
-import platform
from .utils import (
- compat_cookiejar,
compat_print,
- compat_str,
- compat_urllib_request,
DateRange,
decodeOption,
determine_ext,
DownloadError,
get_cachedir,
- make_HTTPS_handler,
MaxDownloadsReached,
- platform_name,
preferredencoding,
SameFileError,
std_headers,
write_string,
- YoutubeDLHandler,
)
from .update import update_self
-from .version import __version__
from .FileDownloader import (
FileDownloader,
)
from .extractor import gen_extractors
+from .version import __version__
from .YoutubeDL import YoutubeDL
from .PostProcessor import (
FFmpegMetadataPP,
def parseOpts(overrideArguments=None):
- def _readOptions(filename_bytes):
+ def _readOptions(filename_bytes, default=[]):
try:
optionf = open(filename_bytes)
except IOError:
- return [] # silently skip if file is not present
+ return default # silently skip if file is not present
try:
res = []
for l in optionf:
general.add_option('--extractor-descriptions',
action='store_true', dest='list_extractor_descriptions',
help='Output descriptions of all supported extractors', default=False)
- general.add_option('--proxy', dest='proxy', default=None, help='Use the specified HTTP/HTTPS proxy', metavar='URL')
+ general.add_option(
+ '--proxy', dest='proxy', default=None, metavar='URL',
+ help='Use the specified HTTP/HTTPS proxy. Pass in an empty string (--proxy "") for direct connection')
general.add_option('--no-check-certificate', action='store_true', dest='no_check_certificate', default=False, help='Suppress HTTPS certificate validation.')
general.add_option(
'--cache-dir', dest='cachedir', default=get_cachedir(), metavar='DIR',
general.add_option(
'--no-cache-dir', action='store_const', const=None, dest='cachedir',
help='Disable filesystem caching')
+ general.add_option(
+ '--socket-timeout', dest='socket_timeout',
+ type=float, default=None, help=optparse.SUPPRESS_HELP)
selection.add_option('--playlist-start',
dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
- selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
+ selection.add_option('--max-downloads', metavar='NUMBER',
+ dest='max_downloads', type=int, default=None,
+ help='Abort after downloading NUMBER files')
selection.add_option('--min-filesize', metavar='SIZE', dest='min_filesize', help="Do not download any videos smaller than SIZE (e.g. 50k or 44.6m)", default=None)
selection.add_option('--max-filesize', metavar='SIZE', dest='max_filesize', help="Do not download any videos larger than SIZE (e.g. 50k or 44.6m)", default=None)
selection.add_option('--date', metavar='DATE', dest='date', help='download only videos uploaded in this date', default=None)
default=None, type=int)
selection.add_option('--download-archive', metavar='FILE',
dest='download_archive',
- help='Download only videos not present in the archive file. Record all downloaded videos in it.')
+ help='Download only videos not listed in the archive file. Record the IDs of all downloaded videos in it.')
authentication.add_option('-u', '--username',
video_format.add_option('-f', '--format',
action='store', dest='format', metavar='FORMAT', default='best',
- help='video format code, specifiy the order of preference using slashes: "-f 22/17/18". "-f mp4" and "-f flv" are also supported')
+ help='video format code, specify the order of preference using slashes: "-f 22/17/18". "-f mp4" and "-f flv" are also supported')
video_format.add_option('--all-formats',
action='store_const', dest='format', help='download all available video formats', const='all')
video_format.add_option('--prefer-free-formats',
verbosity.add_option('--get-format',
action='store_true', dest='getformat',
help='simulate, quiet but print output format', default=False)
+ verbosity.add_option('-j', '--dump-json',
+ action='store_true', dest='dumpjson',
+ help='simulate, quiet but print JSON information', default=False)
verbosity.add_option('--newline',
action='store_true', dest='progress_with_newline', help='output progress bar as new lines', default=False)
verbosity.add_option('--no-progress',
help='print downloaded pages to debug problems(very verbose)')
verbosity.add_option('--write-pages',
action='store_true', dest='write_pages', default=False,
- help='Write downloaded pages to files in the current directory')
+ help='Write downloaded intermediary pages to files in the current directory to debug problems')
verbosity.add_option('--youtube-print-sig-code',
action='store_true', dest='youtube_print_sig_code', default=False,
help=optparse.SUPPRESS_HELP)
if opts.verbose:
write_string(u'[debug] Override config: ' + repr(overrideArguments) + '\n')
else:
+ systemConf = _readOptions('/etc/youtube-dl.conf')
+
xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
if xdg_config_home:
userConfFile = os.path.join(xdg_config_home, 'youtube-dl', 'config')
userConfFile = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl', 'config')
if not os.path.isfile(userConfFile):
userConfFile = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
- systemConf = _readOptions('/etc/youtube-dl.conf')
- userConf = _readOptions(userConfFile)
+ userConf = _readOptions(userConfFile, None)
+
+ if userConf is None:
+ appdata_dir = os.environ.get('appdata')
+ if appdata_dir:
+ userConf = _readOptions(
+ os.path.join(appdata_dir, 'youtube-dl', 'config'),
+ default=None)
+ if userConf is None:
+ userConf = _readOptions(
+ os.path.join(appdata_dir, 'youtube-dl', 'config.txt'),
+ default=None)
+
+ if userConf is None:
+ userConf = _readOptions(
+ os.path.join(os.path.expanduser('~'), 'youtube-dl.conf'),
+ default=None)
+ if userConf is None:
+ userConf = _readOptions(
+ os.path.join(os.path.expanduser('~'), 'youtube-dl.conf.txt'),
+ default=None)
+
+ if userConf is None:
+ userConf = []
+
commandLineConf = sys.argv[1:]
argv = systemConf + userConf + commandLineConf
opts, args = parser.parse_args(argv)
parser, opts, args = parseOpts(argv)
- # Open appropriate CookieJar
- if opts.cookiefile is None:
- jar = compat_cookiejar.CookieJar()
- else:
- try:
- jar = compat_cookiejar.MozillaCookieJar(opts.cookiefile)
- if os.access(opts.cookiefile, os.R_OK):
- jar.load()
- except (IOError, OSError) as err:
- if opts.verbose:
- traceback.print_exc()
- write_string(u'ERROR: unable to open cookie file\n')
- sys.exit(101)
# Set user agent
if opts.user_agent is not None:
std_headers['User-Agent'] = opts.user_agent
all_urls = batchurls + args
all_urls = [url.strip() for url in all_urls]
- opener = _setup_opener(jar=jar, opts=opts)
-
extractors = gen_extractors()
if opts.list_extractors:
if opts.retries is not None:
try:
opts.retries = int(opts.retries)
- except (TypeError, ValueError) as err:
+ except (TypeError, ValueError):
parser.error(u'invalid retry count specified')
if opts.buffersize is not None:
numeric_buffersize = FileDownloader.parse_bytes(opts.buffersize)
opts.playliststart = int(opts.playliststart)
if opts.playliststart <= 0:
raise ValueError(u'Playlist start must be positive')
- except (TypeError, ValueError) as err:
+ except (TypeError, ValueError):
parser.error(u'invalid playlist start number specified')
try:
opts.playlistend = int(opts.playlistend)
if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
raise ValueError(u'Playlist end must be greater than playlist start')
- except (TypeError, ValueError) as err:
+ except (TypeError, ValueError):
parser.error(u'invalid playlist end number specified')
if opts.extractaudio:
if opts.audioformat not in ['best', 'aac', 'mp3', 'm4a', 'opus', 'vorbis', 'wav']:
u' file! Use "%%(ext)s" instead of %r' %
determine_ext(outtmpl, u''))
- # YoutubeDL
- ydl = YoutubeDL({
+ ydl_opts = {
'usenetrc': opts.usenetrc,
'username': opts.username,
'password': opts.password,
'videopassword': opts.videopassword,
- 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
+ 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.dumpjson),
'forceurl': opts.geturl,
'forcetitle': opts.gettitle,
'forceid': opts.getid,
'forcedescription': opts.getdescription,
'forcefilename': opts.getfilename,
'forceformat': opts.getformat,
+ 'forcejson': opts.dumpjson,
'simulate': opts.simulate,
- 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
+ 'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.dumpjson),
'format': opts.format,
'format_limit': opts.format_limit,
'listformats': opts.listformats,
'youtube_print_sig_code': opts.youtube_print_sig_code,
'age_limit': opts.age_limit,
'download_archive': opts.download_archive,
- })
-
- if opts.verbose:
- write_string(u'[debug] youtube-dl version ' + __version__ + u'\n')
- try:
- sp = subprocess.Popen(
- ['git', 'rev-parse', '--short', 'HEAD'],
- stdout=subprocess.PIPE, stderr=subprocess.PIPE,
- cwd=os.path.dirname(os.path.abspath(__file__)))
- out, err = sp.communicate()
- out = out.decode().strip()
- if re.match('[0-9a-f]+', out):
- write_string(u'[debug] Git HEAD: ' + out + u'\n')
- except:
- try:
- sys.exc_clear()
- except:
- pass
- write_string(u'[debug] Python version %s - %s' %(platform.python_version(), platform_name()) + u'\n')
-
- proxy_map = {}
- for handler in opener.handlers:
- if hasattr(handler, 'proxies'):
- proxy_map.update(handler.proxies)
- write_string(u'[debug] Proxy map: ' + compat_str(proxy_map) + u'\n')
-
- ydl.add_default_info_extractors()
-
- # PostProcessors
- # Add the metadata pp first, the other pps will copy it
- if opts.addmetadata:
- ydl.add_post_processor(FFmpegMetadataPP())
- if opts.extractaudio:
- ydl.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, nopostoverwrites=opts.nopostoverwrites))
- if opts.recodevideo:
- ydl.add_post_processor(FFmpegVideoConvertor(preferedformat=opts.recodevideo))
- if opts.embedsubtitles:
- ydl.add_post_processor(FFmpegEmbedSubtitlePP(subtitlesformat=opts.subtitlesformat))
-
- # Update version
- if opts.update_self:
- update_self(ydl.to_screen, opts.verbose)
-
- # Maybe do nothing
- if len(all_urls) < 1:
- if not opts.update_self:
- parser.error(u'you must provide at least one URL')
- else:
- sys.exit()
+ 'cookiefile': opts.cookiefile,
+ 'nocheckcertificate': opts.no_check_certificate,
+ 'proxy': opts.proxy,
+ 'socket_timeout': opts.socket_timeout,
+ }
- try:
- retcode = ydl.download(all_urls)
- except MaxDownloadsReached:
- ydl.to_screen(u'--max-download limit reached, aborting.')
- retcode = 101
+ with YoutubeDL(ydl_opts) as ydl:
+ ydl.print_debug_header()
+ ydl.add_default_info_extractors()
+
+ # PostProcessors
+ # Add the metadata pp first, the other pps will copy it
+ if opts.addmetadata:
+ ydl.add_post_processor(FFmpegMetadataPP())
+ if opts.extractaudio:
+ ydl.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, nopostoverwrites=opts.nopostoverwrites))
+ if opts.recodevideo:
+ ydl.add_post_processor(FFmpegVideoConvertor(preferedformat=opts.recodevideo))
+ if opts.embedsubtitles:
+ ydl.add_post_processor(FFmpegEmbedSubtitlePP(subtitlesformat=opts.subtitlesformat))
+
+ # Update version
+ if opts.update_self:
+ update_self(ydl.to_screen, opts.verbose)
+
+ # Maybe do nothing
+ if len(all_urls) < 1:
+ if not opts.update_self:
+ parser.error(u'you must provide at least one URL')
+ else:
+ sys.exit()
- # Dump cookie jar if requested
- if opts.cookiefile is not None:
try:
- jar.save()
- except (IOError, OSError):
- sys.exit(u'ERROR: unable to save cookie jar')
+ retcode = ydl.download(all_urls)
+ except MaxDownloadsReached:
+ ydl.to_screen(u'--max-download limit reached, aborting.')
+ retcode = 101
sys.exit(retcode)
-def _setup_opener(jar=None, opts=None, timeout=300):
- if opts is None:
- FakeOptions = collections.namedtuple(
- 'FakeOptions', ['proxy', 'no_check_certificate'])
- opts = FakeOptions(proxy=None, no_check_certificate=False)
-
- cookie_processor = compat_urllib_request.HTTPCookieProcessor(jar)
- if opts.proxy is not None:
- if opts.proxy == '':
- proxies = {}
- else:
- proxies = {'http': opts.proxy, 'https': opts.proxy}
- else:
- proxies = compat_urllib_request.getproxies()
- # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
- if 'http' in proxies and 'https' not in proxies:
- proxies['https'] = proxies['http']
- proxy_handler = compat_urllib_request.ProxyHandler(proxies)
- https_handler = make_HTTPS_handler(opts)
- opener = compat_urllib_request.build_opener(
- https_handler, proxy_handler, cookie_processor, YoutubeDLHandler())
- # Delete the default user-agent header, which would otherwise apply in
- # cases where our custom HTTP handler doesn't come into play
- # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
- opener.addheaders = []
- compat_urllib_request.install_opener(opener)
- socket.setdefaulttimeout(timeout)
- return opener
-
-
def main(argv=None):
try:
_real_main(argv)
from .appletrailers import AppleTrailersIE
from .addanime import AddAnimeIE
+from .anitube import AnitubeIE
from .archiveorg import ArchiveOrgIE
from .ard import ARDIE
from .arte import (
)
from .auengine import AUEngineIE
from .bambuser import BambuserIE, BambuserChannelIE
-from .bandcamp import BandcampIE
+from .bandcamp import BandcampIE, BandcampAlbumIE
from .bliptv import BlipTVIE, BlipTVUserIE
from .bloomberg import BloombergIE
from .breakcom import BreakIE
from .canalplus import CanalplusIE
from .canalc2 import Canalc2IE
from .cinemassacre import CinemassacreIE
+from .clipfish import ClipfishIE
+from .clipsyndicate import ClipsyndicateIE
from .cnn import CNNIE
from .collegehumor import CollegeHumorIE
-from .comedycentral import ComedyCentralIE
+from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE
from .condenast import CondeNastIE
from .criterion import CriterionIE
from .cspan import CSpanIE
+from .d8 import D8IE
from .dailymotion import (
DailymotionIE,
DailymotionPlaylistIE,
)
from .freesound import FreesoundIE
from .funnyordie import FunnyOrDieIE
+from .gamekings import GamekingsIE
from .gamespot import GameSpotIE
from .gametrailers import GametrailersIE
from .generic import GenericIE
from .howcast import HowcastIE
from .hypem import HypemIE
from .ign import IGNIE, OneUPIE
+from .imdb import ImdbIE
from .ina import InaIE
from .infoq import InfoQIE
from .instagram import InstagramIE
from .kickstarter import KickStarterIE
from .keek import KeekIE
from .liveleak import LiveLeakIE
-from .livestream import LivestreamIE
+from .livestream import LivestreamIE, LivestreamOriginalIE
from .metacafe import MetacafeIE
from .metacritic import MetacriticIE
from .mit import TechTVMITIE, MITIE
from .nbc import NBCNewsIE
from .newgrounds import NewgroundsIE
from .nhl import NHLIE, NHLVideocenterIE
+from .niconico import NiconicoIE
from .nowvideo import NowVideoIE
from .ooyala import OoyalaIE
from .orf import ORFIE
from .pbs import PBSIE
from .photobucket import PhotobucketIE
+from .podomatic import PodomaticIE
from .pornhub import PornHubIE
from .pornotube import PornotubeIE
from .rbmaradio import RBMARadioIE
from .sina import SinaIE
from .slashdot import SlashdotIE
from .slideshare import SlideshareIE
+from .smotri import (
+ SmotriIE,
+ SmotriCommunityIE,
+ SmotriUserIE,
+)
from .sohu import SohuIE
from .soundcloud import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE
-from .southparkstudios import SouthParkStudiosIE
+from .southparkstudios import (
+ SouthParkStudiosIE,
+ SouthparkDeIE,
+)
from .space import SpaceIE
from .spankwire import SpankwireIE
from .spiegel import SpiegelIE
from .stanfordoc import StanfordOpenClassroomIE
from .statigram import StatigramIE
from .steam import SteamIE
+from .streamcloud import StreamcloudIE
from .sztvhu import SztvHuIE
from .teamcoco import TeamcocoIE
from .techtalks import TechTalksIE
from .ted import TEDIE
from .tf1 import TF1IE
from .thisav import ThisAVIE
+from .toutv import TouTvIE
from .traileraddict import TrailerAddictIE
from .trilulilu import TriluliluIE
from .tube8 import Tube8IE
from .tudou import TudouIE
from .tumblr import TumblrIE
from .tutv import TutvIE
+from .tvp import TvpIE
from .unistra import UnistraIE
from .ustream import UstreamIE, UstreamChannelIE
from .vbox7 import Vbox7IE
from .videodetective import VideoDetectiveIE
from .videofyme import VideofyMeIE
from .videopremium import VideoPremiumIE
-from .vimeo import VimeoIE, VimeoChannelIE
+from .vimeo import (
+ VimeoIE,
+ VimeoChannelIE,
+ VimeoUserIE,
+)
from .vine import VineIE
+from .viki import VikiIE
from .vk import VKIE
from .wat import WatIE
from .websurg import WeBSurgIE
from .xnxx import XNXXIE
from .xvideos import XVideosIE
from .xtube import XTubeIE
-from .yahoo import YahooIE, YahooSearchIE
+from .yahoo import (
+ YahooIE,
+ YahooNewsIE,
+ YahooSearchIE,
+)
from .youjizz import YouJizzIE
from .youku import YoukuIE
from .youporn import YouPornIE
YoutubeTruncatedURLIE,
YoutubeWatchLaterIE,
YoutubeFavouritesIE,
+ YoutubeHistoryIE,
)
from .zdf import ZDFIE
--- /dev/null
+import re
+
+from .common import InfoExtractor
+
+
+class AnitubeIE(InfoExtractor):
+    """Extractor for single video pages on anitube.se."""
+    IE_NAME = u'anitube.se'
+    _VALID_URL = r'https?://(?:www\.)?anitube\.se/video/(?P<id>\d+)'
+
+    _TEST = {
+        u'url': u'http://www.anitube.se/video/36621',
+        u'md5': u'59d0eeae28ea0bc8c05e7af429998d43',
+        u'file': u'36621.mp4',
+        u'info_dict': {
+            u'id': u'36621',
+            u'ext': u'mp4',
+            u'title': u'Recorder to Randoseru 01',
+        },
+        u'skip': u'Blocked in the US',
+    }
+
+    def _real_extract(self, url):
+        """Return the info dict (id, title, formats) for one anitube.se video.
+
+        The watch page is scanned for the embed key, which is then used to
+        request the player configuration XML holding the title and the
+        stream URLs.
+        """
+        video_id = re.match(self._VALID_URL, url).group('id')
+
+        webpage = self._download_webpage(url, video_id)
+        key = self._html_search_regex(
+            r'http://www\.anitube\.se/embed/([A-Za-z0-9_-]*)', webpage, u'key')
+
+        config_url = 'http://www.anitube.se/nuevo/econfig.php?key=%s' % key
+        config_xml = self._download_xml(config_url, key)
+
+        video_title = config_xml.find('title').text
+
+        # <file> is exposed as the 'sd' format and <filehd> as 'hd';
+        # either element may be absent from the config
+        formats = []
+        for format_id, tag in (('sd', 'file'), ('hd', 'filehd')):
+            node = config_xml.find(tag)
+            if node is None:
+                continue
+            formats.append({
+                'format_id': format_id,
+                'url': node.text,
+            })
+
+        return {
+            'id': video_id,
+            'title': video_title,
+            'formats': formats
+        }
})
formats = sorted(formats, key=lambda f: (f['height'], f['width']))
- info = {
+ playlist.append({
'_type': 'video',
'id': video_id,
'title': title,
'upload_date': upload_date,
'uploader_id': uploader_id,
'user_agent': 'QuickTime compatible (youtube-dl)',
- }
- # TODO: Remove when #980 has been merged
- info['url'] = formats[-1]['url']
- info['ext'] = formats[-1]['ext']
-
- playlist.append(info)
+ })
return {
'_type': 'playlist',
for f in formats:
f['ext'] = determine_ext(f['url'])
- info = {
+ return {
'_type': 'video',
'id': video_id,
'title': title,
'description': description,
'uploader': uploader,
'upload_date': upload_date,
+ 'thumbnail': data.get('misc', {}).get('image'),
}
- thumbnail = data.get('misc', {}).get('image')
- if thumbnail:
- info['thumbnail'] = thumbnail
-
- # TODO: Remove when #980 has been merged
- info.update(formats[-1])
-
- return info
# encoding: utf-8
import re
import json
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
lang = mobj.group('lang')
return self._extract_liveweb(url, name, lang)
- if re.search(self._LIVE_URL, video_id) is not None:
+ if re.search(self._LIVE_URL, url) is not None:
raise ExtractorError(u'Arte live streams are not yet supported, sorry')
# self.extractLiveStream(url)
# return
"""Extract from videos.arte.tv"""
ref_xml_url = url.replace('/videos/', '/do_delegate/videos/')
ref_xml_url = ref_xml_url.replace('.html', ',view,asPlayerXml.xml')
- ref_xml = self._download_webpage(ref_xml_url, video_id, note=u'Downloading metadata')
- ref_xml_doc = xml.etree.ElementTree.fromstring(ref_xml)
+ ref_xml_doc = self._download_xml(ref_xml_url, video_id, note=u'Downloading metadata')
config_node = find_xpath_attr(ref_xml_doc, './/video', 'lang', lang)
config_xml_url = config_node.attrib['ref']
config_xml = self._download_webpage(config_xml_url, video_id, note=u'Downloading configuration')
"""Extract form http://liveweb.arte.tv/"""
webpage = self._download_webpage(url, name)
video_id = self._search_regex(r'eventId=(\d+?)("|&)', webpage, u'event id')
- config_xml = self._download_webpage('http://download.liveweb.arte.tv/o21/liveweb/events/event-%s.xml' % video_id,
+ config_doc = self._download_xml('http://download.liveweb.arte.tv/o21/liveweb/events/event-%s.xml' % video_id,
video_id, u'Downloading information')
- config_doc = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8'))
event_doc = config_doc.find('event')
url_node = event_doc.find('video').find('urlHd')
if url_node is None:
- url_node = video_doc.find('urlSd')
+ url_node = event_doc.find('urlSd')
return {'id': video_id,
'title': event_doc.find('name%s' % lang.capitalize()).text,
-import os.path
import re
from .common import InfoExtractor
from ..utils import (
compat_urllib_parse,
- compat_urllib_parse_urlparse,
+ determine_ext,
+ ExtractorError,
)
class AUEngineIE(InfoExtractor):
title = self._html_search_regex(r'<title>(?P<title>.+?)</title>',
webpage, u'title')
title = title.strip()
- links = re.findall(r'[^A-Za-z0-9]?(?:file|url):\s*["\'](http[^\'"&]*)', webpage)
- links = [compat_urllib_parse.unquote(l) for l in links]
+ links = re.findall(r'\s(?:file|url):\s*["\']([^\'"]+)["\']', webpage)
+ links = map(compat_urllib_parse.unquote, links)
+
+ thumbnail = None
+ video_url = None
for link in links:
- root, pathext = os.path.splitext(compat_urllib_parse_urlparse(link).path)
- if pathext == '.png':
+ if link.endswith('.png'):
thumbnail = link
- elif pathext == '.mp4':
- url = link
- ext = pathext
+ elif '/videos/' in link:
+ video_url = link
+ if not video_url:
+ raise ExtractorError(u'Could not find video URL')
+ ext = u'.' + determine_ext(video_url)
if ext == title[-len(ext):]:
title = title[:-len(ext)]
- ext = ext[1:]
- return [{
+
+ return {
'id': video_id,
- 'url': url,
- 'ext': ext,
+ 'url': video_url,
'title': title,
'thumbnail': thumbnail,
- }]
+ }
_TEST = {
u'url': u'http://bambuser.com/v/4050584',
- u'md5': u'fba8f7693e48fd4e8641b3fd5539a641',
+ # MD5 seems to be flaky, see https://travis-ci.org/rg3/youtube-dl/jobs/14051016#L388
+ #u'md5': u'fba8f7693e48fd4e8641b3fd5539a641',
u'info_dict': {
u'id': u'4050584',
u'ext': u'flv',
u'uploader': u'pixelversity',
u'uploader_id': u'344706',
},
+ u'params': {
+ # It doesn't respect the 'Range' header, it would download the whole video
+ # caused the travis builds to fail: https://travis-ci.org/rg3/youtube-dl/jobs/14493845#L59
+ u'skip_download': True,
+ },
}
def _real_extract(self, url):
from .common import InfoExtractor
from ..utils import (
+ compat_str,
+ compat_urlparse,
ExtractorError,
)
class BandcampIE(InfoExtractor):
+ IE_NAME = u'Bandcamp'
_VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
- _TEST = {
+ _TESTS = [{
u'url': u'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
u'file': u'1812978515.mp3',
u'md5': u'cdeb30cdae1921719a3cbcab696ef53c',
u"title": u"youtube-dl test song \"'/\\\u00e4\u21ad"
},
u'skip': u'There is a limit of 200 free downloads / month for the test song'
- }
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
# We get the link to the free download page
m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
if m_download is None:
+ m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage)
+ if m_trackinfo:
+ json_code = m_trackinfo.group(1)
+ data = json.loads(json_code)
+
+ for d in data:
+ # Use the actual key of d['file'] as the format id; a constant string
+ # would give every format the same id. The extension is whatever
+ # precedes the first '-' in that key (keys presumably look like
+ # "mp3-128" -- TODO confirm against the page data).
+ formats = [{
+     'format_id': format_id,
+     'url': format_url,
+     'ext': format_id.partition('-')[0]
+ } for format_id, format_url in sorted(d['file'].items())]
+ return {
+ 'id': compat_str(d['id']),
+ 'title': d['title'],
+ 'formats': formats,
+ }
+ else:
raise ExtractorError(u'No free songs found')
download_link = m_download.group(1)
}
return [track_info]
+
+
+class BandcampAlbumIE(InfoExtractor):
+    """Playlist extractor for Bandcamp album pages.
+
+    Every track link found on the album page becomes a url result that is
+    handed over to BandcampIE.
+    """
+    IE_NAME = u'Bandcamp:album'
+    _VALID_URL = r'http://.*?\.bandcamp\.com/album/(?P<title>.*)'
+
+    _TEST = {
+        u'url': u'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
+        u'playlist': [
+            {
+                u'file': u'1353101989.mp3',
+                u'md5': u'39bc1eded3476e927c724321ddf116cf',
+                u'info_dict': {
+                    u'title': u'Intro',
+                }
+            },
+            {
+                u'file': u'38097443.mp3',
+                u'md5': u'1a2c32e2691474643e912cc6cd4bffaa',
+                u'info_dict': {
+                    u'title': u'Kero One - Keep It Alive (Blazo remix)',
+                }
+            },
+        ],
+        u'params': {
+            u'playlistend': 2
+        },
+        u'skip': u'Bancamp imposes download limits. See test_playlists:test_bandcamp_album for the playlist test'
+    }
+
+    def _real_extract(self, url):
+        """Return a playlist dict with one url entry per track of the album.
+
+        Raises ExtractorError when the page contains no track links.
+        """
+        album_slug = re.match(self._VALID_URL, url).group('title')
+        webpage = self._download_webpage(url, album_slug)
+
+        tracks_paths = re.findall(r'<a href="(.*?)" itemprop="url">', webpage)
+        if not tracks_paths:
+            raise ExtractorError(u'The page doesn\'t contain any track')
+
+        # Track hrefs are joined with the album URL before delegation
+        entries = []
+        for t_path in tracks_paths:
+            track_url = compat_urlparse.urljoin(url, t_path)
+            entries.append(self.url_result(track_url, ie=BandcampIE.ie_key()))
+
+        album_title = self._search_regex(r'album_title : "(.*?)"', webpage, u'title')
+        return {
+            '_type': 'playlist',
+            'title': album_title,
+            'entries': entries,
+        }
params = {'flashID': object_doc.attrib['id'],
'playerID': find_xpath_attr(object_doc, './param', 'name', 'playerID').attrib['value'],
}
- playerKey = find_xpath_attr(object_doc, './param', 'name', 'playerKey')
+ def find_param(name):
+     """Return the 'value' attribute of the <param> whose name matches, or None."""
+     param = find_xpath_attr(object_doc, './param', 'name', name)
+     return None if param is None else param.attrib['value']
+ playerKey = find_param('playerKey')
# Not all pages define this value
if playerKey is not None:
- params['playerKey'] = playerKey.attrib['value']
- videoPlayer = find_xpath_attr(object_doc, './param', 'name', '@videoPlayer')
+ params['playerKey'] = playerKey
+ # The three fields hold the id of the video
+ videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID')
if videoPlayer is not None:
- params['@videoPlayer'] = videoPlayer.attrib['value']
- linkBase = find_xpath_attr(object_doc, './param', 'name', 'linkBaseURL')
+ params['@videoPlayer'] = videoPlayer
+ linkBase = find_param('linkBaseURL')
if linkBase is not None:
- params['linkBaseURL'] = linkBase.attrib['value']
+ params['linkBaseURL'] = linkBase
data = compat_urllib_parse.urlencode(params)
return cls._FEDERATED_URL_TEMPLATE % data
# encoding: utf-8
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import unified_strdate
+
class CanalplusIE(InfoExtractor):
_VALID_URL = r'https?://(www\.canalplus\.fr/.*?/(?P<path>.*)|player\.canalplus\.fr/#/(?P<id>\d+))'
_VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s'
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = mobj.groupdict().get('id')
if video_id is None:
webpage = self._download_webpage(url, mobj.group('path'))
video_id = self._search_regex(r'videoId = "(\d+)";', webpage, u'video id')
info_url = self._VIDEO_INFO_TEMPLATE % video_id
- info_page = self._download_webpage(info_url,video_id,
+ doc = self._download_xml(info_url,video_id,
u'Downloading video info')
self.report_extraction(video_id)
- doc = xml.etree.ElementTree.fromstring(info_page.encode('utf-8'))
video_info = [video for video in doc if video.find('ID').text == video_id][0]
infos = video_info.find('INFOS')
media = video_info.find('MEDIA')
--- /dev/null
+import re
+import time
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class ClipfishIE(InfoExtractor):
+    """Extractor for single video pages on clipfish.de."""
+    IE_NAME = u'clipfish'
+
+    _VALID_URL = r'^https?://(?:www\.)?clipfish\.de/.*?/video/(?P<id>[0-9]+)/'
+    _TEST = {
+        u'url': u'http://www.clipfish.de/special/game-trailer/video/3966754/fifa-14-e3-2013-trailer/',
+        u'file': u'3966754.mp4',
+        u'md5': u'2521cd644e862936cf2e698206e47385',
+        u'info_dict': {
+            u'title': u'FIFA 14 - E3 2013 Trailer',
+            u'duration': 82,
+        },
+        u'skip': 'Blocked in the US'
+    }
+
+    def _real_extract(self, url):
+        """Return the info dict for one clipfish video.
+
+        The metadata comes from the devxml/videoinfo XML document. Raises
+        ExtractorError when that document carries no video URL; thumbnail
+        and duration are optional and fall back to None.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        # The current timestamp is sent as the "ts" query parameter
+        info_url = ('http://www.clipfish.de/devxml/videoinfo/%s?ts=%d' %
+                    (video_id, int(time.time())))
+        doc = self._download_xml(
+            info_url, video_id, note=u'Downloading info page')
+        title = doc.find('title').text
+
+        # findtext() gives None for a missing element and '' for an empty
+        # one, so both cases end in the ExtractorError below instead of an
+        # AttributeError on a missing <filename>
+        video_url = doc.findtext('filename')
+        if not video_url:
+            xml_bytes = xml.etree.ElementTree.tostring(doc)
+            raise ExtractorError(u'Cannot find video URL in document %r' %
+                                 xml_bytes)
+        thumbnail = doc.findtext('imageurl') or None
+
+        # A missing or empty <duration> simply fails the match below
+        duration_str = doc.findtext('duration') or ''
+        m = re.match(
+            r'^(?P<hours>[0-9]+):(?P<minutes>[0-9]{2}):(?P<seconds>[0-9]{2}):(?P<ms>[0-9]*)$',
+            duration_str)
+        if m:
+            # The trailing "ms" field is ignored; the duration is whole seconds
+            duration = (
+                (int(m.group('hours')) * 60 * 60) +
+                (int(m.group('minutes')) * 60) +
+                (int(m.group('seconds')))
+            )
+        else:
+            duration = None
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': video_url,
+            'thumbnail': thumbnail,
+            'duration': duration,
+        }
--- /dev/null
+import re
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import (
+ find_xpath_attr,
+)
+
+
+class ClipsyndicateIE(InfoExtractor):
+    """Extractor for clipsyndicate.com video and playlist-play pages."""
+    _VALID_URL = r'http://www\.clipsyndicate\.com/video/play(list/\d+)?/(?P<id>\d+)'
+
+    _TEST = {
+        u'url': u'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe',
+        u'md5': u'4d7d549451bad625e0ff3d7bd56d776c',
+        u'info_dict': {
+            u'id': u'4629301',
+            u'ext': u'mp4',
+            u'title': u'Brick Briscoe',
+            u'duration': 612,
+        },
+    }
+
+    def _real_extract(self, url):
+        """Return the info dict for one clipsyndicate video.
+
+        The embed player script supplies the "flvars" query string, which is
+        then used to request the playlist XML describing the track.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        js_player = self._download_webpage(
+            'http://eplayer.clipsyndicate.com/embed/player.js?va_id=%s' % video_id,
+            video_id, u'Downloading player')
+        # it includes a required token
+        flvars = self._search_regex(r'flvars: "(.*?)"', js_player, u'flvars')
+
+        playlist_page = self._download_webpage(
+            'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars,
+            video_id, u'Downloading video info')
+        # Fix broken xml: escape every ampersand so the document parses
+        # (replacing '&' with itself would leave the page unchanged)
+        playlist_page = re.sub('&', '&amp;', playlist_page)
+        pdoc = xml.etree.ElementTree.fromstring(playlist_page.encode('utf-8'))
+
+        track_doc = pdoc.find('trackList/track')
+
+        def find_param(name):
+            """Return the 'value' attribute of the named <param>, or None."""
+            node = find_xpath_attr(track_doc, './/param', 'name', name)
+            if node is not None:
+                return node.attrib['value']
+            return None
+
+        # Avoid int(None) when the track carries no duration parameter
+        duration = find_param('duration')
+
+        return {
+            'id': video_id,
+            'title': find_param('title'),
+            'url': track_doc.find('location').text,
+            'thumbnail': find_param('thumbnail'),
+            'duration': int(duration) if duration is not None else None,
+        }
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import determine_ext
path = mobj.group('path')
page_title = mobj.group('title')
info_url = u'http://cnn.com/video/data/3.0/%s/index.xml' % path
- info_xml = self._download_webpage(info_url, page_title)
- info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
+ info = self._download_xml(info_url, page_title)
formats = []
for f in info.findall('files/file'):
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
self.report_extraction(video_id)
xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
- metaXml = self._download_webpage(xmlUrl, video_id,
+ mdoc = self._download_xml(xmlUrl, video_id,
u'Downloading info XML',
u'Unable to download video info XML')
- mdoc = xml.etree.ElementTree.fromstring(metaXml)
try:
videoNode = mdoc.findall('./video')[0]
youtubeIdNode = videoNode.find('./youtubeID')
if next_url.endswith(u'manifest.f4m'):
manifest_url = next_url + '?hdcore=2.10.3'
- manifestXml = self._download_webpage(manifest_url, video_id,
+ adoc = self._download_xml(manifest_url, video_id,
u'Downloading XML manifest',
u'Unable to download video info XML')
- adoc = xml.etree.ElementTree.fromstring(manifestXml)
try:
- media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
- node_id = media_node.attrib['url']
video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
- except IndexError as err:
+ except IndexError:
raise ExtractorError(u'Invalid manifest file')
url_pr = compat_urllib_parse_urlparse(info['thumbnail'])
info['url'] = url_pr.scheme + '://' + url_pr.netloc + video_id[:-2].replace('.csmil','').replace(',','')
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
+from .mtv import MTVServicesInfoExtractor
from ..utils import (
compat_str,
compat_urllib_parse,
)
-class ComedyCentralIE(InfoExtractor):
+class ComedyCentralIE(MTVServicesInfoExtractor):
+    """Extractor for comedycentral.com video-clips, episodes and cc-studios pages.
+
+    The page only supplies the mgid; the extraction itself is delegated to
+    the inherited _get_videos_info.
+    """
+    # Dots in the host name are escaped so they only match a literal "."
+    _VALID_URL = r'http://www\.comedycentral\.com/(video-clips|episodes|cc-studios)/(?P<title>.*)'
+    _FEED_URL = u'http://comedycentral.com/feeds/mrss/'
+
+    _TEST = {
+        u'url': u'http://www.comedycentral.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother',
+        u'md5': u'4167875aae411f903b751a21f357f1ee',
+        u'info_dict': {
+            u'id': u'cef0cbb3-e776-4bc9-b62e-8016deccb354',
+            u'ext': u'mp4',
+            u'title': u'Uncensored - Greg Fitzsimmons - Too Good of a Mother',
+            u'description': u'After a certain point, breastfeeding becomes c**kblocking.',
+        },
+    }
+
+    def _real_extract(self, url):
+        """Find the data-mgid attribute in the page and resolve it to video info."""
+        mobj = re.match(self._VALID_URL, url)
+        title = mobj.group('title')
+        webpage = self._download_webpage(url, title)
+        mgid = self._search_regex(r'data-mgid="(?P<mgid>mgid:.*?)"',
+                                  webpage, u'mgid')
+        return self._get_videos_info(mgid)
+
+
+class ComedyCentralShowsIE(InfoExtractor):
IE_DESC = u'The Daily Show / Colbert Report'
# urls can be abbreviations like :thedailyshow or :colbert
# urls for episodes like:
uri = mMovieParams[0][1]
indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
- indexXml = self._download_webpage(indexUrl, epTitle,
+ idoc = self._download_xml(indexUrl, epTitle,
u'Downloading show index',
u'unable to download episode index')
results = []
- idoc = xml.etree.ElementTree.fromstring(indexXml)
itemEls = idoc.findall('.//item')
for partNum,itemEl in enumerate(itemEls):
mediaId = itemEl.findall('./guid')[0].text
configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
compat_urllib_parse.urlencode({'uri': mediaId}))
- configXml = self._download_webpage(configUrl, epTitle,
+ cdoc = self._download_xml(configUrl, epTitle,
u'Downloading configuration for %s' % shortMediaId)
- cdoc = xml.etree.ElementTree.fromstring(configXml)
turls = []
for rendition in cdoc.findall('.//rendition'):
finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
})
effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
- info = {
+ results.append({
'id': shortMediaId,
'formats': formats,
'uploader': showId,
'title': effTitle,
'thumbnail': None,
'description': compat_str(officialTitle),
- }
-
- # TODO: Remove when #980 has been merged
- info.update(info['formats'][-1])
-
- results.append(info)
+ })
return results
import socket
import sys
import netrc
+import xml.etree.ElementTree
from ..utils import (
compat_http_client,
compat_urllib_error,
- compat_urllib_request,
compat_str,
clean_html,
unescapeHTML,
)
+
class InfoExtractor(object):
"""Information Extractor class.
("3D" or "DASH video")
* width Width of the video, if known
* height Height of the video, if known
+ * abr Average audio bitrate in KBit/s
+ * acodec Name of the audio codec in use
+ * vbr Average video bitrate in KBit/s
+ * vcodec Name of the video codec in use
+ * filesize The number of bytes, if known in advance
webpage_url: The url to the video webpage, if given to youtube-dl it
should allow to get the same result again. (It will be set
by YoutubeDL if it's missing)
elif note is not False:
self.to_screen(u'%s: %s' % (video_id, note))
try:
- return compat_urllib_request.urlopen(url_or_request)
+ return self._downloader.urlopen(url_or_request)
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
if errnote is None:
errnote = u'Unable to download webpage'
""" Returns the data of the page as a string """
return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
+ def _download_xml(self, url_or_request, video_id,
+                   note=u'Downloading XML', errnote=u'Unable to download XML'):
+     """Download a page and return it parsed as XML.
+
+     The page is fetched through _download_webpage (note and errnote are
+     passed along), encoded as UTF-8 and parsed; the result is the root
+     xml.etree.ElementTree.Element.
+     """
+     page = self._download_webpage(url_or_request, video_id, note, errnote)
+     return xml.etree.ElementTree.fromstring(page.encode('utf-8'))
+
def to_screen(self, msg):
"""Print msg to screen, prefixing it with '[ie_name]'"""
self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
self.to_screen(u'Logging in')
#Methods for following #608
- def url_result(self, url, ie=None):
+ def url_result(self, url, ie=None, video_id=None):
"""Returns a url that points to a page that should be processed"""
#TODO: ie should be the class used for getting the info
video_info = {'_type': 'url',
'url': url,
'ie_key': ie}
+ if video_id is not None:
+ video_info['id'] = video_id
return video_info
def playlist_result(self, entries, playlist_id=None, playlist_title=None):
"""Returns a playlist"""
# Helper functions for extracting OpenGraph info
@staticmethod
- def _og_regex(prop):
- return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop)
+ def _og_regexes(prop):
+     """Return two regexes matching a <meta> tag for the OpenGraph property.
+
+     The first expects the property attribute before the content attribute,
+     the second the reverse order.
+     """
+     property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
+     content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
+     attribute_orders = (
+         (property_re, content_re),
+         (content_re, property_re),
+     )
+     return [r'<meta[^>]+?%s[^>]+?%s' % pair for pair in attribute_orders]
def _og_search_property(self, prop, html, name=None, **kargs):
if name is None:
name = 'OpenGraph %s' % prop
- escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
- if not escaped is None:
- return unescapeHTML(escaped)
- return None
+ escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
+ if escaped is None:
+ return None
+ return unescapeHTML(escaped)
def _og_search_thumbnail(self, html, **kargs):
return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
return self._og_search_property('title', html, **kargs)
def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
- regexes = [self._og_regex('video')]
- if secure: regexes.insert(0, self._og_regex('video:secure_url'))
+ regexes = self._og_regexes('video')
+ if secure: regexes = self._og_regexes('video:secure_url') + regexes
return self._html_search_regex(regexes, html, name, **kargs)
+ def _html_search_meta(self, name, html, display_name=None):
+     """Return the content of the <meta> tag identified by name.
+
+     The tag may carry the name in its itemprop, name or property
+     attribute. The search is non-fatal; display_name (default: name) is
+     the label passed to _html_search_regex.
+     """
+     meta_re = r'''(?ix)<meta
+                 (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
+                 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name)
+     label = name if display_name is None else display_name
+     return self._html_search_regex(meta_re, html, label, fatal=False)
+
+ def _dc_search_uploader(self, html):
+     """Return the content of the 'dc.creator' meta tag, labelled 'uploader'."""
+     return self._html_search_meta('dc.creator', html, 'uploader')
+
def _rta_search(self, html):
# See http://www.rtalabel.org/index.php?content=howtofaq#single
if re.search(r'(?ix)<meta\s+name="rating"\s+'
return 18
return 0
+ def _media_rating_search(self, html):
+     """Translate the page's "rating" meta tag into a number.
+
+     Returns None when the tag is missing/empty or when its value
+     (compared case-insensitively) is not in the table below.
+     """
+     # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
+     RATING_TABLE = {
+         'safe for kids': 0,
+         'general': 8,
+         '14 years': 14,
+         'mature': 17,
+         'restricted': 19,
+     }
+
+     rating = self._html_search_meta('rating', html)
+     if rating:
+         return RATING_TABLE.get(rating.lower())
+     return None
+
+
class SearchInfoExtractor(InfoExtractor):
"""
--- /dev/null
+# encoding: utf-8
+from .canalplus import CanalplusIE
+
+
+class D8IE(CanalplusIE):
+    """Extractor for d8.tv pages, reusing the CanalplusIE extraction logic.
+
+    Only the URL pattern, the video info endpoint and the test are overridden.
+    """
+    # This pattern defines only a 'path' group (no 'id' group)
+    _VALID_URL = r'https?://www\.d8\.tv/.*?/(?P<path>.*)'
+    # Same service host as CanalplusIE, with 'd8' in place of 'cplus'
+    _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/d8/%s'
+    IE_NAME = u'd8.tv'
+
+    _TEST = {
+        u'url': u'http://www.d8.tv/d8-docs-mags/pid6589-d8-campagne-intime.html',
+        u'file': u'966289.flv',
+        u'info_dict': {
+            u'title': u'Campagne intime - Documentaire exceptionnel',
+            u'description': u'md5:d2643b799fb190846ae09c61e59a859f',
+            u'upload_date': u'20131108',
+        },
+        u'params': {
+            # rtmp
+            u'skip_download': True,
+        },
+    }
webpage = self._download_webpage(request,
id, u'Downloading page %s' % pagenum)
- playlist_el = get_element_by_attribute(u'class', u'video_list', webpage)
+ playlist_el = get_element_by_attribute(u'class', u'row video_list', webpage)
video_ids.extend(re.findall(r'data-id="(.+?)"', playlist_el))
if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:
# encoding: utf-8
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
video_id = mobj.group(1)
canonical_url = 'http://tvpot.daum.net/v/%s' % video_id
webpage = self._download_webpage(canonical_url, video_id)
- full_id = self._search_regex(r'<link rel="video_src" href=".+?vid=(.+?)"',
+ full_id = self._search_regex(
+ r'<iframe src="http://videofarm.daum.net/controller/video/viewer/Video.html\?.*?vid=(.+?)[&"]',
webpage, u'full id')
query = compat_urllib_parse.urlencode({'vid': full_id})
- info_xml = self._download_webpage(
+ info = self._download_xml(
'http://tvpot.daum.net/clip/ClipInfoXml.do?' + query, video_id,
u'Downloading video info')
- urls_xml = self._download_webpage(
+ urls = self._download_xml(
'http://videofarm.daum.net/controller/api/open/v1_2/MovieData.apixml?' + query,
video_id, u'Downloading video formats info')
- info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
- urls = xml.etree.ElementTree.fromstring(urls_xml.encode('utf-8'))
self.to_screen(u'%s: Getting video urls' % video_id)
formats = []
'vid': full_id,
'profile': profile,
})
- url_xml = self._download_webpage(
+ url_doc = self._download_xml(
'http://videofarm.daum.net/controller/api/open/v1_2/MovieLocation.apixml?' + format_query,
video_id, note=False)
- url_doc = xml.etree.ElementTree.fromstring(url_xml.encode('utf-8'))
format_url = url_doc.find('result/url').text
formats.append({
'url': format_url,
'format_id': profile,
})
- info = {
+ return {
'id': video_id,
'title': info.find('TITLE').text,
'formats': formats,
'duration': int(info.find('DURATION').text),
'upload_date': info.find('REGDTTM').text[:8],
}
- # TODO: Remove when #980 has been merged
- info.update(formats[-1])
- return info
# coding: utf-8
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id
- details_xml = self._download_webpage(details_url, video_id, note=u'Downloading video details')
- details_doc = xml.etree.ElementTree.fromstring(details_xml.encode('utf-8'))
+ details_doc = self._download_xml(details_url, video_id, note=u'Downloading video details')
thumbnail_els = details_doc.findall('.//teaserimage')
thumbnails = [{
return (qidx, prefer_http, format['video_bitrate'])
formats.sort(key=_sortkey)
- info = {
+ return {
'_type': 'video',
'id': video_id,
'title': video_title,
'uploader': video_uploader,
'upload_date': upload_date,
}
-
- # TODO: Remove when #980 has been merged
- info.update(formats[-1])
-
- return info
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import determine_ext
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
- config_xml = self._download_webpage(
+ config = self._download_xml(
'http://www.ebaumsworld.com/video/player/%s' % video_id, video_id)
- config = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8'))
video_url = config.find('file').text
return {
-import itertools
import json
import random
import re
class EscapistIE(InfoExtractor):
- _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
+ # The scheme is mandatory and must be followed by exactly "//"
+ _VALID_URL = r'^https?://(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
_TEST = {
u'url': u'http://www.escapistmagazine.com/videos/view/the-escapist-presents/6618-Breaking-Down-Baldurs-Gate',
u'file': u'6618-Breaking-Down-Baldurs-Gate.mp4',
- u'md5': u'c6793dbda81388f4264c1ba18684a74d',
+ u'md5': u'ab3a706c681efca53f0a35f1415cf0d1',
u'info_dict': {
u"description": u"Baldur's Gate: Original, Modded or Enhanced Edition? I'll break down what you can expect from the new Baldur's Gate: Enhanced Edition.",
u"uploader": u"the-escapist-presents",
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
showName = mobj.group('showname')
videoId = mobj.group('episode')
self.report_extraction(videoId)
webpage = self._download_webpage(url, videoId)
- videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
+ videoDesc = self._html_search_regex(
+ r'<meta name="description" content="([^"]*)"',
webpage, u'description', fatal=False)
- playerUrl = self._og_search_video_url(webpage, name='player url')
+ playerUrl = self._og_search_video_url(webpage, name=u'player URL')
- title = self._html_search_regex('<meta name="title" content="([^"]*)"',
- webpage, u'player url').split(' : ')[-1]
+ title = self._html_search_regex(
+ r'<meta name="title" content="([^"]*)"',
+ webpage, u'title').split(' : ')[-1]
- configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
+ configUrl = self._search_regex('config=(.*)$', playerUrl, u'config URL')
configUrl = compat_urllib_parse.unquote(configUrl)
- configJSON = self._download_webpage(configUrl, videoId,
- u'Downloading configuration',
- u'unable to download configuration')
-
- # Technically, it's JavaScript, not JSON
- configJSON = configJSON.replace("'", '"')
-
+ formats = []
+
+ def _add_format(name, cfgurl):
+     """Download one player configuration and append its video to formats.
+
+     name is used both in the progress messages and as the format_id.
+     Raises ExtractorError when the configuration is not valid JSON.
+     """
+     raw_config = self._download_webpage(
+         cfgurl, videoId,
+         u'Downloading ' + name + ' configuration',
+         u'Unable to download ' + name + ' configuration')
+
+     # Technically, it's JavaScript, not JSON
+     raw_config = raw_config.replace("'", '"')
+
+     try:
+         config = json.loads(raw_config)
+     except ValueError as err:
+         raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
+
+     # The video URL is taken from the second playlist entry
+     formats.append({
+         'url': config['playlist'][1]['url'],
+         'format_id': name,
+     })
+
+ _add_format(u'normal', configUrl)
+ # Append the hq flag once, using '&' when the URL already has a query
+ # string and '?' otherwise
+ hq_url = (configUrl +
+     ('&hq=1' if '?' in configUrl else '?hq=1'))
try:
- config = json.loads(configJSON)
- except (ValueError,) as err:
- raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
+ _add_format(u'hq', hq_url)
+ except ExtractorError:
+ pass # That's fine, we'll just use normal quality
- playlist = config['playlist']
- videoUrl = playlist[1]['url']
-
- info = {
+ return {
'id': videoId,
- 'url': videoUrl,
+ 'formats': formats,
'uploader': showName,
- 'upload_date': None,
'title': title,
- 'ext': 'mp4',
'thumbnail': self._og_search_thumbnail(webpage),
'description': videoDesc,
'player_url': playerUrl,
}
-
- return [info]
import json
-import netrc
import re
import socket
# encoding: utf-8
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
webpage = self._download_webpage(url, video_id)
config_xml_url = self._search_regex(r'writeFLV\(\'(.+?)\',', webpage,
u'config xml url')
- config_xml = self._download_webpage(config_xml_url, video_id,
+ config = self._download_xml(config_xml_url, video_id,
u'Downloading config xml')
- config = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8'))
encodings = config.find('ENCODINGS')
formats = []
})
descr = self._html_search_regex(r'<p class="Content Copy">(.*?)</p>', webpage, u'description')
- info = {
+ return {
'id': video_id,
'title': self._og_search_title(webpage),
'formats': formats,
'description': descr,
'thumbnail': config.find('STILL/STILL_BIG').text,
}
- # TODO: Remove when #980 has been merged
- info.update(formats[-1])
- return info
for i, _ in enumerate(files, 1):
video_id = '%04d%d' % (episode, i)
video_url = 'http://dl%d.fernsehkritik.tv/fernsehkritik%d%s.flv' % (server, episode, '' if i == 1 else '-%d' % i)
- video_title = 'Fernsehkritik %d.%d' % (episode, i)
videos.append({
'id': video_id,
'url': video_url,
# encoding: utf-8
import re
-import xml.etree.ElementTree
import json
from .common import InfoExtractor
class FranceTVBaseInfoExtractor(InfoExtractor):
def _extract_video(self, video_id):
- xml_desc = self._download_webpage(
+ info = self._download_xml(
'http://www.francetvinfo.fr/appftv/webservices/video/'
'getInfosOeuvre.php?id-diffusion='
+ video_id, video_id, 'Downloading XML config')
- info = xml.etree.ElementTree.fromstring(xml_desc.encode('utf-8'))
manifest_url = info.find('videos/video/url').text
video_url = manifest_url.replace('manifest.f4m', 'index_2_av.m3u8')
--- /dev/null
+import re
+
+from .common import InfoExtractor
+
+
+class GamekingsIE(InfoExtractor):
+    """Extractor for video pages on www.gamekings.tv."""
+
+    # "https?" rather than "http?": the latter made the "p" optional and
+    # so never matched https:// URLs.
+    _VALID_URL = r'https?://www\.gamekings\.tv/videos/(?P<name>[0-9a-z\-]+)'
+    _TEST = {
+        u"url": u"http://www.gamekings.tv/videos/phoenix-wright-ace-attorney-dual-destinies-review/",
+        u'file': u'20130811.mp4',
+        # MD5 is flaky, seems to change regularly
+        #u'md5': u'2f32b1f7b80fdc5cb616efb4f387f8a3',
+        u'info_dict': {
+            u"title": u"Phoenix Wright: Ace Attorney \u2013 Dual Destinies Review",
+            u"description": u"Melle en Steven hebben voor de review een week in de rechtbank doorbracht met Phoenix Wright: Ace Attorney - Dual Destinies.",
+        }
+    }
+
+    def _real_extract(self, url):
+        """Build the info dict for the video embedded in the page at *url*.
+
+        The page's og:video URL provides the media location; its first run
+        of digits is used as the video id.  Title and description come from
+        the page's Open Graph tags.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        name = mobj.group('name')
+        webpage = self._download_webpage(url, name)
+        video_url = self._og_search_video_url(webpage)
+
+        # Going through _search_regex means a stream URL without digits is
+        # reported by the extractor's own lookup helper instead of failing
+        # with an AttributeError on a None match object.
+        video_id = self._search_regex(r'([0-9]+)', video_url, u'video id')
+
+        # Todo: add medium format
+        # The "large" rendition lives in a sub-directory named after the
+        # quality, directly in front of the id.
+        video_url = video_url.replace(video_id, 'large/' + video_id)
+
+        return {
+            'id': video_id,
+            'ext': 'mp4',
+            'url': video_url,
+            'title': self._og_search_title(webpage),
+            'description': self._og_search_description(webpage),
+        }
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- page_id = video_id = mobj.group('page_id')
+ page_id = mobj.group('page_id')
webpage = self._download_webpage(url, page_id)
data_video_json = self._search_regex(r'data-video=\'(.*?)\'', webpage, u'data video')
data_video = json.loads(unescapeHTML(data_video_json))
'format_id': q,
})
- info = {
+ return {
'id': data_video['guid'],
'title': compat_urllib_parse.unquote(data_video['title']),
'formats': formats,
'description': get_meta_content('description', webpage),
'thumbnail': self._og_search_thumbnail(webpage),
}
- # TODO: Remove when #980 has been merged
- info.update(formats[-1])
- return info
import re
-from .mtv import MTVIE, _media_xml_tag
+from .mtv import MTVServicesInfoExtractor
-class GametrailersIE(MTVIE):
- """
- Gametrailers use the same videos system as MTVIE, it just changes the feed
- url, where the uri is and the method to get the thumbnails.
- """
+
+class GametrailersIE(MTVServicesInfoExtractor):
_VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
+
_TEST = {
u'url': u'http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer',
u'file': u'70e9a5d7-cf25-4a10-9104-6f3e7342ae0d.mp4',
u'description': u'Faith is back! Check out the World Premiere trailer for Mirror\'s Edge 2 straight from the EA Press Conference at E3 2013!',
},
}
- # Overwrite MTVIE properties we don't want
- _TESTS = []
_FEED_URL = 'http://www.gametrailers.com/feeds/mrss'
- def _get_thumbnail_url(self, uri, itemdoc):
- search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail'))
- return itemdoc.find(search_path).attrib['url']
-
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
raise ExtractorError(u'Failed to download URL: %s' % url)
self.report_extraction(video_id)
+
+ # it's tempting to parse this further, but you would
+ # have to take into account all the variations like
+ # Video Title - Site Name
+ # Site Name | Video Title
+ # Video Title - Tagline | Site Name
+ # and so on and so forth; it's just not practical
+ video_title = self._html_search_regex(r'<title>(.*)</title>',
+ webpage, u'video title', default=u'video', flags=re.DOTALL)
+
# Look for BrightCove:
bc_url = BrightcoveIE._extract_brightcove_url(webpage)
if bc_url is not None:
return self.url_result(surl, 'Vimeo')
# Look for embedded YouTube player
- mobj = re.search(
- r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?youtube.com/embed/.+?)\1', webpage)
- if mobj:
- surl = unescapeHTML(mobj.group(u'url'))
- return self.url_result(surl, 'Youtube')
+ matches = re.findall(
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?youtube.com/embed/.+?)\1', webpage)
+ if matches:
+ urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Youtube')
+ for tuppl in matches]
+ return self.playlist_result(
+ urlrs, playlist_id=video_id, playlist_title=video_title)
+
+ # Look for embedded Dailymotion player
+ matches = re.findall(
+ r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion.com/embed/video/.+?)\1', webpage)
+ if matches:
+ urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Dailymotion')
+ for tuppl in matches]
+ return self.playlist_result(
+ urlrs, playlist_id=video_id, playlist_title=video_title)
# Look for Bandcamp pages with custom domain
mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
if mobj is not None:
burl = unescapeHTML(mobj.group(1))
- return self.url_result(burl, 'Bandcamp')
+ # Don't set the extractor because it can be a track url or an album
+ return self.url_result(burl)
# Start with something easy: JW Player in SWFObject
mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
if mobj is None:
# Broaden the search a little bit: JWPlayer JS loader
- mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http[^\'"&]*)', webpage)
+ mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http[^\'"]*)', webpage)
if mobj is None:
# Try to find twitter cards info
mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
# here's a fun little line of code for you:
- video_extension = os.path.splitext(video_id)[1][1:]
video_id = os.path.splitext(video_id)[0]
- # it's tempting to parse this further, but you would
- # have to take into account all the variations like
- # Video Title - Site Name
- # Site Name | Video Title
- # Video Title - Tagline | Site Name
- # and so on and so forth; it's just not practical
- video_title = self._html_search_regex(r'<title>(.*)</title>',
- webpage, u'video title', default=u'video', flags=re.DOTALL)
-
# video uploader is domain name
video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
url, u'video uploader')
- return [{
+ return {
'id': video_id,
'url': video_url,
'uploader': video_uploader,
'upload_date': None,
'title': video_title,
- 'ext': video_extension,
- }]
+ }
_TEST = {
u'url': u'http://www.howcast.com/videos/390161-How-to-Tie-a-Square-Knot-Properly',
u'file': u'390161.mp4',
- u'md5': u'1d7ba54e2c9d7dc6935ef39e00529138',
+ u'md5': u'8b743df908c42f60cf6496586c7f12c3',
u'info_dict': {
u"description": u"The square knot, also known as the reef knot, is one of the oldest, most basic knots to tie, and can be used in many different ways. Here's the proper way to tie a square knot.",
u"title": u"How to Tie a Square Knot Properly"
--- /dev/null
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_urlparse,
+ get_element_by_attribute,
+)
+
+
+class ImdbIE(InfoExtractor):
+    """Extractor for trailers served from www.imdb.com/video/imdb/."""
+
+    IE_NAME = u'imdb'
+    IE_DESC = u'Internet Movie Database trailers'
+    _VALID_URL = r'http://www\.imdb\.com/video/imdb/vi(?P<id>\d+)'
+
+    _TEST = {
+        u'url': u'http://www.imdb.com/video/imdb/vi2524815897',
+        u'md5': u'9f34fa777ade3a6e57a054fdbcb3a068',
+        u'info_dict': {
+            u'id': u'2524815897',
+            u'ext': u'mp4',
+            u'title': u'Ice Age: Continental Drift Trailer (No. 2) - IMDb',
+            u'description': u'md5:9061c2219254e5d14e03c25c98e96a81',
+            u'duration': 151,
+        }
+    }
+
+    def _real_extract(self, url):
+        """Build the info dict (with one entry per format) for *url*.
+
+        The video page lists its formats as "case '<id>' : url = '<path>'"
+        pairs; each path leads to a format page carrying a JSON blob with
+        the media URL.  Thumbnail and duration are taken from the last
+        format page that was fetched and are None when the page lists no
+        formats at all.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        webpage = self._download_webpage(url, video_id)
+        descr = get_element_by_attribute('itemprop', 'description', webpage)
+        available_formats = re.findall(
+            r'case \'(?P<f_id>.*?)\' :$\s+url = \'(?P<path>.*?)\'', webpage,
+            flags=re.MULTILINE)
+        formats = []
+        format_info = info = None
+        for f_id, f_path in available_formats:
+            # The second positional argument is the video id and the third
+            # is the progress note; previously the note was passed in the
+            # id slot.
+            format_page = self._download_webpage(
+                compat_urlparse.urljoin(url, f_path), video_id,
+                u'Downloading info for %s format' % f_id)
+            json_data = self._search_regex(
+                r'<script[^>]+class="imdb-player-data"[^>]*?>(.*?)</script>',
+                format_page, u'json data', flags=re.DOTALL)
+            info = json.loads(json_data)
+            format_info = info['videoPlayerObject']['video']
+            formats.append({
+                'format_id': f_id,
+                'url': format_info['url'],
+                # The selected encoding has a one-character suffix that is
+                # dropped before conversion (presumably like "480p").
+                'height': int(info['titleObject']['encoding']['selected'][:-1]),
+            })
+
+        # Without any format page there is nothing to read the thumbnail
+        # and duration from; avoid a NameError on the loop variables.
+        if formats:
+            thumbnail = format_info['slate']
+            duration = int(info['titleObject']['title']['duration_seconds'])
+        else:
+            thumbnail = duration = None
+
+        return {
+            'id': video_id,
+            'title': self._og_search_title(webpage),
+            'formats': formats,
+            'description': descr,
+            'thumbnail': thumbnail,
+            'duration': duration,
+        }
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
video_id = query_dic['publishedid'][0]
url = self._build_url(query)
- flashconfiguration_xml = self._download_webpage(url, video_id,
+ flashconfiguration = self._download_xml(url, video_id,
u'Downloading flash configuration')
- flashconfiguration = xml.etree.ElementTree.fromstring(flashconfiguration_xml.encode('utf-8'))
file_url = flashconfiguration.find('file').text
file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx')
# Replace some of the parameters in the query to get the best quality
file_url = re.sub(r'(?<=\?)(.+)$',
lambda m: self._clean_query(m.group()),
file_url)
- info_xml = self._download_webpage(file_url, video_id,
+ info = self._download_xml(file_url, video_id,
u'Downloading video info')
- info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
item = info.find('channel/item')
def _bp(p):
import json
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- title = re.match(self._VALID_URL, url).group(1)
+ title = mobj.group(1)
webpage = self._download_webpage(url, title)
xml_link = self._html_search_regex(
r'<param name="flashvars" value="config=(.*?)" />',
r'http://www\.jeuxvideo\.com/config/\w+/\d+/(.*?)/\d+_player\.xml',
xml_link, u'video ID')
- xml_config = self._download_webpage(
+ config = self._download_xml(
xml_link, title, u'Downloading XML config')
- config = xml.etree.ElementTree.fromstring(xml_config.encode('utf-8'))
- info_json = self._search_regex(
- r'(?sm)<format\.json>(.*?)</format\.json>',
- xml_config, u'JSON information')
+ info_json = config.find('format.json').text
info = json.loads(info_json)['versions'][0]
video_url = 'http://video720.jeuxvideo.com/' + info['file']
import json
import os
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
archive_id = m.group(1)
api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
- chapter_info_xml = self._download_webpage(api, chapter_id,
+ doc = self._download_xml(api, chapter_id,
note=u'Downloading chapter information',
errnote=u'Chapter information download failed')
- doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
for a in doc.findall('.//archive'):
if archive_id == a.find('./id').text:
break
from ..utils import (
compat_urllib_parse_urlparse,
compat_urlparse,
- get_meta_content,
- ExtractorError,
+ xpath_with_ns,
)
class LivestreamIE(InfoExtractor):
+ IE_NAME = u'livestream'
_VALID_URL = r'http://new.livestream.com/.*?/(?P<event_name>.*?)(/videos/(?P<id>\d+))?/?$'
_TEST = {
u'url': u'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370',
info = json.loads(self._download_webpage(api_url, video_id,
u'Downloading video info'))
return self._extract_video_info(info)
+
+
+# The original version of Livestream uses a different system
+class LivestreamOriginalIE(InfoExtractor):
+    """Extractor for clips hosted on www.livestream.com (clipId URLs)."""
+
+    IE_NAME = u'livestream:original'
+    _VALID_URL = r'https?://www\.livestream\.com/(?P<user>[^/]+)/video\?.*?clipId=(?P<id>.*?)(&|$)'
+    _TEST = {
+        u'url': u'http://www.livestream.com/dealbook/video?clipId=pla_8aa4a3f1-ba15-46a4-893b-902210e138fb',
+        u'info_dict': {
+            u'id': u'pla_8aa4a3f1-ba15-46a4-893b-902210e138fb',
+            u'ext': u'flv',
+            u'title': u'Spark 1 (BitCoin) with Cameron Winklevoss & Tyler Winklevoss of Winklevoss Capital',
+        },
+        u'params': {
+            # rtmp
+            u'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        """Return the info dict for the clip addressed by *url*.
+
+        Clip details are fetched as XML from the per-user API host; the
+        RTMP play path is derived from the path of the clip's thumbnail.
+        """
+        match = re.match(self._VALID_URL, url)
+        video_id, user = match.group('id', 'user')
+        api_url = 'http://x{0}x.api.channel.livestream.com/2.0/clipdetails?extendedInfo=true&id={1}'.format(user, video_id)
+
+        clip_doc = self._download_xml(api_url, video_id)
+        channel_node = clip_doc.find('channel')
+        item = channel_node.find('item')
+
+        media_ns = {'media': 'http://search.yahoo.com/mrss'}
+        thumbnail_node = item.find(xpath_with_ns('media:thumbnail', media_ns))
+        thumbnail_url = thumbnail_node.attrib['url']
+        # Keep the "user-files/..." part of the thumbnail URL up to its last
+        # underscore, i.e. drop the trailing number and the .jpg extension.
+        path = self._search_regex(r'(user-files/.+)_.*?\.jpg$', thumbnail_url, u'path')
+
+        return {
+            'id': video_id,
+            'title': item.find('title').text,
+            'url': 'rtmp://extondemand.livestream.com/ondemand',
+            'play_path': 'mp4:trans/dv15/mogulus-{0}.mp4'.format(path),
+            'ext': 'flv',
+            'thumbnail': thumbnail_url,
+        }
description = self._html_search_regex(r'<b>Description:</b>(.*?)</p>',
webpage, u'description', flags=re.DOTALL)
- info = {
+ return {
'id': video_id,
'title': clip.find('title').text,
'formats': formats,
'description': description,
'duration': int(clip.find('duration').text),
}
- # TODO: Remove when #980 has been merged
- info.update(formats[-1])
- return info
'title': info['name'],
'url': final_song_url,
'ext': 'mp3',
- 'description': info['description'],
+ 'description': info.get('description'),
'thumbnail': info['pictures'].get('extra_large'),
'uploader': info['user']['name'],
'uploader_id': info['user']['username'],
def _media_xml_tag(tag):
return '{http://search.yahoo.com/mrss/}%s' % tag
-class MTVIE(InfoExtractor):
- _VALID_URL = r'^https?://(?:www\.)?mtv\.com/videos/.+?/(?P<videoid>[0-9]+)/[^/]+$'
-
- _FEED_URL = 'http://www.mtv.com/player/embed/AS3/rss/'
-
- _TESTS = [
- {
- u'url': u'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml',
- u'file': u'853555.mp4',
- u'md5': u'850f3f143316b1e71fa56a4edfd6e0f8',
- u'info_dict': {
- u'title': u'Taylor Swift - "Ours (VH1 Storytellers)"',
- u'description': u'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.',
- },
- },
- {
- u'add_ie': ['Vevo'],
- u'url': u'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml',
- u'file': u'USCJY1331283.mp4',
- u'md5': u'73b4e7fcadd88929292fe52c3ced8caf',
- u'info_dict': {
- u'title': u'Everything Has Changed',
- u'upload_date': u'20130606',
- u'uploader': u'Taylor Swift',
- },
- u'skip': u'VEVO is only available in some countries',
- },
- ]
+class MTVServicesInfoExtractor(InfoExtractor):
@staticmethod
def _id_from_uri(uri):
return uri.split(':')[-1]
def _transform_rtmp_url(rtmp_video_url):
m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp\..+?/.*)$', rtmp_video_url)
if not m:
- raise ExtractorError(u'Cannot transform RTMP url')
+ return rtmp_video_url
base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
return base + m.group('finalid')
def _get_thumbnail_url(self, uri, itemdoc):
- return 'http://mtv.mtvnimages.com/uri/' + uri
+ search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail'))
+ thumb_node = itemdoc.find(search_path)
+ if thumb_node is None:
+ return None
+ else:
+ return thumb_node.attrib['url']
def _extract_video_formats(self, metadataXml):
if '/error_country_block.swf' in metadataXml:
raise ExtractorError(u'This video is not available from your country.', expected=True)
mdoc = xml.etree.ElementTree.fromstring(metadataXml.encode('utf-8'))
- renditions = mdoc.findall('.//rendition')
formats = []
for rendition in mdoc.findall('.//rendition'):
else:
description = None
- info = {
+ return {
'title': itemdoc.find('title').text,
'formats': self._extract_video_formats(mediagen_page),
'id': video_id,
'description': description,
}
- # TODO: Remove when #980 has been merged
- info.update(info['formats'][-1])
-
- return info
-
def _get_videos_info(self, uri):
video_id = self._id_from_uri(uri)
data = compat_urllib_parse.urlencode({'uri': uri})
- infoXml = self._download_webpage(self._FEED_URL +'?' + data, video_id,
+ idoc = self._download_xml(self._FEED_URL +'?' + data, video_id,
u'Downloading info')
- idoc = xml.etree.ElementTree.fromstring(infoXml.encode('utf-8'))
return [self._get_video_info(item) for item in idoc.findall('.//item')]
+
+class MTVIE(MTVServicesInfoExtractor):
+ _VALID_URL = r'^https?://(?:www\.)?mtv\.com/videos/.+?/(?P<videoid>[0-9]+)/[^/]+$'
+
+ _FEED_URL = 'http://www.mtv.com/player/embed/AS3/rss/'
+
+ _TESTS = [
+ {
+ u'url': u'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml',
+ u'file': u'853555.mp4',
+ u'md5': u'850f3f143316b1e71fa56a4edfd6e0f8',
+ u'info_dict': {
+ u'title': u'Taylor Swift - "Ours (VH1 Storytellers)"',
+ u'description': u'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.',
+ },
+ },
+ {
+ u'add_ie': ['Vevo'],
+ u'url': u'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml',
+ u'file': u'USCJY1331283.mp4',
+ u'md5': u'73b4e7fcadd88929292fe52c3ced8caf',
+ u'info_dict': {
+ u'title': u'Everything Has Changed',
+ u'upload_date': u'20130606',
+ u'uploader': u'Taylor Swift',
+ },
+ u'skip': u'VEVO is only available in some countries',
+ },
+ ]
+
+ def _get_thumbnail_url(self, uri, itemdoc):
+ return 'http://mtv.mtvnimages.com/uri/' + uri
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('videoid')
import os.path
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
# get metadata
metadata_url = META_DATA_URL_TEMPLATE % video_id
- metadata_text = self._download_webpage(metadata_url, video_id)
- metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
+ metadata = self._download_xml(metadata_url, video_id)
# extract values from metadata
url_flv_el = metadata.find('url_flv')
# encoding: utf-8
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
'protocol': 'p2p',
'inKey': key,
})
- info_xml = self._download_webpage(
+ info = self._download_xml(
'http://serviceapi.rmcnmv.naver.com/flash/videoInfo.nhn?' + query,
video_id, u'Downloading video info')
- urls_xml = self._download_webpage(
+ urls = self._download_xml(
'http://serviceapi.rmcnmv.naver.com/flash/playableEncodingOption.nhn?' + query_urls,
video_id, u'Downloading video formats info')
- info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
- urls = xml.etree.ElementTree.fromstring(urls_xml.encode('utf-8'))
formats = []
for format_el in urls.findall('EncodingOptions/EncodingOption'):
'height': int(format_el.find('height').text),
})
- info = {
+ return {
'id': video_id,
'title': info.find('Subject').text,
'formats': formats,
'upload_date': info.find('WriteDate').text.replace('.', ''),
'view_count': int(info.find('PlayCount').text),
}
- # TODO: Remove when #980 has been merged
- info.update(formats[-1])
- return info
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import find_xpath_attr, compat_str
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
- info_xml = self._download_webpage('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id)
- info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')).find('video')
+ all_info = self._download_xml('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id)
+ info = all_info.find('video')
return {'id': video_id,
'title': info.find('headline').text,
import re
import json
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
'path': initial_video_url.replace('.mp4', '_sd.mp4'),
})
path_url = 'http://video.nhl.com/videocenter/servlets/encryptvideopath?' + data
- path_response = self._download_webpage(path_url, video_id,
+ path_doc = self._download_xml(path_url, video_id,
u'Downloading final video url')
- path_doc = xml.etree.ElementTree.fromstring(path_response)
video_url = path_doc.find('path').text
join = compat_urlparse.urljoin
class NHLVideocenterIE(NHLBaseInfoExtractor):
IE_NAME = u'nhl.com:videocenter'
- IE_DESC = u'Download the first 12 videos from a videocenter category'
+ IE_DESC = u'NHL videocenter category'
_VALID_URL = r'https?://video\.(?P<team>[^.]*)\.nhl\.com/videocenter/(console\?.*?catid=(?P<catid>[^&]+))?'
@classmethod
--- /dev/null
+# encoding: utf-8
+
+import re
+import socket
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_http_client,
+ compat_urllib_error,
+ compat_urllib_parse,
+ compat_urllib_request,
+ compat_urlparse,
+ compat_str,
+
+ ExtractorError,
+ unified_strdate,
+)
+
+
+class NiconicoIE(InfoExtractor):
+    """Extractor for nicovideo.jp watch pages; logs in before extracting."""
+
+    IE_NAME = u'niconico'
+    IE_DESC = u'ニコニコ動画'
+
+    _TEST = {
+        u'url': u'http://www.nicovideo.jp/watch/sm22312215',
+        u'file': u'sm22312215.mp4',
+        u'md5': u'd1a75c0823e2f629128c43e1212760f9',
+        u'info_dict': {
+            u'title': u'Big Buck Bunny',
+            u'uploader': u'takuya0301',
+            u'uploader_id': u'2698420',
+            u'upload_date': u'20131123',
+            u'description': u'(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org',
+        },
+        u'params': {
+            u'username': u'ydl.niconico@gmail.com',
+            u'password': u'youtube-dl',
+        },
+    }
+
+    _VALID_URL = r'^https?://(?:www\.|secure\.)?nicovideo\.jp/watch/([a-z][a-z][0-9]+)(?:.*)$'
+    _NETRC_MACHINE = 'niconico'
+    # If True it will raise an error if no login info is provided
+    _LOGIN_REQUIRED = True
+
+    def _real_initialize(self):
+        """Log in once before any extraction takes place."""
+        self._login()
+
+    def _login(self):
+        """Submit the login form with the configured credentials.
+
+        Returns True on success.  Returns False when no username is
+        available (and login is not required) or when the response shows
+        the "Log in error" heading, in which case a warning is reported.
+
+        Raises ExtractorError when _LOGIN_REQUIRED is set and no username
+        is available.
+        """
+        (username, password) = self._get_login_info()
+        # No authentication to be performed
+        if username is None:
+            if self._LOGIN_REQUIRED:
+                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
+            return False
+
+        # Log in
+        login_form_strs = {
+            u'mail': username,
+            u'password': password,
+        }
+        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
+        # chokes on unicode
+        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
+        login_data = compat_urllib_parse.urlencode(login_form).encode('utf-8')
+        request = compat_urllib_request.Request(
+            u'https://secure.nicovideo.jp/secure/login', login_data)
+        login_results = self._download_webpage(
+            request, u'', note=u'Logging in', errnote=u'Unable to log in')
+        if re.search(r'(?i)<h1 class="mb8p4">Log in error</h1>', login_results) is not None:
+            self._downloader.report_warning(u'unable to log in: bad username or password')
+            return False
+        return True
+
+    def _real_extract(self, url):
+        """Return the info dict for the video at *url*.
+
+        Metadata comes from the getthumbinfo XML, the media URL from the
+        getflv response.  The uploader's nickname is looked up on a
+        best-effort basis; when that lookup fails a warning is reported
+        and the uploader id is used as the uploader name instead.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group(1)
+
+        # Get video webpage. We are not actually interested in it, but need
+        # the cookies in order to be able to download the info webpage
+        self._download_webpage('http://www.nicovideo.jp/watch/' + video_id, video_id)
+
+        video_info = self._download_xml(
+            'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id,
+            note=u'Downloading video info page')
+
+        # Get flv info (a query-string encoded response)
+        flv_info_webpage = self._download_webpage(
+            u'http://flapi.nicovideo.jp/api/getflv?v=' + video_id,
+            video_id, u'Downloading flv info')
+        video_real_url = compat_urlparse.parse_qs(flv_info_webpage)['url'][0]
+
+        # Start extracting information
+        video_title = video_info.find('.//title').text
+        video_extension = video_info.find('.//movie_type').text
+        video_format = video_extension.upper()
+        video_thumbnail = video_info.find('.//thumbnail_url').text
+        video_description = video_info.find('.//description').text
+        video_uploader_id = video_info.find('.//user_id').text
+        # Everything from the first '+' on is cut off before date parsing
+        video_upload_date = unified_strdate(video_info.find('.//first_retrieve').text.split('+')[0])
+        # Numeric, like the view counts returned by the other extractors
+        video_view_count = int(video_info.find('.//view_counter').text)
+        video_webpage_url = video_info.find('.//watch_url').text
+
+        # uploader: fall back to the id when the nickname cannot be fetched
+        video_uploader = video_uploader_id
+        user_info_url = 'http://seiga.nicovideo.jp/api/user/info?id=' + video_uploader_id
+        try:
+            user_info = self._download_xml(
+                user_info_url, video_id, note=u'Downloading user information')
+            video_uploader = user_info.find('.//nickname').text
+        # NOTE(review): the download helpers appear to report network
+        # failures as ExtractorError, so it is caught alongside the raw
+        # network exceptions to keep this lookup best-effort - confirm.
+        except (ExtractorError, compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+            self._downloader.report_warning(u'Unable to download user info webpage: %s' % compat_str(err))
+
+        return {
+            'id': video_id,
+            'url': video_real_url,
+            'title': video_title,
+            'ext': video_extension,
+            'format': video_format,
+            'thumbnail': video_thumbnail,
+            'description': video_description,
+            'uploader': video_uploader,
+            'upload_date': video_upload_date,
+            'uploader_id': video_uploader_id,
+            'view_count': video_view_count,
+            'webpage_url': video_webpage_url,
+        }
--- /dev/null
+import json
+import re
+
+from .common import InfoExtractor
+
+
+class PodomaticIE(InfoExtractor):
+    """Extractor for podcast entries on <channel>.podomatic.com."""
+
+    IE_NAME = 'podomatic'
+    _VALID_URL = r'^(?P<proto>https?)://(?P<channel>[^.]+)\.podomatic\.com/entry/(?P<id>[^?]+)'
+
+    _TEST = {
+        u"url": u"http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00",
+        u"file": u"2009-01-02T16_03_35-08_00.mp3",
+        u"md5": u"84bb855fcf3429e6bf72460e1eed782d",
+        u"info_dict": {
+            u"uploader": u"Science Teaching Tips",
+            u"uploader_id": u"scienceteachingtips",
+            u"title": u"64. When the Moon Hits Your Eye",
+            u"duration": 446,
+        }
+    }
+
+    def _real_extract(self, url):
+        """Return the info dict for the podcast entry at *url*.
+
+        All metadata is read from the entry's embed_params JSON document,
+        requested with the same scheme and channel host as *url*.
+        """
+        match = re.match(self._VALID_URL, url)
+        video_id = match.group('id')
+        channel = match.group('channel')
+        proto = match.group('proto')
+
+        json_url = (('%s://%s.podomatic.com/entry/embed_params/%s' +
+                     '?permalink=true&rtmp=0') %
+                    (proto, channel, video_id))
+        data = json.loads(self._download_webpage(
+            json_url, video_id, note=u'Downloading video info'))
+
+        return {
+            'id': video_id,
+            'url': data['downloadLink'],
+            'uploader': data['podcast'],
+            'title': data['title'],
+            'uploader_id': channel,
+            'thumbnail': data['imageLocation'],
+            # 'length' is divided by 1000 and truncated to a whole number
+            'duration': int(data['length'] / 1000.0),
+        }
compat_urllib_parse_urlparse,
compat_urllib_request,
compat_urllib_parse,
- unescapeHTML,
)
from ..aes import (
aes_decrypt_text
_TEST = {
u'url': u'http://www.redtube.com/66418',
u'file': u'66418.mp4',
- u'md5': u'7b8c22b5e7098a3e1c09709df1126d2d',
+ # md5 varies from time to time, as in
+ # https://travis-ci.org/rg3/youtube-dl/jobs/14052463#L295
+ #u'md5': u'7b8c22b5e7098a3e1c09709df1126d2d',
u'info_dict': {
u"title": u"Sucked on a toilet",
u"age_limit": 18,
r'<source src="(.+?)" type="video/mp4">', webpage, u'video URL')
video_title = self._html_search_regex(
- r'<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
+ r'<h1 class="videoTitle[^"]*">(.+?)</h1>',
webpage, u'title')
# No self-labeling, but they describe themselves as
u'skip_download': True,
},
},
- {
- u'url': u'http://www.rtlnitronow.de/recht-ordnung/stadtpolizei-frankfurt-gerichtsvollzieher-leipzig.php?film_id=129679&player=1&season=1',
- u'file': u'129679.flv',
- u'info_dict': {
- u'upload_date': u'20131016',
- u'title': u'Recht & Ordnung - Stadtpolizei Frankfurt/ Gerichtsvollzieher...',
- u'description': u'Stadtpolizei Frankfurt/ Gerichtsvollzieher Leipzig',
- },
- u'params': {
- u'skip_download': True,
- },
- },
{
u'url': u'http://www.n-tvnow.de/top-gear/episode-1-2013-01-01-00-00-00.php?film_id=124903&player=1&season=10',
u'file': u'124903.flv',
# coding: utf-8
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
def _extract_video(self, video_id):
data = compat_urllib_parse.urlencode({'vid': video_id})
- url_page = self._download_webpage('http://v.iask.com/v_play.php?%s' % data,
+ url_doc = self._download_xml('http://v.iask.com/v_play.php?%s' % data,
video_id, u'Downloading video url')
image_page = self._download_webpage(
'http://interface.video.sina.com.cn/interface/common/getVideoImage.php?%s' % data,
video_id, u'Downloading thumbnail info')
- url_doc = xml.etree.ElementTree.fromstring(url_page.encode('utf-8'))
return {'id': video_id,
'url': url_doc.find('./durl/url').text,
--- /dev/null
+# encoding: utf-8
+
+import re
+import json
+import hashlib
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ ExtractorError
+)
+
+
+class SmotriIE(InfoExtractor):
+    """Information extractor for single videos on smotri.com.
+
+    The media URL comes from the ``vt.php`` JSON endpoint; the remaining
+    metadata is scraped from the video web page.  Password protected videos
+    are retried with the MD5 of the ``videopassword`` parameter, and the
+    adult-content interstitial is confirmed automatically.
+    """
+    IE_DESC = u'Smotri.com'
+    IE_NAME = u'smotri'
+    _VALID_URL = r'^https?://(?:www\.)?(?P<url>smotri\.com/video/view/\?id=(?P<videoid>v(?P<realvideoid>[0-9]+)[a-z0-9]{4}))'
+
+    _TESTS = [
+        # real video id 2610366
+        {
+            u'url': u'http://smotri.com/video/view/?id=v261036632ab',
+            u'file': u'v261036632ab.mp4',
+            u'md5': u'2a7b08249e6f5636557579c368040eb9',
+            u'info_dict': {
+                u'title': u'катастрофа с камер видеонаблюдения',
+                u'uploader': u'rbc2008',
+                u'uploader_id': u'rbc08',
+                u'upload_date': u'20131118',
+                u'description': u'катастрофа с камер видеонаблюдения, видео катастрофа с камер видеонаблюдения',
+                u'thumbnail': u'http://frame6.loadup.ru/8b/a9/2610366.3.3.jpg',
+            },
+        },
+        # real video id 57591
+        {
+            u'url': u'http://smotri.com/video/view/?id=v57591cb20',
+            u'file': u'v57591cb20.flv',
+            u'md5': u'830266dfc21f077eac5afd1883091bcd',
+            u'info_dict': {
+                u'title': u'test',
+                u'uploader': u'Support Photofile@photofile',
+                u'uploader_id': u'support-photofile',
+                u'upload_date': u'20070704',
+                u'description': u'test, видео test',
+                u'thumbnail': u'http://frame4.loadup.ru/03/ed/57591.2.3.jpg',
+            },
+        },
+        # video-password
+        {
+            u'url': u'http://smotri.com/video/view/?id=v1390466a13c',
+            u'file': u'v1390466a13c.mp4',
+            u'md5': u'f6331cef33cad65a0815ee482a54440b',
+            u'info_dict': {
+                u'title': u'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1',
+                u'uploader': u'timoxa40',
+                u'uploader_id': u'timoxa40',
+                u'upload_date': u'20100404',
+                u'thumbnail': u'http://frame7.loadup.ru/af/3f/1390466.3.3.jpg',
+                u'description': u'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1, видео TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1',
+            },
+            u'params': {
+                u'videopassword': u'qwerty',
+            },
+        },
+        # age limit + video-password
+        {
+            u'url': u'http://smotri.com/video/view/?id=v15408898bcf',
+            u'file': u'v15408898bcf.flv',
+            u'md5': u'91e909c9f0521adf5ee86fbe073aad70',
+            u'info_dict': {
+                u'title': u'этот ролик не покажут по ТВ',
+                u'uploader': u'zzxxx',
+                u'uploader_id': u'ueggb',
+                u'upload_date': u'20101001',
+                u'thumbnail': u'http://frame3.loadup.ru/75/75/1540889.1.3.jpg',
+                u'age_limit': 18,
+                u'description': u'этот ролик не покажут по ТВ, видео этот ролик не покажут по ТВ',
+            },
+            u'params': {
+                u'videopassword': u'333'
+            }
+        }
+    ]
+
+    # Values of the 'status' field in the vt.php JSON response, as compared
+    # against in _real_extract below.
+    _SUCCESS = 0
+    _PASSWORD_NOT_VERIFIED = 1
+    _PASSWORD_DETECTED = 2
+    _VIDEO_NOT_FOUND = 3
+
+    def _search_meta(self, name, html, display_name=None):
+        """Return the content of the ``<meta itemprop="NAME">`` tag in html.
+
+        display_name is the label passed to the search helper and defaults
+        to name.  The search is non-fatal (fatal=False), so callers must
+        cope with a missing value.
+        """
+        if display_name is None:
+            display_name = name
+        # The original had a second, unreachable return after this one; it
+        # has been dropped because it could never execute.
+        return self._html_search_regex(
+            r'<meta itemprop="%s" content="([^"]+)" />' % re.escape(name),
+            html, display_name, fatal=False)
+
+    def _real_extract(self, url):
+        """Extract the info dict for the smotri.com video at url.
+
+        Raises ExtractorError when the video does not exist, when a required
+        password is missing or wrong, or when the JSON endpoint reports an
+        unknown status.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('videoid')
+        real_video_id = mobj.group('realvideoid')
+
+        # Download video JSON data
+        video_json_url = 'http://smotri.com/vt.php?id=%s' % real_video_id
+        video_json_page = self._download_webpage(video_json_url, video_id, u'Downloading video JSON')
+        video_json = json.loads(video_json_page)
+
+        status = video_json['status']
+        if status == self._VIDEO_NOT_FOUND:
+            raise ExtractorError(u'Video %s does not exist' % video_id, expected=True)
+        elif status == self._PASSWORD_DETECTED:
+            # The video is protected by a password, retry with
+            # video-password set
+            video_password = self._downloader.params.get('videopassword', None)
+            if not video_password:
+                raise ExtractorError(u'This video is protected by a password, use the --video-password option', expected=True)
+            video_json_url += '&md5pass=%s' % hashlib.md5(video_password.encode('utf-8')).hexdigest()
+            video_json_page = self._download_webpage(video_json_url, video_id, u'Downloading video JSON (video-password set)')
+            video_json = json.loads(video_json_page)
+            status = video_json['status']
+            if status == self._PASSWORD_NOT_VERIFIED:
+                raise ExtractorError(u'Video password is invalid', expected=True)
+
+        if status != self._SUCCESS:
+            raise ExtractorError(u'Unexpected status value %s' % status)
+
+        # Extract the URL of the video
+        video_url = video_json['file_data']
+
+        # Video JSON does not provide enough meta data
+        # We will extract some from the video web page instead
+        video_page_url = 'http://' + mobj.group('url')
+        video_page = self._download_webpage(video_page_url, video_id, u'Downloading video page')
+
+        # Adult content
+        if re.search(u'EroConfirmText">', video_page) is not None:
+            self.report_age_confirmation()
+            confirm_string = self._html_search_regex(
+                r'<a href="/video/view/\?id=%s&confirm=([^"]+)" title="[^"]+">' % video_id,
+                video_page, u'confirm string')
+            confirm_url = video_page_url + '&confirm=%s' % confirm_string
+            video_page = self._download_webpage(confirm_url, video_id, u'Downloading video page (age confirmed)')
+            adult_content = True
+        else:
+            adult_content = False
+
+        # Extract the rest of meta data
+        video_title = self._search_meta(u'name', video_page, u'title')
+        if not video_title:
+            # Fall back to the last path component of the media URL
+            video_title = video_url.rsplit('/', 1)[-1]
+
+        video_description = self._search_meta(u'description', video_page)
+        # The meta search is non-fatal, so only trim the boilerplate when a
+        # description was actually found.
+        if video_description:
+            END_TEXT = u' на сайте Smotri.com'
+            if video_description.endswith(END_TEXT):
+                video_description = video_description[:-len(END_TEXT)]
+            START_TEXT = u'Смотреть онлайн ролик '
+            if video_description.startswith(START_TEXT):
+                video_description = video_description[len(START_TEXT):]
+        video_thumbnail = self._search_meta(u'thumbnail', video_page)
+
+        upload_date_str = self._search_meta(u'uploadDate', video_page, u'upload date')
+        # Guard against a missing tag: re.search() rejects None
+        upload_date_m = (
+            re.search(r'(?P<year>\d{4})\.(?P<month>\d{2})\.(?P<day>\d{2})T', upload_date_str)
+            if upload_date_str else None
+        )
+        video_upload_date = (
+            (
+                upload_date_m.group('year') +
+                upload_date_m.group('month') +
+                upload_date_m.group('day')
+            )
+            if upload_date_m else None
+        )
+
+        duration_str = self._search_meta(u'duration', video_page)
+        duration_m = (
+            re.search(r'T(?P<hours>[0-9]{2})H(?P<minutes>[0-9]{2})M(?P<seconds>[0-9]{2})S', duration_str)
+            if duration_str else None
+        )
+        # Duration in seconds, or None when the tag is absent or malformed
+        video_duration = (
+            (
+                (int(duration_m.group('hours')) * 60 * 60) +
+                (int(duration_m.group('minutes')) * 60) +
+                int(duration_m.group('seconds'))
+            )
+            if duration_m else None
+        )
+
+        video_uploader = self._html_search_regex(
+            u'<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info[^"]+">(.*?)</a>',
+            video_page, u'uploader', fatal=False, flags=re.MULTILINE|re.DOTALL)
+
+        video_uploader_id = self._html_search_regex(
+            u'<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info\\(.*?\'([^\']+)\'\\);">',
+            video_page, u'uploader id', fatal=False, flags=re.MULTILINE|re.DOTALL)
+
+        video_view_count = self._html_search_regex(
+            u'Общее количество просмотров.*?<span class="Number">(\\d+)</span>',
+            video_page, u'view count', fatal=False, flags=re.MULTILINE|re.DOTALL)
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': video_title,
+            'thumbnail': video_thumbnail,
+            'description': video_description,
+            'uploader': video_uploader,
+            'upload_date': video_upload_date,
+            'uploader_id': video_uploader_id,
+            # 'duration' is the key the other extractors use; the original
+            # 'video_duration' key did not match them.
+            'duration': video_duration,
+            'view_count': video_view_count,
+            'age_limit': 18 if adult_content else 0,
+            'video_page_url': video_page_url
+        }
+
+
+class SmotriCommunityIE(InfoExtractor):
+    """Playlist extractor for the videos of a smotri.com community.
+
+    The entries are read from the community's RSS export; each item link is
+    delegated to the 'Smotri' extractor.
+    """
+    IE_DESC = u'Smotri.com community videos'
+    IE_NAME = u'smotri:community'
+    _VALID_URL = r'^https?://(?:www\.)?smotri\.com/community/video/(?P<communityid>[0-9A-Za-z_\'-]+)'
+
+    def _real_extract(self, url):
+        """Return a playlist result built from the community RSS feed."""
+        community_id = re.match(self._VALID_URL, url).group('communityid')
+
+        feed_url = 'http://smotri.com/export/rss/video/by/community/-/%s/video.xml' % community_id
+        feed = self._download_xml(feed_url, community_id, u'Downloading community RSS')
+
+        # One playlist entry per <item><link> element of the feed
+        entries = []
+        for link_node in feed.findall('./channel/item/link'):
+            entries.append(self.url_result(link_node.text, 'Smotri'))
+
+        # The playlist title is the quoted name in the channel description
+        channel_description = feed.find('./channel/description').text
+        community_title = self._html_search_regex(
+            u'^Видео сообщества "([^"]+)"$', channel_description, u'community title')
+
+        return self.playlist_result(entries, community_id, community_title)
+
+
+class SmotriUserIE(InfoExtractor):
+    """Playlist extractor for the videos uploaded by a smotri.com user.
+
+    The entries are read from the user's RSS export; each item link is
+    delegated to the 'Smotri' extractor.
+    """
+    IE_DESC = u'Smotri.com user videos'
+    IE_NAME = u'smotri:user'
+    _VALID_URL = r'^https?://(?:www\.)?smotri\.com/user/(?P<userid>[0-9A-Za-z_\'-]+)'
+
+    def _real_extract(self, url):
+        """Return a playlist result built from the user's RSS feed."""
+        user_id = re.match(self._VALID_URL, url).group('userid')
+
+        feed_url = 'http://smotri.com/export/rss/user/video/-/%s/video.xml' % user_id
+        feed = self._download_xml(feed_url, user_id, u'Downloading user RSS')
+
+        # One playlist entry per <item><link> element of the feed
+        entries = []
+        for link_node in feed.findall('./channel/item/link'):
+            entries.append(self.url_result(link_node.text, 'Smotri'))
+
+        # The nickname follows a fixed prefix in the channel description
+        channel_description = feed.find('./channel/description').text
+        user_nickname = self._html_search_regex(
+            u'^Видео режиссера (.*)$', channel_description,
+            u'user nickname')
+
+        return self.playlist_result(entries, user_id, user_nickname)
]
_CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28'
+ _IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf'
@classmethod
def suitable(cls, url):
def _extract_info_dict(self, info, full_title=None, quiet=False):
track_id = compat_str(info['id'])
name = full_title or track_id
- if quiet == False:
+ if quiet:
self.report_extraction(name)
thumbnail = info['artwork_url']
if thumbnail is not None:
thumbnail = thumbnail.replace('-large', '-t500x500')
+ ext = info.get('original_format', u'mp3')
result = {
- 'id': track_id,
- 'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID,
+ 'id': track_id,
'uploader': info['user']['username'],
'upload_date': unified_strdate(info['created_at']),
- 'title': info['title'],
- 'ext': u'mp3',
+ 'title': info['title'],
'description': info['description'],
'thumbnail': thumbnail,
}
if info.get('downloadable', False):
- result['url'] = 'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format(track_id, self._CLIENT_ID)
- if not info.get('streamable', False):
- # We have to get the rtmp url
+ # We can build a direct link to the song
+ format_url = (
+ u'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format(
+ track_id, self._CLIENT_ID))
+ result['formats'] = [{
+ 'format_id': 'download',
+ 'ext': ext,
+ 'url': format_url,
+ 'vcodec': 'none',
+ }]
+ else:
+ # We have to retrieve the url
stream_json = self._download_webpage(
- 'http://api.soundcloud.com/i1/tracks/{0}/streams?client_id={1}'.format(track_id, self._CLIENT_ID),
+ 'http://api.soundcloud.com/i1/tracks/{0}/streams?client_id={1}'.format(track_id, self._IPHONE_CLIENT_ID),
track_id, u'Downloading track url')
- rtmp_url = json.loads(stream_json)['rtmp_mp3_128_url']
- # The url doesn't have an rtmp app, we have to extract the playpath
- url, path = rtmp_url.split('mp3:', 1)
- result.update({
- 'url': url,
- 'play_path': 'mp3:' + path,
- })
+
+ formats = []
+ format_dict = json.loads(stream_json)
+ for key, stream_url in format_dict.items():
+ if key.startswith(u'http'):
+ formats.append({
+ 'format_id': key,
+ 'ext': ext,
+ 'url': stream_url,
+ 'vcodec': 'none',
+ })
+ elif key.startswith(u'rtmp'):
+ # The url doesn't have an rtmp app, we have to extract the playpath
+ url, path = stream_url.split('mp3:', 1)
+ formats.append({
+ 'format_id': key,
+ 'url': url,
+ 'play_path': 'mp3:' + path,
+ 'ext': ext,
+ 'vcodec': 'none',
+ })
+
+ if not formats:
+ # We fallback to the stream_url in the original info, this
+ # cannot be always used, sometimes it can give an HTTP 404 error
+ formats.append({
+ 'format_id': u'fallback',
+ 'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID,
+ 'ext': ext,
+ 'vcodec': 'none',
+ })
+
+ def format_pref(f):
+ if f['format_id'].startswith('http'):
+ return 2
+ if f['format_id'].startswith('rtmp'):
+ return 1
+ return 0
+
+ formats.sort(key=format_pref)
+ result['formats'] = formats
+
return result
def _real_extract(self, url):
resolv_url = self._resolv_url(url)
info_json = self._download_webpage(resolv_url, full_title)
- videos = []
info = json.loads(info_json)
if 'errors' in info:
for err in info['errors']:
import re
-from .mtv import MTVIE, _media_xml_tag
+from .mtv import MTVServicesInfoExtractor
-class SouthParkStudiosIE(MTVIE):
+class SouthParkStudiosIE(MTVServicesInfoExtractor):
IE_NAME = u'southparkstudios.com'
- _VALID_URL = r'https?://www\.southparkstudios\.com/(clips|full-episodes)/(?P<id>.+?)(\?|#|$)'
+ _VALID_URL = r'(https?://)?(www\.)?(?P<url>southparkstudios\.com/(clips|full-episodes)/(?P<id>.+?)(\?|#|$))'
_FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss'
- _TEST = {
+ _TESTS = [{
u'url': u'http://www.southparkstudios.com/clips/104437/bat-daded#tab=featured',
u'file': u'a7bff6c2-ed00-11e0-aca6-0026b9414f30.mp4',
u'info_dict': {
u'title': u'Bat Daded',
u'description': u'Randy disqualifies South Park by getting into a fight with Bat Dad.',
},
- }
-
- # Overwrite MTVIE properties we don't want
- _TESTS = []
-
- def _get_thumbnail_url(self, uri, itemdoc):
- search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail'))
- thumb_node = itemdoc.find(search_path)
- if thumb_node is None:
- return None
- else:
- return thumb_node.attrib['url']
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
+ url = u'http://www.' + mobj.group(u'url')
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
mgid = self._search_regex(r'swfobject.embedSWF\(".*?(mgid:.*?)"',
webpage, u'mgid')
return self._get_videos_info(mgid)
+
+# Extractor for southpark.de. It inherits all behaviour from
+# SouthParkStudiosIE and only overrides the URL pattern, the feed URL and
+# the test case.
+class SouthparkDeIE(SouthParkStudiosIE):
+ IE_NAME = u'southpark.de'
+ # Must keep the named groups 'url' and 'id': the inherited _real_extract
+ # reads both from the match object.
+ _VALID_URL = r'(https?://)?(www\.)?(?P<url>southpark\.de/(clips|alle-episoden)/(?P<id>.+?)(\?|#|$))'
+ # southpark.de counterpart of the parent's _FEED_URL
+ _FEED_URL = 'http://www.southpark.de/feeds/video-player/mrss/'
+
+ _TESTS = [{
+ u'url': u'http://www.southpark.de/clips/uygssh/the-government-wont-respect-my-privacy#tab=featured',
+ u'file': u'85487c96-b3b9-4e39-9127-ad88583d9bf2.mp4',
+ u'info_dict': {
+ u'title': u'The Government Won\'t Respect My Privacy',
+ u'description': u'Cartman explains the benefits of "Shitter" to Stan, Kyle and Craig.',
+ },
+ }]
compat_urllib_parse_urlparse,
compat_urllib_request,
compat_urllib_parse,
- unescapeHTML,
)
from ..aes import (
aes_decrypt_text
webpage = self._download_webpage(req, video_id)
video_title = self._html_search_regex(r'<h1>([^<]+)', webpage, u'title')
- video_uploader = self._html_search_regex(r'by:\s*<a [^>]*>(.+?)</a>', webpage, u'uploader', fatal=False)
- thumbnail = self._html_search_regex(r'flashvars\.image_url = "([^"]+)', webpage, u'thumbnail', fatal=False)
- description = self._html_search_regex(r'>\s*Description:</div>\s*<[^>]*>([^<]+)', webpage, u'description', fatal=False)
- if len(description) == 0:
- description = None
+ video_uploader = self._html_search_regex(
+ r'by:\s*<a [^>]*>(.+?)</a>', webpage, u'uploader', fatal=False)
+ thumbnail = self._html_search_regex(
+ r'flashvars\.image_url = "([^"]+)', webpage, u'thumbnail', fatal=False)
+ description = self._html_search_regex(
+ r'<div\s+id="descriptionContent">([^<]+)<', webpage, u'description', fatal=False)
video_urls = list(map(compat_urllib_parse.unquote , re.findall(r'flashvars\.quality_[0-9]{3}p = "([^"]+)', webpage)))
if webpage.find('flashvars\.encrypted = "true"') != -1:
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
class SpiegelIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
- _TEST = {
+ _TESTS = [{
u'url': u'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html',
u'file': u'1259285.mp4',
u'md5': u'2c2754212136f35fb4b19767d242f66e',
u'info_dict': {
u"title": u"Vulkanausbruch in Ecuador: Der \"Feuerschlund\" ist wieder aktiv"
}
- }
+ },
+ {
+ u'url': u'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html',
+ u'file': u'1309159.mp4',
+ u'md5': u'f2cdf638d7aa47654e251e1aee360af1',
+ u'info_dict': {
+ u'title': u'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers'
+ }
+ }]
def _real_extract(self, url):
m = re.match(self._VALID_URL, url)
webpage = self._download_webpage(url, video_id)
- video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
- webpage, u'title')
+ video_title = self._html_search_regex(
+ r'<div class="module-title">(.*?)</div>', webpage, u'title')
xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
- xml_code = self._download_webpage(xml_url, video_id,
- note=u'Downloading XML', errnote=u'Failed to download XML')
-
- idoc = xml.etree.ElementTree.fromstring(xml_code)
- last_type = idoc[-1]
- filename = last_type.findall('./filename')[0].text
- duration = float(last_type.findall('./duration')[0].text)
+ idoc = self._download_xml(
+ xml_url, video_id,
+ note=u'Downloading XML', errnote=u'Failed to download XML')
+
+ formats = [
+ {
+ 'format_id': n.tag.rpartition('type')[2],
+ 'url': u'http://video2.spiegel.de/flash/' + n.find('./filename').text,
+ 'width': int(n.find('./width').text),
+ 'height': int(n.find('./height').text),
+ 'abr': int(n.find('./audiobitrate').text),
+ 'vbr': int(n.find('./videobitrate').text),
+ 'vcodec': n.find('./codec').text,
+ 'acodec': 'MP4A',
+ }
+ for n in list(idoc)
+ # Blacklist type 6, it's extremely LQ and not available on the same server
+ if n.tag.startswith('type') and n.tag != 'type6'
+ ]
+ formats.sort(key=lambda f: f['vbr'])
+ duration = float(idoc[0].findall('./duration')[0].text)
- video_url = 'http://video2.spiegel.de/flash/' + filename
- video_ext = filename.rpartition('.')[2]
info = {
'id': video_id,
- 'url': video_url,
- 'ext': video_ext,
'title': video_title,
'duration': duration,
+ 'formats': formats,
}
- return [info]
+ return info
--- /dev/null
+# coding: utf-8
+import re
+import time
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_urllib_parse,
+ compat_urllib_request,
+)
+
+
+class StreamcloudIE(InfoExtractor):
+    """Information extractor for videos hosted on streamcloud.eu.
+
+    The landing page contains a form whose hidden/submit inputs are posted
+    back to the same URL after a waiting period; the response page then
+    carries the media URL and metadata.
+    """
+    IE_NAME = u'streamcloud.eu'
+    _VALID_URL = r'https?://streamcloud\.eu/(?P<id>[a-zA-Z0-9_-]+)/(?P<fname>[^#?]*)\.html'
+
+    _TEST = {
+        u'url': u'http://streamcloud.eu/skp9j99s4bpz/youtube-dl_test_video_____________-BaW_jenozKc.mp4.html',
+        u'file': u'skp9j99s4bpz.mp4',
+        u'md5': u'6bea4c7fa5daaacc2a946b7146286686',
+        u'info_dict': {
+            u'title': u'youtube-dl test video \'/\\ ä ↭',
+            u'duration': 9,
+        },
+        u'skip': u'Only available from the EU'
+    }
+
+    def _real_extract(self, url):
+        """Return the info dict for the streamcloud.eu video at url."""
+        video_id = re.match(self._VALID_URL, url).group('id')
+
+        landing_page = self._download_webpage(url, video_id)
+
+        # Collect the (name, value) pairs of the form that has to be
+        # re-submitted to reach the real video page.
+        form_inputs = re.findall(r'''(?x)<input\s+
+            type="(?:hidden|submit)"\s+
+            name="([^"]+)"\s+
+            (?:id="[^"]+"\s+)?
+            value="([^"]*)"
+            ''', landing_page)
+        form_data = compat_urllib_parse.urlencode(form_inputs)
+
+        self.to_screen('%s: Waiting for timeout' % video_id)
+        time.sleep(12)
+        request = compat_urllib_request.Request(
+            url, form_data,
+            {b'Content-Type': b'application/x-www-form-urlencoded'})
+
+        webpage = self._download_webpage(
+            request, video_id, note=u'Downloading video page ...')
+        title = self._html_search_regex(
+            r'<h1[^>]*>([^<]+)<', webpage, u'title')
+        video_url = self._search_regex(
+            r'file:\s*"([^"]+)"', webpage, u'video URL')
+
+        # The duration search is non-fatal; keep None when it is missing
+        duration_str = self._search_regex(
+            r'duration:\s*"?([0-9]+)"?', webpage, u'duration', fatal=False)
+        if duration_str is None:
+            duration = None
+        else:
+            duration = int(duration_str)
+
+        thumbnail = self._search_regex(
+            r'image:\s*"([^"]+)"', webpage, u'thumbnail URL', fatal=False)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': video_url,
+            'duration': duration,
+            'thumbnail': thumbnail,
+        }
u'info_dict': {
u"title": u"Cserkészek népszerűsítették a környezettudatos életmódot a Savaria téren",
u"description": u'A zöld nap játékos ismeretterjesztő programjait a Magyar Cserkész Szövetség szervezte, akik az ország nyolc városában adják át tudásukat az érdeklődőknek. A PET...',
- }
+ },
+ u'skip': u'Service temporarily disabled as of 2013-11-20'
}
def _real_extract(self, url):
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
self.report_extraction(video_id)
data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
- data_xml = self._download_webpage(data_url, video_id, 'Downloading data webpage')
- data = xml.etree.ElementTree.fromstring(data_xml.encode('utf-8'))
+ data = self._download_xml(data_url, video_id, 'Downloading data webpage')
qualities = ['500k', '480p', '1000k', '720p', '1080p']
return -1
formats.sort(key=sort_key)
if not formats:
- raise RegexNotFoundError(u'Unable to extract video URL')
+ raise ExtractorError(u'Unable to extract video URL')
return {
'id': video_id,
from .subtitles import SubtitlesInfoExtractor
from ..utils import (
- compat_str,
RegexNotFoundError,
)
self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
return [self._playlist_videos_info(url,name,playlist_id)]
- def _playlist_videos_info(self,url,name,playlist_id=0):
+
+ def _playlist_videos_info(self, url, name, playlist_id):
'''Returns the videos of the playlist'''
- video_RE=r'''
- <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
- ([.\s]*?)data-playlist_item_id="(\d+)"
- ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
- '''
- video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
- webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
- m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
- m_names=re.finditer(video_name_RE,webpage)
+
+ webpage = self._download_webpage(
+ url, playlist_id, u'Downloading playlist webpage')
+ matches = re.finditer(
+ r'<p\s+class="talk-title[^"]*"><a\s+href="(?P<talk_url>/talks/[^"]+\.html)">[^<]*</a></p>',
+ webpage)
playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
webpage, 'playlist title')
- playlist_entries = []
- for m_video, m_name in zip(m_videos,m_names):
- talk_url='http://www.ted.com%s' % m_name.group('talk_url')
- playlist_entries.append(self.url_result(talk_url, 'TED'))
- return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
+ playlist_entries = [
+ self.url_result(u'http://www.ted.com' + m.group('talk_url'), 'TED')
+ for m in matches
+ ]
+ return self.playlist_result(
+ playlist_entries, playlist_id=playlist_id, playlist_title=playlist_title)
def _talk_info(self, url, video_id=0):
"""Return the video for the talk in the url"""
'ext': 'mp4',
'url': stream['file'],
'format': stream['id']
- } for stream in info['htmlStreams']]
+ } for stream in info['htmlStreams']]
video_id = info['id']
self._list_available_subtitles(video_id, webpage)
return
- info = {
+ return {
'id': video_id,
'title': title,
'thumbnail': thumbnail,
'formats': formats,
}
- # TODO: Remove when #980 has been merged
- info.update(info['formats'][-1])
-
- return info
-
def _get_available_subtitles(self, video_id, webpage):
try:
options = self._search_regex(r'(?:<select name="subtitles_language_select" id="subtitles_language_select">)(.*?)(?:</select>)', webpage, 'subtitles_language_select', flags=re.DOTALL)
url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
sub_lang_list[l] = url
return sub_lang_list
- except RegexNotFoundError as err:
+ except RegexNotFoundError:
self._downloader.report_warning(u'video doesn\'t have subtitles')
return {}
--- /dev/null
+# coding: utf-8
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ unified_strdate,
+)
+
+
+class TouTvIE(InfoExtractor):
+    """Information extractor for episodes on tou.tv.
+
+    The media ID is scraped from the episode page and resolved to a stream
+    URL through the theplatform.com ``content.select`` endpoint; the other
+    metadata comes from the page's meta tags.
+    """
+    IE_NAME = u'tou.tv'
+    _VALID_URL = r'https?://www\.tou\.tv/(?P<id>[a-zA-Z0-9_-]+(?:/(?P<episode>S[0-9]+E[0-9]+)))'
+
+    _TEST = {
+        u'url': u'http://www.tou.tv/30-vies/S04E41',
+        u'file': u'30-vies_S04E41.mp4',
+        u'info_dict': {
+            u'title': u'30 vies Saison 4 / Épisode 41',
+            u'description': u'md5:da363002db82ccbe4dafeb9cab039b09',
+            u'age_limit': 8,
+            u'uploader': u'Groupe des Nouveaux Médias',
+            u'duration': 1296,
+            u'upload_date': u'20131118',
+            u'thumbnail': u'http://static.tou.tv/medias/images/2013-11-18_19_00_00_30VIES_0341_01_L.jpeg',
+        },
+        u'params': {
+            u'skip_download': True,  # Requires rtmpdump
+        },
+        u'skip': 'Only available in Canada'
+    }
+
+    def _real_extract(self, url):
+        """Return the info dict for the tou.tv episode at url.
+
+        Raises ExtractorError when the stream list contains no usable URL
+        or when the returned stream is the "Unavailable" placeholder.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        webpage = self._download_webpage(url, video_id)
+
+        mediaId = self._search_regex(
+            r'"idMedia":\s*"([^"]+)"', webpage, u'media ID')
+
+        streams_url = u'http://release.theplatform.com/content.select?pid=' + mediaId
+        streams_doc = self._download_xml(
+            streams_url, video_id, note=u'Downloading stream list')
+
+        # Pick the first non-advertisement stream.  A default is passed to
+        # next() so an empty candidate list surfaces as an ExtractorError
+        # instead of a bare StopIteration; empty <url> elements are skipped
+        # because their text is None.
+        video_url = next(
+            (n.text
+             for n in streams_doc.findall('.//choice/url')
+             if n.text and u'//ad.doubleclick' not in n.text),
+            None)
+        if video_url is None:
+            raise ExtractorError(u'Unable to extract video URL')
+        if video_url.endswith('/Unavailable.flv'):
+            raise ExtractorError(
+                u'Access to this video is blocked from outside of Canada',
+                expected=True)
+
+        duration_str = self._html_search_meta(
+            'video:duration', webpage, u'duration')
+        duration = int(duration_str) if duration_str else None
+        upload_date_str = self._html_search_meta(
+            'video:release_date', webpage, u'upload date')
+        upload_date = unified_strdate(upload_date_str) if upload_date_str else None
+
+        return {
+            'id': video_id,
+            'title': self._og_search_title(webpage),
+            'url': video_url,
+            'description': self._og_search_description(webpage),
+            'uploader': self._dc_search_uploader(webpage),
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'age_limit': self._media_rating_search(webpage),
+            'duration': duration,
+            'upload_date': upload_date,
+            'ext': 'mp4',
+        }
import json
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
format_url = (u'http://fs%(server)s.trilulilu.ro/%(hash)s/'
u'video-formats2' % log)
- format_str = self._download_webpage(
+ format_doc = self._download_xml(
format_url, video_id,
note=u'Downloading formats',
errnote=u'Error while downloading formats')
-
- format_doc = xml.etree.ElementTree.fromstring(format_str)
video_url_template = (
u'http://fs%(server)s.trilulilu.ro/stream.php?type=video'
for fnode in format_doc.findall('./formats/format')
]
- info = {
+ return {
'_type': 'video',
'id': video_id,
'formats': formats,
'thumbnail': thumbnail,
}
- # TODO: Remove when #980 has been merged
- info.update(formats[-1])
-
- return info
from ..utils import (
compat_urllib_parse_urlparse,
compat_urllib_request,
- compat_urllib_parse,
- unescapeHTML,
)
from ..aes import (
aes_decrypt_text
--- /dev/null
+import json
+import re
+
+from .common import InfoExtractor
+
+
+class TvpIE(InfoExtractor):
+    """Information extractor for videos on tvp.pl.
+
+    The media URL is read from the ``videofileinfo`` JSON endpoint; title,
+    description and thumbnail come from the page's Open Graph tags.
+    """
+    IE_NAME = u'tvp.pl'
+    _VALID_URL = r'https?://www\.tvp\.pl/.*?wideo/(?P<date>\d+)/(?P<id>\d+)'
+
+    _TEST = {
+        u'url': u'http://www.tvp.pl/warszawa/magazyny/campusnews/wideo/31102013/12878238',
+        u'md5': u'148408967a6a468953c0a75cbdaf0d7a',
+        u'file': u'12878238.wmv',
+        u'info_dict': {
+            u'title': u'31.10.2013 - Odcinek 2',
+            u'description': u'31.10.2013 - Odcinek 2',
+        },
+        u'skip': u'Download has to use same server IP as extraction. Therefore, a good (load-balancing) DNS resolver will make the download fail.'
+    }
+
+    def _real_extract(self, url):
+        """Return the info dict for the tvp.pl video at url."""
+        video_id = re.match(self._VALID_URL, url).group('id')
+        webpage = self._download_webpage(url, video_id)
+
+        # Fetch and decode the JSON document describing the media file
+        metadata_page = self._download_webpage(
+            'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id,
+            video_id, u"Downloading video metadata")
+        metadata = json.loads(metadata_page)
+
+        self.report_extraction(video_id)
+        info = {
+            'id': video_id,
+            'ext': 'wmv',
+            'url': metadata['video_url'],
+        }
+        # The title is mandatory; description and thumbnail use the
+        # helpers' default behaviour.
+        info['title'] = self._og_search_title(webpage, fatal=True)
+        info['description'] = self._og_search_description(webpage)
+        info['thumbnail'] = self._og_search_thumbnail(webpage)
+        return info
continue
format_url = self._SMIL_BASE_URL + m.group('path')
- format_note = ('%(vcodec)s@%(vbr)4sk, %(acodec)s@%(abr)3sk' %
- m.groupdict())
formats.append({
'url': format_url,
'format_id': u'SMIL_' + m.group('cbr'),
- 'format_note': format_note,
+ 'vcodec': m.group('vcodec'),
+ 'acodec': m.group('acodec'),
+ 'vbr': int(m.group('vbr')),
+ 'abr': int(m.group('abr')),
'ext': m.group('ext'),
'width': int(m.group('width')),
'height': int(m.group('height')),
r"thumbnail\s*:\s*'([^']*)'",
webpage, u'thumbnail', fatal=False)
- info = {
+ return {
'_type': 'video',
'id': video_id,
'title': title,
'duration': duration,
'formats': formats,
}
-
- # TODO: Remove when #980 has been merged
- info['formats'][-1]['ext'] = determine_ext(info['formats'][-1]['url'])
- info.update(info['formats'][-1])
-
- return info
import re
-import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
- config_xml = self._download_webpage('http://sunshine.videofy.me/?videoId=%s' % video_id,
+ config = self._download_xml('http://sunshine.videofy.me/?videoId=%s' % video_id,
video_id)
- config = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8'))
video = config.find('video')
sources = video.find('sources')
url_node = next(node for node in [find_xpath_attr(sources, 'source', 'id', 'HQ %s' % key)
class VideoPremiumIE(InfoExtractor):
- _VALID_URL = r'(?:https?://)?(?:www\.)?videopremium\.tv/(?P<id>\w+)(?:/.*)?'
+ _VALID_URL = r'(?:https?://)?(?:www\.)?videopremium\.(?:tv|me)/(?P<id>\w+)(?:/.*)?'
_TEST = {
u'url': u'http://videopremium.tv/4w7oadjsf156',
u'file': u'4w7oadjsf156.f4v',
webpage_url = 'http://videopremium.tv/' + video_id
webpage = self._download_webpage(webpage_url, video_id)
- self.report_extraction(video_id)
+ if re.match(r"^<html><head><script[^>]*>window.location\s*=", webpage):
+ # Download again, we need a cookie
+ webpage = self._download_webpage(
+ webpage_url, video_id,
+ note=u'Downloading webpage again (with cookie)')
- video_title = self._html_search_regex(r'<h2(?:.*?)>\s*(.+?)\s*<',
- webpage, u'video title')
+ video_title = self._html_search_regex(
+ r'<h2(?:.*?)>\s*(.+?)\s*<', webpage, u'video title')
- return [{
+ return {
'id': video_id,
'url': "rtmp://e%d.md.iplay.md/play" % random.randint(1, 16),
'play_path': "mp4:%s.f4v" % video_id,
'player_url': "http://videopremium.tv/uplayer/uppod.swf",
'ext': 'f4v',
'title': video_title,
- }]
+ }
--- /dev/null
+import re
+
+from ..utils import (
+ ExtractorError,
+ unescapeHTML,
+ unified_strdate,
+)
+from .subtitles import SubtitlesInfoExtractor
+
+
+class VikiIE(SubtitlesInfoExtractor):
+    """Information extractor for videos (and their subtitles) on viki.com."""
+    IE_NAME = u'viki'
+
+    _VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P<id>[0-9]+v)'
+    _TEST = {
+        u'url': u'http://www.viki.com/videos/1023585v-heirs-episode-14',
+        u'file': u'1023585v.mp4',
+        u'md5': u'a21454021c2646f5433514177e2caa5f',
+        u'info_dict': {
+            u'title': u'Heirs Episode 14',
+            u'uploader': u'SBS',
+            u'description': u'md5:c4b17b9626dd4b143dcc4d855ba3474e',
+            u'upload_date': u'20131121',
+            u'age_limit': 13,
+        },
+        u'skip': u'Blocked in the US',
+    }
+
+    def _real_extract(self, url):
+        """Extract the metadata, media URL and subtitles for one video.
+
+        Downloads the video page for the Open Graph metadata, uploader and
+        rating, then the player fragment for the media URL, upload date
+        and subtitle tracks.
+
+        Returns the info dict, or None when the 'listsubtitles' downloader
+        parameter is set (the subtitles are only listed in that case).
+        Raises ExtractorError when the player fragment starts with a
+        "video-error" element.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+        title = self._og_search_title(webpage)
+        description = self._og_search_description(webpage)
+        thumbnail = self._og_search_thumbnail(webpage)
+
+        # The broadcast network is optional on the page.
+        uploader_m = re.search(
+            r'<strong>Broadcast Network: </strong>\s*([^<]*)<', webpage)
+        if uploader_m is None:
+            uploader = None
+        else:
+            uploader = uploader_m.group(1).strip()
+
+        # Map the textual rating to an age limit; ratings that are missing
+        # or not listed below yield None.
+        rating_str = self._html_search_regex(
+            r'<strong>Rating: </strong>\s*([^<]*)<', webpage,
+            u'rating information', default='').strip()
+        RATINGS = {
+            'G': 0,
+            'PG': 10,
+            'PG-13': 13,
+            'R': 16,
+            'NC': 18,
+        }
+        age_limit = RATINGS.get(rating_str)
+
+        info_url = 'http://www.viki.com/player5_fragment/%s?action=show&controller=videos' % video_id
+        info_webpage = self._download_webpage(
+            info_url, video_id, note=u'Downloading info page')
+        if re.match(r'\s*<div\s+class="video-error', info_webpage):
+            raise ExtractorError(
+                u'Video %s is blocked from your location.' % video_id,
+                expected=True)
+        video_url = self._html_search_regex(
+            r'<source[^>]+src="([^"]+)"', info_webpage, u'video URL')
+
+        # The upload date is optional metadata: search non-fatally so that
+        # a missing date does not abort the extraction and the None check
+        # below is actually reachable.
+        upload_date_str = self._html_search_regex(
+            r'"created_at":"([^"]+)"', info_webpage, u'upload date',
+            fatal=False)
+        upload_date = (
+            unified_strdate(upload_date_str)
+            if upload_date_str is not None
+            else None
+        )
+
+        # subtitles
+        video_subtitles = self.extract_subtitles(video_id, info_webpage)
+        if self._downloader.params.get('listsubtitles', False):
+            self._list_available_subtitles(video_id, info_webpage)
+            return
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': video_url,
+            'description': description,
+            'thumbnail': thumbnail,
+            'age_limit': age_limit,
+            'uploader': uploader,
+            'subtitles': video_subtitles,
+            'upload_date': upload_date,
+        }
+
+    def _get_available_subtitles(self, video_id, info_webpage):
+        """Return a dict mapping language code to subtitle (.vtt) URL.
+
+        The URLs are taken from the <track> elements of the player
+        fragment; tracks whose URL does not end in '/<lang>.vtt' are
+        ignored.
+        """
+        res = {}
+        for sturl_html in re.findall(r'<track src="([^"]+)"/>', info_webpage):
+            sturl = unescapeHTML(sturl_html)
+            m = re.search(r'/(?P<lang>[a-z]+)\.vtt', sturl)
+            if not m:
+                continue
+            res[m.group('lang')] = sturl
+        return res
config = json.loads(config_json)
except RegexNotFoundError:
# For pro videos or player.vimeo.com urls
- config = self._search_regex([r' = {config:({.+?}),assets:', r'c=({.+?);'],
+ config = self._search_regex([r' = {config:({.+?}),assets:', r'(?:c|b)=({.+?});'],
webpage, u'info section', flags=re.DOTALL)
config = json.loads(config)
except Exception as e:
IE_NAME = u'vimeo:channel'
_VALID_URL = r'(?:https?://)?vimeo.\com/channels/(?P<id>[^/]+)'
_MORE_PAGES_INDICATOR = r'<a.+?rel="next"'
+ _TITLE_RE = r'<link rel="alternate"[^>]+?title="(.*?)"'
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- channel_id = mobj.group('id')
+ def _extract_videos(self, list_id, base_url):
video_ids = []
-
for pagenum in itertools.count(1):
- webpage = self._download_webpage('http://vimeo.com/channels/%s/videos/page:%d' % (channel_id, pagenum),
- channel_id, u'Downloading page %s' % pagenum)
+ webpage = self._download_webpage(
+ '%s/videos/page:%d/' % (base_url, pagenum),list_id,
+ u'Downloading page %s' % pagenum)
video_ids.extend(re.findall(r'id="clip_(\d+?)"', webpage))
if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:
break
entries = [self.url_result('http://vimeo.com/%s' % video_id, 'Vimeo')
for video_id in video_ids]
- channel_title = self._html_search_regex(r'<a href="/channels/%s">(.*?)</a>' % channel_id,
- webpage, u'channel title')
+ list_title = self._html_search_regex(self._TITLE_RE, webpage,
+ u'list title')
return {'_type': 'playlist',
- 'id': channel_id,
- 'title': channel_title,
+ 'id': list_id,
+ 'title': list_title,
'entries': entries,
}
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ channel_id = mobj.group('id')
+ return self._extract_videos(channel_id, 'http://vimeo.com/channels/%s' % channel_id)
+
+
+class VimeoUserIE(VimeoChannelIE):
+    """Playlist extractor for the videos listed on a Vimeo user page.
+
+    Calls ``_extract_videos`` with the user's base URL and uses a
+    user-specific title pattern.
+    """
+    IE_NAME = u'vimeo:user'
+    # The backslash belongs before the dot: 'vimeo.\com' left the dot
+    # unescaped (matching any character) and contained the invalid
+    # escape '\c'.
+    _VALID_URL = r'(?:https?://)?vimeo\.com/(?P<name>[^/]+)'
+    _TITLE_RE = r'<a[^>]+?class="user">([^<>]+?)</a>'
+
+    @classmethod
+    def suitable(cls, url):
+        """Return True only for URLs not claimed by the channel or video IEs."""
+        # This pattern is very broad, so defer to the more specific
+        # extractors first.
+        if VimeoChannelIE.suitable(url) or VimeoIE.suitable(url):
+            return False
+        return super(VimeoUserIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        """Return the playlist of videos for the user named in the URL."""
+        mobj = re.match(self._VALID_URL, url)
+        name = mobj.group('name')
+        return self._extract_videos(name, 'http://vimeo.com/%s' % name)
video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
webpage, u'video URL')
- uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
+ uploader = self._html_search_regex(r'<p class="username">(.*?)</p>',
webpage, u'uploader', fatal=False, flags=re.DOTALL)
return [{
{
u'url': u'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd',
u'file': u'2221348.flv',
- u'md5': u'e767b9475de189320f691f49c679c4c7',
+ u'md5': u'970a94178ca4118c5aa3aaea21211b81',
u'info_dict': {
u"upload_date": u"20130914",
u"uploader_id": u"jojo747400",
from ..utils import (
compat_urllib_parse_urlparse,
compat_urllib_request,
- compat_urllib_parse,
)
class XTubeIE(InfoExtractor):
_TESTS = [
{
u'url': u'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
- u'file': u'214727115.flv',
+ u'file': u'214727115.mp4',
+ u'md5': u'4962b075c08be8690a922ee026d05e69',
u'info_dict': {
u'title': u'Julian Smith & Travis Legg Watch Julian Smith',
u'description': u'Julian and Travis watch Julian Smith',
},
- u'params': {
- # Requires rtmpdump
- u'skip_download': True,
- },
},
{
u'url': u'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html',
- u'file': u'103000935.flv',
+ u'file': u'103000935.mp4',
+ u'md5': u'd6e6fc6e1313c608f316ddad7b82b306',
u'info_dict': {
u'title': u'Codefellas - The Cougar Lies with Spanish Moss',
u'description': u'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?',
},
- u'params': {
- # Requires rtmpdump
- u'skip_download': True,
- },
},
]
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
- items_json = self._search_regex(r'YVIDEO_INIT_ITEMS = ({.*?});$',
+ items_json = self._search_regex(r'mediaItems: ({.*?})$',
webpage, u'items', flags=re.MULTILINE)
items = json.loads(items_json)
info = items['mediaItems']['query']['results']['mediaObj'][0]
# The 'meta' field is not always in the video webpage, we request it
# from another page
long_id = info['id']
+ return self._get_info(info['id'], video_id)
+
+ def _get_info(self, long_id, video_id):
query = ('SELECT * FROM yahoo.media.video.streams WHERE id="%s"'
- ' AND plrs="86Gj0vCaSzV_Iuf6hNylf2"' % long_id)
+ ' AND plrs="86Gj0vCaSzV_Iuf6hNylf2" AND region="US"'
+ ' AND protocol="http"' % long_id)
data = compat_urllib_parse.urlencode({
'q': query,
'env': 'prod',
formats.append(format_info)
formats = sorted(formats, key=lambda f:(f['height'], f['width']))
- info = {
+ return {
'id': video_id,
'title': meta['title'],
'formats': formats,
'description': clean_html(meta['description']),
'thumbnail': meta['thumbnail'],
}
- # TODO: Remove when #980 has been merged
- info.update(formats[-1])
- return info
+
+class YahooNewsIE(YahooIE):
+    """Extractor for video pages on news.yahoo.com.
+
+    Reads the long content id from the page and passes it, together with
+    the numeric id from the URL, to ``_get_info``.
+    """
+    IE_NAME = 'yahoo:news'
+    # Require at least one digit so the captured video id is never empty.
+    _VALID_URL = r'http://news\.yahoo\.com/video/.*?-(?P<id>\d+)\.html'
+
+    _TEST = {
+        u'url': u'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html',
+        u'md5': u'67010fdf3a08d290e060a4dd96baa07b',
+        u'info_dict': {
+            u'id': u'104538833',
+            u'ext': u'mp4',
+            u'title': u'China Moses Is Crazy About the Blues',
+            u'description': u'md5:9900ab8cd5808175c7b3fe55b979bed0',
+        },
+    }
+
+    # Override YahooIE properties we don't want
+    _TESTS = []
+
+    def _real_extract(self, url):
+        """Return the info dict for the news video at ``url``."""
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        webpage = self._download_webpage(url, video_id)
+        long_id = self._search_regex(r'contentId: \'(.+?)\',', webpage, u'long id')
+        return self._get_info(long_id, video_id)
class YahooSearchIE(SearchInfoExtractor):
import string
import struct
import traceback
-import xml.etree.ElementTree
import zlib
from .common import InfoExtractor, SearchInfoExtractor
clean_html,
get_cachedir,
get_element_by_id,
+ get_element_by_attribute,
ExtractorError,
unescapeHTML,
unified_strdate,
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
IE_DESC = u'YouTube.com'
- _VALID_URL = r"""^
+ _VALID_URL = r"""(?x)^
(
- (?:https?://)? # http(s):// (optional)
- (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
+ (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
+ (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
tube\.majestyc\.net/|
youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
(?:.*?\#/)? # handle anchor (#/) redirect urls
'248': 'webm',
}
_video_dimensions = {
- '5': '240x400',
+ '5': '400x240',
'6': '???',
'13': '???',
- '17': '144x176',
- '18': '360x640',
- '22': '720x1280',
- '34': '360x640',
- '35': '480x854',
- '36': '240x320',
- '37': '1080x1920',
- '38': '3072x4096',
- '43': '360x640',
- '44': '480x854',
- '45': '720x1280',
- '46': '1080x1920',
+ '17': '176x144',
+ '18': '640x360',
+ '22': '1280x720',
+ '34': '640x360',
+ '35': '854x480',
+ '36': '320x240',
+ '37': '1920x1080',
+ '38': '4096x3072',
+ '43': '640x360',
+ '44': '854x480',
+ '45': '1280x720',
+ '46': '1920x1080',
'82': '360p',
'83': '480p',
'84': '720p',
u"uploader": u"Philipp Hagemeister",
u"uploader_id": u"phihag",
u"upload_date": u"20121002",
- u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
+ u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
}
},
{
u"uploader_id": u"justintimberlakeVEVO"
}
},
+ {
+ u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
+ u"file": u"yZIXLfi8CZQ.mp4",
+ u"note": u"Embed-only video (#1746)",
+ u"info_dict": {
+ u"upload_date": u"20120608",
+ u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
+ u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
+ u"uploader": u"SET India",
+ u"uploader_id": u"setindia"
+ }
+ },
]
def suitable(cls, url):
"""Receives a URL and returns True if suitable for this IE."""
if YoutubePlaylistIE.suitable(url): return False
- return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
+ return re.match(cls._VALID_URL, url) is not None
def __init__(self, *args, **kwargs):
super(YoutubeIE, self).__init__(*args, **kwargs)
"""Turn the encrypted s field into a working signature"""
if player_url is not None:
+ if player_url.startswith(u'//'):
+ player_url = u'https:' + player_url
try:
player_id = (player_url, len(s))
if player_id not in self._player_cache:
params = compat_urllib_parse.urlencode({
'lang': lang,
'v': video_id,
- 'fmt': self._downloader.params.get('subtitlesformat'),
+ 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
'name': l[0].encode('utf-8'),
})
url = u'http://www.youtube.com/api/timedtext?' + params
def _get_available_automatic_caption(self, video_id, webpage):
"""We need the webpage for getting the captions url, pass it as an
argument to speed up the process."""
- sub_format = self._downloader.params.get('subtitlesformat')
+ sub_format = self._downloader.params.get('subtitlesformat', 'srt')
self.to_screen(u'%s: Looking for automatic captions' % video_id)
mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
err_msg = u'Couldn\'t find automatic captions for %s' % video_id
'asrs': 1,
})
list_url = caption_url + '&' + list_params
- list_page = self._download_webpage(list_url, video_id)
- caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
+ caption_list = self._download_xml(list_url, video_id)
original_lang_node = caption_list.find('track')
if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
self._downloader.report_warning(u'Video doesn\'t have automatic captions')
# We simulate the access to the video from www.youtube.com/v/{video_id}
# this can be viewed without login into Youtube
data = compat_urllib_parse.urlencode({'video_id': video_id,
- 'el': 'embedded',
+ 'el': 'player_embedded',
'gl': 'US',
'hl': 'en',
'eurl': 'https://youtube.googleapis.com/v/' + video_id,
else:
raise ExtractorError(u'"token" parameter not in video info for unknown reason')
+ if 'view_count' in video_info:
+ view_count = int(video_info['view_count'][0])
+ else:
+ view_count = None
+
# Check for "rental" videos
if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
raise ExtractorError(u'"rental" videos not supported')
# description
video_description = get_element_by_id("eow-description", video_webpage)
if video_description:
+ video_description = re.sub(r'''(?x)
+ <a\s+
+ (?:[a-zA-Z-]+="[^"]+"\s+)*?
+ title="([^"]+)"\s+
+ (?:[a-zA-Z-]+="[^"]+"\s+)*?
+ class="yt-uix-redirect-link"\s*>
+ [^<]+
+ </a>
+ ''', r'\1', video_description)
video_description = clean_html(video_description)
else:
fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
'age_limit': 18 if age_gate else 0,
'annotations': video_annotations,
'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
+ 'view_count': view_count,
})
return results
-class YoutubePlaylistIE(InfoExtractor):
+class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
IE_DESC = u'YouTube.com playlists'
_VALID_URL = r"""(?:
(?:https?://)?
|
((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
)"""
- _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
- _MAX_RESULTS = 50
+ _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
+ _MORE_PAGES_INDICATOR = r'data-link-type="next"'
+ _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)'
IE_NAME = u'youtube:playlist'
@classmethod
"""Receives a URL and returns True if suitable for this IE."""
return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
+ def _real_initialize(self):
+ self._login()
+
+ def _ids_to_results(self, ids):
+ return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
+ for vid_id in ids]
+
+ def _extract_mix(self, playlist_id):
+ # The mixes are generated from a single video
+ # the id of the playlist is just 'RD' + video_id
+ url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[2:], playlist_id)
+ webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
+ title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
+ get_element_by_attribute('class', 'title ', webpage))
+ title = clean_html(title_span)
+ video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s' % re.escape(playlist_id)
+ ids = orderedSet(re.findall(video_re, webpage))
+ url_results = self._ids_to_results(ids)
+
+ return self.playlist_result(url_results, playlist_id, title)
+
def _real_extract(self, url):
# Extract playlist id
mobj = re.match(self._VALID_URL, url, re.VERBOSE)
video_id = query_dict['v'][0]
if self._downloader.params.get('noplaylist'):
self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
- return self.url_result('https://www.youtube.com/watch?v=' + video_id, 'Youtube')
+ return self.url_result(video_id, 'Youtube', video_id=video_id)
else:
self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))
- # Download playlist videos from API
- videos = []
+ if len(playlist_id) == 13: # 'RD' + 11 characters for the video id
+ # Mixes require a custom extraction process
+ return self._extract_mix(playlist_id)
+
+ # Extract the video ids from the playlist pages
+ ids = []
for page_num in itertools.count(1):
- start_index = self._MAX_RESULTS * (page_num - 1) + 1
- if start_index >= 1000:
- self._downloader.report_warning(u'Max number of results reached')
- break
- url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
+ url = self._TEMPLATE_URL % (playlist_id, page_num)
page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
+ matches = re.finditer(self._VIDEO_RE, page)
+ # We remove the duplicates and the link with index 0
+ # (it's not the first video of the playlist)
+ new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
+ ids.extend(new_ids)
- try:
- response = json.loads(page)
- except ValueError as err:
- raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
-
- if 'feed' not in response:
- raise ExtractorError(u'Got a malformed response from YouTube API')
- playlist_title = response['feed']['title']['$t']
- if 'entry' not in response['feed']:
- # Number of videos is a multiple of self._MAX_RESULTS
+ if re.search(self._MORE_PAGES_INDICATOR, page) is None:
break
- for entry in response['feed']['entry']:
- index = entry['yt$position']['$t']
- if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
- videos.append((
- index,
- 'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
- ))
+ playlist_title = self._og_search_title(page)
- videos = [v[1] for v in sorted(videos)]
-
- url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
- return [self.playlist_result(url_results, playlist_id, playlist_title)]
+ url_results = self._ids_to_results(ids)
+ return self.playlist_result(url_results, playlist_id, playlist_title)
class YoutubeChannelIE(InfoExtractor):
# Download channel page
channel_id = mobj.group(1)
video_ids = []
+ url = 'https://www.youtube.com/channel/%s/videos' % channel_id
+ channel_page = self._download_webpage(url, channel_id)
+ if re.search(r'channel-header-autogenerated-label', channel_page) is not None:
+ autogenerated = True
+ else:
+ autogenerated = False
- # Download all channel pages using the json-based channel_ajax query
- for pagenum in itertools.count(1):
- url = self._MORE_PAGES_URL % (pagenum, channel_id)
- page = self._download_webpage(url, channel_id,
- u'Downloading page #%s' % pagenum)
-
- page = json.loads(page)
-
- ids_in_page = self.extract_videos_from_page(page['content_html'])
- video_ids.extend(ids_in_page)
-
- if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
- break
+ if autogenerated:
+ # The videos are contained in a single page
+ # the ajax pages can't be used, they are empty
+ video_ids = self.extract_videos_from_page(channel_page)
+ else:
+ # Download all channel pages using the json-based channel_ajax query
+ for pagenum in itertools.count(1):
+ url = self._MORE_PAGES_URL % (pagenum, channel_id)
+ page = self._download_webpage(url, channel_id,
+ u'Downloading page #%s' % pagenum)
+
+ page = json.loads(page)
+
+ ids_in_page = self.extract_videos_from_page(page['content_html'])
+ video_ids.extend(ids_in_page)
+
+ if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
+ break
self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
- urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
- url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
- return [self.playlist_result(url_entries, channel_id)]
+ url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
+ for video_id in video_ids]
+ return self.playlist_result(url_entries, channel_id)
class YoutubeUserIE(InfoExtractor):
if len(ids_in_page) < self._GDATA_PAGE_SIZE:
break
- urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
- url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
- return [self.playlist_result(url_results, playlist_title = username)]
+ url_results = [
+ self.url_result(video_id, 'Youtube', video_id=video_id)
+ for video_id in video_ids]
+ return self.playlist_result(url_results, playlist_title=username)
+
class YoutubeSearchIE(SearchInfoExtractor):
IE_DESC = u'YouTube.com searches'
if len(video_ids) > n:
video_ids = video_ids[:n]
- videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
+ videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
+ for video_id in video_ids]
return self.playlist_result(videos, query)
class YoutubeSearchDateIE(YoutubeSearchIE):
+ IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
_API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
_SEARCH_KEY = 'ytsearchdate'
IE_DESC = u'YouTube.com searches, newest videos first'
Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
"""
_LOGIN_REQUIRED = True
- _PAGING_STEP = 30
# use action_load_personal_feed instead of action_load_system_feed
_PERSONAL_FEED = False
def _real_extract(self, url):
feed_entries = []
- # The step argument is available only in 2.7 or higher
- for i in itertools.count(0):
- paging = i*self._PAGING_STEP
+ paging = 0
+ for i in itertools.count(1):
info = self._download_webpage(self._FEED_TEMPLATE % paging,
u'%s feed' % self._FEED_NAME,
u'Downloading page %s' % i)
feed_html = info['feed_html']
m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
ids = orderedSet(m.group(1) for m in m_ids)
- feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
+ feed_entries.extend(
+ self.url_result(video_id, 'Youtube', video_id=video_id)
+ for video_id in ids)
if info['paging'] is None:
break
+ paging = info['paging']
return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
_VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
_FEED_NAME = 'watch_later'
_PLAYLIST_TITLE = u'Youtube Watch Later'
- _PAGING_STEP = 100
_PERSONAL_FEED = True
+class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
+    """Feed extractor for the YouTube watch history (needs authentication)."""
+    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
+    # Raw string: the pattern contains regex escapes ('\.') that are not
+    # valid string escapes in a plain literal.
+    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
+    _FEED_NAME = 'history'
+    _PERSONAL_FEED = True
+    _PLAYLIST_TITLE = u'Youtube Watch History'
+
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
IE_NAME = u'youtube:favorites'
IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
+# coding: utf-8
+
+import operator
import re
from .common import InfoExtractor
from ..utils import (
- determine_ext,
- ExtractorError,
+ unified_strdate,
)
class ZDFIE(InfoExtractor):
- _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek(?P<hash>#)?\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
- _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
+ _VALID_URL = r'^https?://www\.zdf\.de/ZDFmediathek(?P<hash>#)?/(.*beitrag/(?:video/)?)(?P<video_id>[0-9]+)(?:/[^/?]+)?(?:\?.*)?'
+
+ _TEST = {
+ u"url": u"http://www.zdf.de/ZDFmediathek/beitrag/video/2037704/ZDFspezial---Ende-des-Machtpokers--?bc=sts;stt",
+ u"file": u"2037704.webm",
+ u"info_dict": {
+ u"upload_date": u"20131127",
+ u"description": u"Union und SPD haben sich auf einen Koalitionsvertrag geeinigt. Aber was bedeutet das für die Bürger? Sehen Sie hierzu das ZDFspezial \"Ende des Machtpokers - Große Koalition für Deutschland\".",
+ u"uploader": u"spezial",
+ u"title": u"ZDFspezial - Ende des Machtpokers"
+ },
+ u"skip": u"Videos on ZDF.de are depublicised in short order",
+ }
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
video_id = mobj.group('video_id')
- if mobj.group('hash'):
- url = url.replace(u'#', u'', 1)
+        # Fetch the video metadata XML for this id.
+        xml_url = u'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id
+        doc = self._download_xml(
+            xml_url, video_id,
+            note=u'Downloading video info',
+            errnote=u'Failed to download video info')
+
+        title = doc.find('.//information/title').text
+        description = doc.find('.//information/detail').text
+        # The origin channel is optional in the document.
+        uploader_node = doc.find('.//details/originChannelTitle')
+        uploader = None if uploader_node is None else uploader_node.text
+        duration_str = doc.find('.//details/length').text
+        # HH:MM:SS with an optional fractional part. The whole '.ms' group
+        # is optional; previously only the digits were, so a duration
+        # without a trailing '.' never matched and was reported as None.
+        duration_m = re.match(r'''(?x)^
+            (?P<hours>[0-9]{2})
+            :(?P<minutes>[0-9]{2})
+            :(?P<seconds>[0-9]{2})
+            (?:\.(?P<ms>[0-9]+))?
+            ''', duration_str)
+        # Duration in whole seconds; the fractional part is ignored.
+        duration = (
+            (
+                (int(duration_m.group('hours')) * 60 * 60) +
+                (int(duration_m.group('minutes')) * 60) +
+                int(duration_m.group('seconds'))
+            )
+            if duration_m
+            else None
+        )
+        upload_date = unified_strdate(doc.find('.//details/airtime').text)
+
+ def xml_to_format(fnode):
+ video_url = fnode.find('url').text
+ is_available = u'http://www.metafilegenerator' not in video_url
- html = self._download_webpage(url, video_id)
- streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
- if streams is None:
- raise ExtractorError(u'No media url found.')
+ format_id = fnode.attrib['basetype']
+ format_m = re.match(r'''(?x)
+ (?P<vcodec>[^_]+)_(?P<acodec>[^_]+)_(?P<container>[^_]+)_
+ (?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+)
+ ''', format_id)
- # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
- # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
- # choose first/default media type and highest quality for now
- def stream_pref(s):
- TYPE_ORDER = ['ostreaming', 'hstreaming', 'wstreaming']
+ ext = format_m.group('container')
+ is_supported = ext != 'f4f'
+
+ PROTO_ORDER = ['http', 'rtmp', 'rtsp']
try:
- type_pref = TYPE_ORDER.index(s['media_type'])
+ proto_pref = -PROTO_ORDER.index(format_m.group('proto'))
except ValueError:
- type_pref = 999
+ proto_pref = 999
- QUALITY_ORDER = ['veryhigh', '300']
+ quality = fnode.find('./quality').text
+ QUALITY_ORDER = ['veryhigh', '300', 'high', 'med', 'low']
try:
- quality_pref = QUALITY_ORDER.index(s['quality'])
+ quality_pref = -QUALITY_ORDER.index(quality)
except ValueError:
quality_pref = 999
- return (type_pref, quality_pref)
-
- sorted_streams = sorted(streams, key=stream_pref)
- if not sorted_streams:
- raise ExtractorError(u'No stream found.')
- stream = sorted_streams[0]
-
- media_link = self._download_webpage(
- stream['video_url'],
- video_id,
- u'Get stream URL')
+ abr = int(fnode.find('./audioBitrate').text) // 1000
+ vbr = int(fnode.find('./videoBitrate').text) // 1000
+ pref = (is_available, is_supported,
+ proto_pref, quality_pref, vbr, abr)
- MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
- RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'
+ format_note = u''
+ if not is_supported:
+ format_note += u'(unsupported)'
+ if not format_note:
+ format_note = None
- mobj = re.search(self._MEDIA_STREAM, media_link)
- if mobj is None:
- mobj = re.search(RTSP_STREAM, media_link)
- if mobj is None:
- raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
- video_url = mobj.group('video_url')
+ return {
+ 'format_id': format_id + u'-' + quality,
+ 'url': video_url,
+ 'ext': ext,
+ 'acodec': format_m.group('acodec'),
+ 'vcodec': format_m.group('vcodec'),
+ 'abr': abr,
+ 'vbr': vbr,
+ 'width': int(fnode.find('./width').text),
+ 'height': int(fnode.find('./height').text),
+ 'filesize': int(fnode.find('./filesize').text),
+ 'format_note': format_note,
+ '_pref': pref,
+ '_available': is_available,
+ }
- title = self._html_search_regex(
- r'<h1(?: class="beitragHeadline")?>(.*?)</h1>',
- html, u'title')
+ format_nodes = doc.findall('.//formitaeten/formitaet')
+ formats = sorted(filter(lambda f: f['_available'],
+ map(xml_to_format, format_nodes)),
+ key=operator.itemgetter('_pref'))
return {
'id': video_id,
- 'url': video_url,
'title': title,
- 'ext': determine_ext(video_url)
+ 'formats': formats,
+ 'description': description,
+ 'uploader': uploader,
+ 'duration': duration,
+ 'upload_date': upload_date,
}
import json
import traceback
import hashlib
+import os
import subprocess
import sys
from zipimport import zipimporter
-from .utils import *
+from .utils import (
+ compat_str,
+ compat_urllib_request,
+)
from .version import __version__
def rsa_verify(message, signature, key):
if signature != sha256(message).digest(): return False
return True
+
def update_self(to_screen, verbose):
"""Update the program file with the latest version from the repository"""
return
version_id = versions_info['latest']
+
+ def version_tuple(version_str):
+ return tuple(map(int, version_str.split('.')))
+ if version_tuple(__version__) >= version_tuple(version_id):
+ to_screen(u'youtube-dl is up to date (%s)' % __version__)
+ return
+
to_screen(u'Updating to version ' + version_id + '...')
version = versions_info['versions'][version_id]
urlh = compat_urllib_request.urlopen(version['exe'][0])
newcontent = urlh.read()
urlh.close()
- except (IOError, OSError) as err:
+ except (IOError, OSError):
if verbose: to_screen(compat_str(traceback.format_exc()))
to_screen(u'ERROR: unable to download latest version')
return
try:
with open(exe + '.new', 'wb') as outf:
outf.write(newcontent)
- except (IOError, OSError) as err:
+ except (IOError, OSError):
if verbose: to_screen(compat_str(traceback.format_exc()))
to_screen(u'ERROR: unable to write the new version')
return
subprocess.Popen([bat]) # Continues to run in the background
return # Do not show premature success messages
- except (IOError, OSError) as err:
+ except (IOError, OSError):
if verbose: to_screen(compat_str(traceback.format_exc()))
to_screen(u'ERROR: unable to overwrite current version')
return
urlh = compat_urllib_request.urlopen(version['bin'][0])
newcontent = urlh.read()
urlh.close()
- except (IOError, OSError) as err:
+ except (IOError, OSError):
if verbose: to_screen(compat_str(traceback.format_exc()))
to_screen(u'ERROR: unable to download latest version')
return
try:
with open(filename, 'wb') as outf:
outf.write(newcontent)
- except (IOError, OSError) as err:
+ except (IOError, OSError):
if verbose: to_screen(compat_str(traceback.format_exc()))
to_screen(u'ERROR: unable to overwrite current version')
return
import io
import json
import locale
+import math
import os
import pipes
import platform
import re
+import ssl
import socket
import sys
import traceback
+import xml.etree.ElementTree
import zlib
try:
else:
return '%d' % secs
def make_HTTPS_handler(opts_no_check_certificate):
    """Build the urllib HTTPS handler used for HTTPS requests.

    SSLv3 is the preferred protocol; SSLv23 (auto-negotiation) is used
    when SSLv3 is unavailable in the running ssl module or, on the
    Python < 3.2 path, when the SSLv3 handshake raises ssl.SSLError.

    opts_no_check_certificate -- if true, certificate validation is
        disabled (ssl.CERT_NONE). Only consulted on Python >= 3.2; the
        Python < 3.2 path never passes verification settings to
        ssl.wrap_socket.

    Returns an HTTPSHandler instance (a subclass on Python < 3.2).
    """
    # PROTOCOL_SSLv3 does not exist when the underlying OpenSSL was built
    # without SSLv3; look it up defensively instead of letting the
    # attribute access raise AttributeError.
    sslv3 = getattr(ssl, 'PROTOCOL_SSLv3', None)

    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if self._tunnel_host:
                    # Establish the proxy tunnel on the plain socket before
                    # wrapping it in SSL.
                    self.sock = sock
                    self._tunnel()
                if sslv3 is None:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
                    return
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=sslv3)
                except ssl.SSLError:
                    # Retry with protocol auto-negotiation.
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3()
    else:
        protocol = sslv3 if sslv3 is not None else ssl.PROTOCOL_SSLv23
        context = ssl.SSLContext(protocol)
        context.set_default_verify_paths()
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        return compat_urllib_request.HTTPSHandler(context=context)
'%Y/%m/%d %H:%M:%S',
'%d.%m.%Y %H:%M',
'%Y-%m-%dT%H:%M:%SZ',
+ '%Y-%m-%dT%H:%M:%S.%fZ',
+ '%Y-%m-%dT%H:%M:%S.%f0Z',
'%Y-%m-%dT%H:%M:%S',
]
for expression in format_expressions:
def shell_quote(args):
    """Return the arguments joined into one shell-escaped command string.

    args -- iterable of command-line arguments. Byte strings (e.g. file
        names produced by 'encodeFilename') are decoded with the
        filesystem encoding, or UTF-8 when that is unknown, before
        being quoted.

    Returns a unicode string with the quoted arguments separated by
    single spaces.
    """
    # pipes.quote was an undocumented alias and the pipes module is gone
    # in Python 3.13; shlex.quote is the supported equivalent (3.3+).
    try:
        from shlex import quote as _quote
    except ImportError:  # Python 2
        from pipes import quote as _quote

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'

    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(_quote(a))
    return u' '.join(quoted_args)
def takewhile_inclusive(pred, seq):
jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
data = json.loads(jsond)
return url, data
+
+
def format_bytes(bytes):
    """Format a byte count as a human-readable string with a binary suffix.

    bytes -- the size as a number, a numeric string, or None.

    Returns u'N/A' for None; otherwise the value scaled by the largest
    power of 1024 that fits, with two decimals and a suffix from
    B to YiB (e.g. u'1.50KiB'). Values of 1024 YiB or more stay in YiB,
    and values below 1 (including negatives' magnitudes) stay in B.
    """
    if bytes is None:
        return u'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)

    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    converted = float(bytes)
    exponent = 0
    # Repeated division by 1024 is exact in binary floating point, so this
    # avoids the rounding error of math.log at exact powers of 1024 and
    # keeps the exponent inside the suffix table for tiny or huge values.
    while abs(converted) >= 1024.0 and exponent < len(suffixes) - 1:
        converted /= 1024.0
        exponent += 1
    return u'%.2f%s' % (converted, suffixes[exponent])
-__version__ = '2013.11.11'
+__version__ = '2013.12.04'