From: Rogério Brito Date: Fri, 21 Jun 2013 00:13:34 +0000 (-0300) Subject: Merge tag 'upstream/2013.06.21' X-Git-Url: https://git.rapsys.eu/youtubedl/commitdiff_plain/b559da0987abed92b05a784938d25c722b86dc70?hp=7d51f6f7c3910fa53b143529e7de31802c5073fb Merge tag 'upstream/2013.06.21' Upstream version 2013.06.21 Conflicts: .gitignore --- diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 77469b8..0000000 --- a/.gitignore +++ /dev/null @@ -1,20 +0,0 @@ -*.pyc -*.pyo -*~ -*.DS_Store -wine-py2exe/ -py2exe.log -*.kate-swp -build/ -dist/ -MANIFEST -README.txt -youtube-dl.1 -youtube-dl.bash-completion -youtube-dl -youtube-dl.exe -youtube-dl.tar.gz -.coverage -cover/ -updates_key.pem -*.egg-info diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 7f1fa8a..0000000 --- a/.travis.yml +++ /dev/null @@ -1,15 +0,0 @@ -language: python -python: - - "2.6" - - "2.7" - - "3.3" -script: nosetests test --verbose -notifications: - email: - - filippo.valsorda@gmail.com - - phihag@phihag.de - - jaime.marquinez.ferrandiz+travis@gmail.com -# irc: -# channels: -# - "irc.freenode.org#youtube-dl" -# skip_join: true diff --git a/LATEST_VERSION b/LATEST_VERSION deleted file mode 100644 index a334573..0000000 --- a/LATEST_VERSION +++ /dev/null @@ -1 +0,0 @@ -2012.12.99 diff --git a/README.txt b/README.txt new file mode 100644 index 0000000..c5e2ce3 --- /dev/null +++ b/README.txt @@ -0,0 +1,332 @@ +NAME +==== + +youtube-dl + +SYNOPSIS +======== + +youtube-dl OPTIONS URL [URL...] + +DESCRIPTION +=========== + +youtube-dl is a small command-line program to download videos from +YouTube.com and a few more sites. It requires the Python interpreter, +version 2.6, 2.7, or 3.3+, and it is not platform specific. It should +work on your Unix box, on Windows or on Mac OS X. It is released to the +public domain, which means you can modify it, redistribute it or use it +however you like. 
+ +OPTIONS +======= + + -h, --help print this help text and exit + --version print program version and exit + -U, --update update this program to latest version + -i, --ignore-errors continue on download errors + -r, --rate-limit LIMIT maximum download rate (e.g. 50k or 44.6m) + -R, --retries RETRIES number of retries (default is 10) + --buffer-size SIZE size of download buffer (e.g. 1024 or 16k) + (default is 1024) + --no-resize-buffer do not automatically adjust the buffer size. By + default, the buffer size is automatically resized + from an initial value of SIZE. + --dump-user-agent display the current browser identification + --user-agent UA specify a custom user agent + --referer REF specify a custom referer, use if the video access + is restricted to one domain + --list-extractors List all supported extractors and the URLs they + would handle + --proxy URL Use the specified HTTP/HTTPS proxy + --no-check-certificate Suppress HTTPS certificate validation. + +Video Selection: +---------------- + + --playlist-start NUMBER playlist video to start at (default is 1) + --playlist-end NUMBER playlist video to end at (default is last) + --match-title REGEX download only matching titles (regex or caseless + sub-string) + --reject-title REGEX skip download for matching titles (regex or + caseless sub-string) + --max-downloads NUMBER Abort after downloading NUMBER files + --min-filesize SIZE Do not download any videos smaller than SIZE + (e.g. 50k or 44.6m) + --max-filesize SIZE Do not download any videos larger than SIZE (e.g. 
+ 50k or 44.6m) + --date DATE download only videos uploaded in this date + --datebefore DATE download only videos uploaded before this date + --dateafter DATE download only videos uploaded after this date + +Filesystem Options: +------------------- + + -t, --title use title in file name (default) + --id use only video ID in file name + -l, --literal [deprecated] alias of --title + -A, --auto-number number downloaded files starting from 00000 + -o, --output TEMPLATE output filename template. Use %(title)s to get + the title, %(uploader)s for the uploader name, + %(uploader_id)s for the uploader nickname if + different, %(autonumber)s to get an automatically + incremented number, %(ext)s for the filename + extension, %(upload_date)s for the upload date + (YYYYMMDD), %(extractor)s for the provider + (youtube, metacafe, etc), %(id)s for the video id + , %(playlist)s for the playlist the video is in, + %(playlist_index)s for the position in the + playlist and %% for a literal percent. Use - to + output to stdout. Can also be used to download to + a different directory, for example with -o '/my/d + ownloads/%(uploader)s/%(title)s-%(id)s.%(ext)s' . 
+ --autonumber-size NUMBER Specifies the number of digits in %(autonumber)s + when it is present in output filename template or + --auto-number option is given + --restrict-filenames Restrict filenames to only ASCII characters, and + avoid "&" and spaces in filenames + -a, --batch-file FILE file containing URLs to download ('-' for stdin) + -w, --no-overwrites do not overwrite files + -c, --continue resume partially downloaded files + --no-continue do not resume partially downloaded files (restart + from beginning) + --cookies FILE file to read cookies from and dump cookie jar in + --no-part do not use .part files + --no-mtime do not use the Last-modified header to set the + file modification time + --write-description write video description to a .description file + --write-info-json write video metadata to a .info.json file + --write-thumbnail write thumbnail image to disk + +Verbosity / Simulation Options: +------------------------------- + + -q, --quiet activates quiet mode + -s, --simulate do not download the video and do not write + anything to disk + --skip-download do not download the video + -g, --get-url simulate, quiet but print URL + -e, --get-title simulate, quiet but print title + --get-id simulate, quiet but print id + --get-thumbnail simulate, quiet but print thumbnail URL + --get-description simulate, quiet but print video description + --get-filename simulate, quiet but print output filename + --get-format simulate, quiet but print output format + --newline output progress bar as new lines + --no-progress do not print progress bar + --console-title display progress in console titlebar + -v, --verbose print various debugging information + --dump-intermediate-pages print downloaded pages to debug problems (very + verbose) + +Video Format Options: +--------------------- + + -f, --format FORMAT video format code, specify the order of + preference using slashes: "-f 22/17/18" + --all-formats download all available video formats + --prefer-free-formats 
prefer free video formats unless a specific one + is requested + --max-quality FORMAT highest quality format to download + -F, --list-formats list all available formats (currently youtube + only) + --write-sub write subtitle file (currently youtube only) + --only-sub [deprecated] alias of --skip-download + --all-subs downloads all the available subtitles of the + video (currently youtube only) + --list-subs lists all available subtitles for the video + (currently youtube only) + --sub-format LANG subtitle format [srt/sbv] (default=srt) + (currently youtube only) + --sub-lang LANG language of the subtitles to download (optional) + use IETF language tags like 'en' + +Authentication Options: +----------------------- + + -u, --username USERNAME account username + -p, --password PASSWORD account password + -n, --netrc use .netrc authentication data + +Post-processing Options: +------------------------ + + -x, --extract-audio convert video files to audio-only files (requires + ffmpeg or avconv and ffprobe or avprobe) + --audio-format FORMAT "best", "aac", "vorbis", "mp3", "m4a", "opus", or + "wav"; best by default + --audio-quality QUALITY ffmpeg/avconv audio quality specification, insert + a value between 0 (better) and 9 (worse) for VBR + or a specific bitrate like 128K (default 5) + --recode-video FORMAT Encode the video to another format if necessary + (currently supported: mp4|flv|ogg|webm) + -k, --keep-video keeps the video file on disk after the post- + processing; the video is erased by default + --no-post-overwrites do not overwrite post-processed files; the post- + processed files are overwritten by default + +CONFIGURATION +============= + +You can configure youtube-dl by placing default arguments (such as +--extract-audio --no-mtime to always extract the audio and not copy the +mtime) into /etc/youtube-dl.conf and/or ~/.config/youtube-dl.conf. + +OUTPUT TEMPLATE +=============== + +The -o option allows users to indicate a template for the output file +names. 
The basic usage is not to set any template arguments when +downloading a single file, like in +youtube-dl -o funny_video.flv "http://some/video". However, it may +contain special sequences that will be replaced when downloading each +video. The special sequences have the format %(NAME)s. To clarify, that +is a percent symbol followed by a name in parentheses, followed by a +lowercase S. Allowed names are: + +- id: The sequence will be replaced by the video identifier. +- url: The sequence will be replaced by the video URL. +- uploader: The sequence will be replaced by the nickname of the + person who uploaded the video. +- upload_date: The sequence will be replaced by the upload date in + YYYYMMDD format. +- title: The sequence will be replaced by the video title. +- ext: The sequence will be replaced by the appropriate extension + (like flv or mp4). +- epoch: The sequence will be replaced by the Unix epoch when creating + the file. +- autonumber: The sequence will be replaced by a five-digit number + that will be increased with each download, starting at zero. +- playlist: The name or the id of the playlist that contains the + video. +- playlist_index: The index of the video in the playlist, a five-digit + number. + +The current default template is %(id)s.%(ext)s, but that will be +switched to %(title)s-%(id)s.%(ext)s (which can be requested with -t at +the moment). + +In some cases, you don't want special characters such as 中, spaces, or +&, such as when transferring the downloaded filename to a Windows system +or the filename through an 8bit-unsafe channel. 
In these cases, add the +--restrict-filenames flag to get a shorter title: + + $ youtube-dl --get-filename -o "%(title)s.%(ext)s" BaW_jenozKc + youtube-dl test video ''_ä↭𝕐.mp4 # All kinds of weird characters + $ youtube-dl --get-filename -o "%(title)s.%(ext)s" BaW_jenozKc --restrict-filenames + youtube-dl_test_video_.mp4 # A simple file name + +VIDEO SELECTION +=============== + +Videos can be filtered by their upload date using the options --date, +--datebefore or --dateafter; they accept dates in two formats: + +- Absolute dates: Dates in the format YYYYMMDD. +- Relative dates: Dates in the format + (now|today)[+-][0-9](day|week|month|year)(s)? + +Examples: + + $ youtube-dl --dateafter now-6months #will only download the videos uploaded in the last 6 months + $ youtube-dl --date 19700101 #will only download the videos uploaded on January 1, 1970 + $ youtube-dl --dateafter 20000101 --datebefore 20100101 #will only download the videos uploaded between 2000 and 2010 + +FAQ +=== + +Can you please put the -b option back? + +Most people asking this question are not aware that youtube-dl now +defaults to downloading the highest available quality as reported by +YouTube, which will be 1080p or 720p in some cases, so you no longer +need the -b option. For some specific videos, maybe YouTube does not +report them to be available in a specific high quality format you're +interested in. In that case, simply request it with the -f option and +youtube-dl will try to download it. + +I get HTTP error 402 when trying to download a video. What's this? + +Apparently YouTube requires you to pass a CAPTCHA test if you download +too much. We're considering providing a way to let you solve the +CAPTCHA, but at the moment, your best course of action is pointing a +webbrowser to the youtube URL, solving the CAPTCHA, and restarting +youtube-dl. + +I have downloaded a video but how can I play it? + +Once the video is fully downloaded, use any video player, such as vlc or +mplayer. 
+ +The links provided by youtube-dl -g are not working anymore + +The URLs youtube-dl outputs require the downloader to have the correct +cookies. Use the --cookies option to write the required cookies into a +file, and advise your downloader to read cookies from that file. Some +sites also require a common user agent to be used, use --dump-user-agent +to see the one in use by youtube-dl. + +ERROR: no fmt_url_map or conn information found in video info + +youtube has switched to a new video info format in July 2011 which is +not supported by old versions of youtube-dl. You can update youtube-dl +with sudo youtube-dl --update. + +ERROR: unable to download video + +youtube requires an additional signature since September 2012 which is +not supported by old versions of youtube-dl. You can update youtube-dl +with sudo youtube-dl --update. + +SyntaxError: Non-ASCII character + +The error + + File "youtube-dl", line 2 + SyntaxError: Non-ASCII character '\x93' ... + +means you're using an outdated version of Python. Please update to +Python 2.6 or 2.7. + +What is this binary file? Where has the code gone? + +Since June 2012 (#342) youtube-dl is packed as an executable zipfile, +simply unzip it (might need renaming to youtube-dl.zip first on some +systems) or clone the git repository, as laid out above. If you modify +the code, you can run it by executing the __main__.py file. To recompile +the executable, run make youtube-dl. + +The exe throws a Runtime error from Visual C++ + +To run the exe you need to install first the Microsoft Visual C++ 2008 +Redistributable Package. + +COPYRIGHT +========= + +youtube-dl is released into the public domain by the copyright holders. + +This README file was originally written by Daniel Bolton +(https://github.com/dbbolton) and is likewise released into the public +domain. 
+ +BUGS +==== + +Bugs and suggestions should be reported at: +https://github.com/rg3/youtube-dl/issues + +Please include: + +- Your exact command line, like + youtube-dl -t "http://www.youtube.com/watch?v=uHlDtZ6Oc3s&feature=channel_video_title". + A common mistake is not to escape the &. Putting URLs in quotes + should solve this problem. +- If possible re-run the command with --verbose, and include the full + output, it is really helpful to us. +- The output of youtube-dl --version +- The output of python --version +- The name and version of your Operating System ("Ubuntu 11.04 x64" or + "Windows 7 x64" is usually enough). + +For discussions, join us in the irc channel #youtube-dl on freenode. diff --git a/devscripts/release.sh b/devscripts/release.sh index b2a91f8..b8efdab 100755 --- a/devscripts/release.sh +++ b/devscripts/release.sh @@ -22,7 +22,7 @@ if [ ! -f "updates_key.pem" ]; then echo 'ERROR: updates_key.pem missing'; exit /bin/echo -e "\n### First of all, testing..." make cleanall -nosetests --with-coverage --cover-package=youtube_dl --cover-html test --stop || exit 1 +nosetests --verbose --with-coverage --cover-package=youtube_dl --cover-html test --stop || exit 1 /bin/echo -e "\n### Changing version in version.py..." 
sed -i "s/__version__ = '.*'/__version__ = '$version'/" youtube_dl/version.py diff --git a/test/test_all_urls.py b/test/test_all_urls.py index a403601..dd67286 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -7,7 +7,7 @@ import unittest import os sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from youtube_dl.InfoExtractors import YoutubeIE, YoutubePlaylistIE, YoutubeChannelIE +from youtube_dl.InfoExtractors import YoutubeIE, YoutubePlaylistIE, YoutubeChannelIE, JustinTVIE class TestAllURLsMatching(unittest.TestCase): def test_youtube_playlist_matching(self): @@ -29,6 +29,22 @@ class TestAllURLsMatching(unittest.TestCase): self.assertTrue(YoutubeChannelIE.suitable('https://www.youtube.com/channel/HCtnHdj3df7iM?feature=gb_ch_rec')) self.assertTrue(YoutubeChannelIE.suitable('https://www.youtube.com/channel/HCtnHdj3df7iM/videos')) + def test_justin_tv_channelid_matching(self): + self.assertTrue(JustinTVIE.suitable(u"justin.tv/vanillatv")) + self.assertTrue(JustinTVIE.suitable(u"twitch.tv/vanillatv")) + self.assertTrue(JustinTVIE.suitable(u"www.justin.tv/vanillatv")) + self.assertTrue(JustinTVIE.suitable(u"www.twitch.tv/vanillatv")) + self.assertTrue(JustinTVIE.suitable(u"http://www.justin.tv/vanillatv")) + self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/vanillatv")) + self.assertTrue(JustinTVIE.suitable(u"http://www.justin.tv/vanillatv/")) + self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/vanillatv/")) + + def test_justintv_videoid_matching(self): + self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/vanillatv/b/328087483")) + + def test_justin_tv_chapterid_matching(self): + self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/tsm_theoddone/c/2349361")) + def test_youtube_extract(self): self.assertEqual(YoutubeIE()._extract_id('http://www.youtube.com/watch?&v=BaW_jenozKc'), 'BaW_jenozKc') self.assertEqual(YoutubeIE()._extract_id('https://www.youtube.com/watch?&v=BaW_jenozKc'), 
'BaW_jenozKc') diff --git a/test/test_download.py b/test/test_download.py index 3eca333..577bcdb 100644 --- a/test/test_download.py +++ b/test/test_download.py @@ -7,8 +7,8 @@ import os import json import unittest import sys -import hashlib import socket +import binascii # Allow direct execution sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -38,11 +38,16 @@ def _try_rm(filename): if ose.errno != errno.ENOENT: raise +md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() + class FileDownloader(youtube_dl.FileDownloader): def __init__(self, *args, **kwargs): self.to_stderr = self.to_screen self.processed_info_dicts = [] return youtube_dl.FileDownloader.__init__(self, *args, **kwargs) + def report_warning(self, message): + # Don't accept warnings during tests + raise ExtractorError(message) def process_info(self, info_dict): self.processed_info_dicts.append(info_dict) return youtube_dl.FileDownloader.process_info(self, info_dict) @@ -121,7 +126,21 @@ def generator(test_case): with io.open(tc['file'] + '.info.json', encoding='utf-8') as infof: info_dict = json.load(infof) for (info_field, value) in tc.get('info_dict', {}).items(): - self.assertEqual(value, info_dict.get(info_field)) + if isinstance(value, compat_str) and value.startswith('md5:'): + self.assertEqual(value, 'md5:' + md5(info_dict.get(info_field))) + else: + self.assertEqual(value, info_dict.get(info_field)) + + # If checkable fields are missing from the test case, print the info_dict + test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value)) + for key, value in info_dict.items() + if value and key in ('title', 'description', 'uploader', 'upload_date', 'uploader_id', 'location')) + if not all(key in tc.get('info_dict', {}).keys() for key in test_info_dict.keys()): + sys.stderr.write(u'\n"info_dict": ' + json.dumps(test_info_dict, ensure_ascii=False, indent=2) + u'\n') + + # Check for the presence of mandatory 
fields + for key in ('id', 'url', 'title', 'ext'): + self.assertTrue(key in info_dict.keys() and info_dict[key]) finally: for tc in test_cases: _try_rm(tc['file']) diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index 78657b5..e8b49ff 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -53,8 +53,7 @@ class TestYoutubeLists(unittest.TestCase): dl = FakeDownloader() ie = YoutubePlaylistIE(dl) result = ie.extract('PLBB231211A4F62143')[0] - self.assertEqual(result['title'], 'Team Fortress 2') - self.assertTrue(len(result['entries']) > 40) + self.assertTrue(len(result['entries']) > 25) def test_youtube_playlist_long(self): dl = FakeDownloader() @@ -105,5 +104,11 @@ class TestYoutubeLists(unittest.TestCase): result = ie.extract('https://www.youtube.com/user/TheLinuxFoundation')[0] self.assertTrue(len(result['entries']) >= 320) + def test_youtube_safe_search(self): + dl = FakeDownloader() + ie = YoutubePlaylistIE(dl) + result = ie.extract('PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl')[0] + self.assertEqual(len(result['entries']), 2) + if __name__ == '__main__': unittest.main() diff --git a/test/test_youtube_subtitles.py b/test/test_youtube_subtitles.py index a123e6d..c80c90c 100644 --- a/test/test_youtube_subtitles.py +++ b/test/test_youtube_subtitles.py @@ -28,7 +28,9 @@ compat_urllib_request.install_opener(opener) class FakeDownloader(FileDownloader): def __init__(self): self.result = [] - self.params = parameters + # Different instances of the downloader can't share the same dictionary + # some test set the "sublang" parameter, which would break the md5 checks. 
+ self.params = dict(parameters) def to_screen(self, s): print(s) def trouble(self, s, tb=None): @@ -96,6 +98,14 @@ class TestYoutubeSubtitles(unittest.TestCase): IE = YoutubeIE(DL) info_dict = IE.extract('QRS8MkLhQmM') self.assertEqual(info_dict, None) + def test_youtube_automatic_captions(self): + DL = FakeDownloader() + DL.params['writesubtitles'] = True + DL.params['subtitleslang'] = 'it' + IE = YoutubeIE(DL) + info_dict = IE.extract('8YoUxe5ncPo') + sub = info_dict[0]['subtitles'][0] + self.assertTrue(sub[2] is not None) if __name__ == '__main__': unittest.main() diff --git a/test/tests.json b/test/tests.json index f57ebf1..3e0db29 100644 --- a/test/tests.json +++ b/test/tests.json @@ -15,43 +15,76 @@ "name": "Dailymotion", "md5": "392c4b85a60a90dc4792da41ce3144eb", "url": "http://www.dailymotion.com/video/x33vw9_tutoriel-de-youtubeur-dl-des-video_tech", - "file": "x33vw9.mp4" + "file": "x33vw9.mp4", + "info_dict": { + "uploader": "Alex and Van .", + "title": "Tutoriel de Youtubeur\"DL DES VIDEO DE YOUTUBE\"" + } }, { "name": "Metacafe", "add_ie": ["Youtube"], "url": "http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/", - "file": "_aUehQsCQtM.flv" + "file": "_aUehQsCQtM.flv", + "info_dict": { + "upload_date": "20090102", + "title": "The Electric Company | \"Short I\" | PBS KIDS GO!", + "description": "md5:2439a8ef6d5a70e380c22f5ad323e5a8", + "uploader": "PBS", + "uploader_id": "PBS" + } }, { "name": "BlipTV", "md5": "b2d849efcf7ee18917e4b4d9ff37cafe", "url": "http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352", - "file": "5779306.m4v" + "file": "5779306.m4v", + "info_dict": { + "upload_date": "20111205", + "description": "md5:9bc31f227219cde65e47eeec8d2dc596", + "uploader": "Comic Book Resources - CBR TV", + "title": "CBR EXCLUSIVE: \"Gotham City Imposters\" Bats VS Jokerz Short 3" + } }, { "name": "XVideos", "md5": "1d0c835822f0a71a7bf011855db929d0", "url": 
"http://www.xvideos.com/video939581/funny_porns_by_s_-1", - "file": "939581.flv" + "file": "939581.flv", + "info_dict": { + "title": "Funny Porns By >>>>S<<<<<< -1" + } }, { "name": "YouPorn", "md5": "c37ddbaaa39058c76a7e86c6813423c1", "url": "http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/", - "file": "505835.mp4" + "file": "505835.mp4", + "info_dict": { + "upload_date": "20101221", + "description": "Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?", + "uploader": "Ask Dan And Jennifer", + "title": "Sex Ed: Is It Safe To Masturbate Daily?" + } }, { "name": "Pornotube", "md5": "374dd6dcedd24234453b295209aa69b6", "url": "http://pornotube.com/c/173/m/1689755/Marilyn-Monroe-Bathing", - "file": "1689755.flv" + "file": "1689755.flv", + "info_dict": { + "upload_date": "20090708", + "title": "Marilyn-Monroe-Bathing" + } }, { "name": "YouJizz", "md5": "07e15fa469ba384c7693fd246905547c", "url": "http://www.youjizz.com/videos/zeichentrick-1-2189178.html", - "file": "2189178.flv" + "file": "2189178.flv", + "info_dict": { + "title": "Zeichentrick 1" + } }, { "name": "Vimeo", @@ -70,61 +103,103 @@ "name": "Soundcloud", "md5": "ebef0a451b909710ed1d7787dddbf0d7", "url": "http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy", - "file": "62986583.mp3" + "file": "62986583.mp3", + "info_dict": { + "upload_date": "20121011", + "description": "No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o'd", + "uploader": "E.T. 
ExTerrestrial Music", + "title": "Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1" + } }, { "name": "StanfordOpenClassroom", "md5": "544a9468546059d4e80d76265b0443b8", "url": "http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100", - "file": "PracticalUnix_intro-environment.mp4" + "file": "PracticalUnix_intro-environment.mp4", + "info_dict": { + "title": "Intro Environment" + } }, { "name": "XNXX", "md5": "0831677e2b4761795f68d417e0b7b445", "url": "http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_", - "file": "1135332.flv" + "file": "1135332.flv", + "info_dict": { + "title": "lida » Naked Funny Actress (5)" + } }, { "name": "Youku", "url": "http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html", "file": "XNDgyMDQ2NTQw_part00.flv", "md5": "ffe3f2e435663dc2d1eea34faeff5b5b", - "params": { "test": false } + "params": { "test": false }, + "info_dict": { + "title": "youtube-dl test video \"'/\\ä↭𝕐" + } }, { "name": "NBA", "url": "http://www.nba.com/video/games/nets/2012/12/04/0021200253-okc-bkn-recap.nba/index.html", "file": "0021200253-okc-bkn-recap.nba.mp4", - "md5": "c0edcfc37607344e2ff8f13c378c88a4" + "md5": "c0edcfc37607344e2ff8f13c378c88a4", + "info_dict": { + "description": "Kevin Durant scores 32 points and dishes out six assists as the Thunder beat the Nets in Brooklyn.", + "title": "Thunder vs. 
Nets" + } }, { "name": "JustinTV", "url": "http://www.twitch.tv/thegamedevhub/b/296128360", "file": "296128360.flv", - "md5": "ecaa8a790c22a40770901460af191c9a" + "md5": "ecaa8a790c22a40770901460af191c9a", + "info_dict": { + "upload_date": "20110927", + "uploader_id": 25114803, + "uploader": "thegamedevhub", + "title": "Beginner Series - Scripting With Python Pt.1" + } }, { "name": "MyVideo", "url": "http://www.myvideo.de/watch/8229274/bowling_fail_or_win", "file": "8229274.flv", - "md5": "2d2753e8130479ba2cb7e0a37002053e" + "md5": "2d2753e8130479ba2cb7e0a37002053e", + "info_dict": { + "title": "bowling-fail-or-win" + } }, { "name": "Escapist", "url": "http://www.escapistmagazine.com/videos/view/the-escapist-presents/6618-Breaking-Down-Baldurs-Gate", "file": "6618-Breaking-Down-Baldurs-Gate.mp4", - "md5": "c6793dbda81388f4264c1ba18684a74d" + "md5": "c6793dbda81388f4264c1ba18684a74d", + "info_dict": { + "description": "Baldur's Gate: Original, Modded or Enhanced Edition? I'll break down what you can expect from the new Baldur's Gate: Enhanced Edition.", + "uploader": "the-escapist-presents", + "title": "Breaking Down Baldur's Gate" + } }, { "name": "GooglePlus", "url": "https://plus.google.com/u/0/108897254135232129896/posts/ZButuJc6CtH", - "file": "ZButuJc6CtH.flv" + "file": "ZButuJc6CtH.flv", + "info_dict": { + "upload_date": "20120613", + "uploader": "井上ヨシマサ", + "title": "嘆きの天使 降臨" + } }, { "name": "FunnyOrDie", "url": "http://www.funnyordie.com/videos/0732f586d7/heart-shaped-box-literal-video-version", "file": "0732f586d7.mp4", - "md5": "f647e9e90064b53b6e046e75d0241fbd" + "md5": "f647e9e90064b53b6e046e75d0241fbd", + "info_dict": { + "description": "Lyrics changed to match the video. Spoken cameo by Obscurus Lupa (from ThatGuyWithTheGlasses.com). Based on a concept by Dustin McLean (DustFilms.com). Performed, edited, and written by David A. 
Scott.", + "title": "Heart-Shaped Box: Literal Video Version" + } }, { "name": "Steam", @@ -161,6 +236,7 @@ "url": "http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things", "file": "12-jan-pythonthings.mp4", "info_dict": { + "description": "Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.", "title": "A Few of My Favorite [Python] Things" }, "params": { @@ -173,7 +249,10 @@ "file": "422212.mp4", "md5": "4e2f5cb088a83cd8cdb7756132f9739d", "info_dict": { - "title": "thedailyshow-kristen-stewart part 1" + "upload_date": "20121214", + "description": "Kristen Stewart", + "uploader": "thedailyshow", + "title": "thedailyshow-kristen-stewart part 1" } }, { @@ -224,42 +303,48 @@ "file": "11885679.m4a", "md5": "d30b5b5f74217410f4689605c35d1fd7", "info_dict": { - "title": "youtube-dl project as well - youtube-dl test track 3 \"'/\\\u00e4\u21ad" + "title": "youtube-dl project as well - youtube-dl test track 3 \"'/\\\u00e4\u21ad", + "uploader_id": "ytdl" } }, { "file": "11885680.m4a", "md5": "4eb0a669317cd725f6bbd336a29f923a", "info_dict": { - "title": "youtube-dl project as well - youtube-dl test track 4 \"'/\\\u00e4\u21ad" + "title": "youtube-dl project as well - youtube-dl test track 4 \"'/\\\u00e4\u21ad", + "uploader_id": "ytdl" } }, { "file": "11885682.m4a", "md5": "1893e872e263a2705558d1d319ad19e8", "info_dict": { - "title": "PH - youtube-dl test track 5 \"'/\\\u00e4\u21ad" + "title": "PH - youtube-dl test track 5 \"'/\\\u00e4\u21ad", + "uploader_id": "ytdl" } }, { "file": "11885683.m4a", "md5": "b673c46f47a216ab1741ae8836af5899", "info_dict": { - "title": "PH - youtube-dl test track 6 \"'/\\\u00e4\u21ad" + "title": "PH - youtube-dl test track 6 \"'/\\\u00e4\u21ad", + "uploader_id": "ytdl" } }, { "file": "11885684.m4a", "md5": "1d74534e95df54986da7f5abf7d842b7", "info_dict": { - "title": "phihag - youtube-dl test track 7 \"'/\\\u00e4\u21ad" + "title": "phihag - 
youtube-dl test track 7 \"'/\\\u00e4\u21ad", + "uploader_id": "ytdl" } }, { "file": "11885685.m4a", "md5": "f081f47af8f6ae782ed131d38b9cd1c0", "info_dict": { - "title": "phihag - youtube-dl test track 8 \"'/\\\u00e4\u21ad" + "title": "phihag - youtube-dl test track 8 \"'/\\\u00e4\u21ad", + "uploader_id": "ytdl" } } ] @@ -270,18 +355,18 @@ "file": "NODfbab.mp4", "md5": "9b0636f8c0f7614afa4ea5e4c6e57e83", "info_dict": { + "uploader": "ytdl", "title": "test chars: \"'/\\ä<>This is a test video for youtube-dl.For more information, contact phihag@phihag.de ." } - }, { "name": "TED", "url": "http://www.ted.com/talks/dan_dennett_on_our_consciousness.html", "file": "102.mp4", - "md5": "7bc087e71d16f18f9b8ab9fa62a8a031", + "md5": "8cd9dfa41ee000ce658fd48fb5d89a61", "info_dict": { "title": "Dan Dennett: The illusion of consciousness", - "thumbnail": "http://images.ted.com/images/ted/488_389x292.jpg" + "description": "md5:c6fa72e6eedbd938c9caf6b2702f5922" } }, { @@ -290,14 +375,19 @@ "file": "11741.mp4", "md5": "0b49f4844a068f8b33f4b7c88405862b", "info_dict": { - "title": "Absolute Mehrheit vom 17.02.2013 - Die Highlights, Teil 2" + "description": "Wer kann in die Fußstapfen von Wolfgang Kubicki treten und die Mehrheit der Zuschauer hinter sich versammeln? Wird vielleicht sogar die Absolute Mehrheit geknackt und der Jackpot von 200.000 Euro mit nach Hause genommen?", + "title": "Absolute Mehrheit vom 17.02.2013 - Die Highlights, Teil 2" } }, { "name": "Generic", "url": "http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html", "file": "13601338388002.mp4", - "md5": "85b90ccc9d73b4acd9138d3af4c27f89" + "md5": "85b90ccc9d73b4acd9138d3af4c27f89", + "info_dict": { + "uploader": "www.hodiho.fr", + "title": "Régis plante sa Jeep" + } }, { "name": "Spiegel", @@ -325,7 +415,7 @@ "file": "wshh6a7q1ny0G34ZwuIO.mp4", "md5": "9d04de741161603bf7071bbf4e883186", "info_dict": { - "title": "Video: KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick! 
" + "title": "Video: KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!" } }, { @@ -340,11 +430,11 @@ }, { "name": "Tumblr", - "url": "http://birthdayproject2012.tumblr.com/post/17258355236/a-sample-video-from-leeann-if-you-need-an-idea", - "file": "17258355236.mp4", - "md5": "7c6a514d691b034ccf8567999e9e88a3", + "url": "http://resigno.tumblr.com/post/53364321212/e-de-extrema-importancia-que-esse-video-seja", + "file": "53364321212.mp4", + "md5": "0716d3dd51baf68a28b40fdf1251494e", "info_dict": { - "title": "Calling all Pris! - A sample video from LeeAnn. (If you need an idea..." + "title": "Rafael Lemos | Tumblr" } }, { @@ -355,42 +445,59 @@ "file":"30510138.mp3", "md5":"f9136bf103901728f29e419d2c70f55d", "info_dict": { - "title":"D-D-Dance" + "upload_date": "20111213", + "description": "The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com", + "uploader": "The Royal Concept", + "title": "D-D-Dance" } }, { "file":"47127625.mp3", "md5":"09b6758a018470570f8fd423c9453dd8", "info_dict": { - "title":"The Royal Concept - Gimme Twice" + "upload_date": "20120521", + "description": "The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com", + "uploader": "The Royal Concept", + "title": "The Royal Concept - Gimme Twice" } }, { "file":"47127627.mp3", "md5":"154abd4e418cea19c3b901f1e1306d9c", "info_dict": { - "title":"Goldrushed" + "upload_date": "20120521", + "uploader": "The Royal Concept", + "title": "Goldrushed" } }, { "file":"47127629.mp3", "md5":"2f5471edc79ad3f33a683153e96a79c1", "info_dict": { - "title":"In the End" + "upload_date": "20120521", + "description": "The Royal Concept from Stockholm\r\nFilip / Povel / David / Magnus\r\nwww.royalconceptband.com", + "uploader": "The Royal Concept", + "title": "In the End" } }, { "file":"47127631.mp3", "md5":"f9ba87aa940af7213f98949254f1c6e2", "info_dict": { - "title":"Knocked Up" + "upload_date": "20120521", + "description": "The 
Royal Concept from Stockholm\r\nFilip / David / Povel / Magnus\r\nwww.theroyalconceptband.com", + "uploader": "The Royal Concept", + "title": "Knocked Up" } }, { "file":"75206121.mp3", "md5":"f9d1fe9406717e302980c30de4af9353", "info_dict": { - "title":"World On Fire" + "upload_date": "20130116", + "description": "The unreleased track World on Fire premiered on the CW's hit show Arrow (8pm/7pm central). \r\nAs a gift to our fans we would like to offer you a free download of the track! ", + "uploader": "The Royal Concept", + "title": "World On Fire" } } ] @@ -419,8 +526,10 @@ "url": "http://media.photobucket.com/user/rachaneronas/media/TiredofLinkBuildingTryBacklinkMyDomaincom_zpsc0c3b9fa.mp4.html?filters[term]=search&filters[primary]=videos&filters[secondary]=images&sort=1&o=0", "file": "zpsc0c3b9fa.mp4", "md5": "7dabfb92b0a31f6c16cebc0f8e60ff99", - "info_dict":{ - "title":"Tired of Link Building? Try BacklinkMyDomain.com!" + "info_dict": { + "upload_date": "20130504", + "uploader": "rachaneronas", + "title": "Tired of Link Building? Try BacklinkMyDomain.com!" } }, { @@ -434,11 +543,11 @@ }, { "name": "Yahoo", - "url": "http://screen.yahoo.com/obama-celebrates-iraq-victory-27592561.html", - "file": "27592561.flv", - "md5": "c6179bed843512823fd284fa2e7f012d", + "url": "http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html", + "file": "214727115.flv", + "md5": "2e717f169c1be93d84d3794a00d4a325", "info_dict": { - "title": "Obama Celebrates Iraq Victory" + "title": "Julian Smith & Travis Legg Watch Julian Smith" }, "skip": "Requires rtmpdump" }, @@ -482,5 +591,44 @@ "title": "Louis C.K. Interview Pt. 1 11/3/11", "description": "Louis C.K. got starstruck by George W. Bush, so what? Part one." 
} + }, + { + "name": "XHamster", + "url": "http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html", + "file": "1509445.flv", + "md5": "9f48e0e8d58e3076bb236ff412ab62fa", + "info_dict": { + "upload_date": "20121014", + "uploader_id": "Ruseful2011", + "title": "FemaleAgent Shy beauty takes the bait" + } + }, + { + "name": "Hypem", + "url": "http://hypem.com/track/1v6ga/BODYWORK+-+TAME", + "file": "1v6ga.mp3", + "md5": "b9cc91b5af8995e9f0c1cee04c575828", + "info_dict":{ + "title":"Tame" + } + }, + { + "name": "Vbox7", + "url": "http://vbox7.com/play:249bb972c2", + "file": "249bb972c2.flv", + "md5": "9c70d6d956f888bdc08c124acc120cfe", + "info_dict":{ + "title":"Смях! Чудо - чист за секунди - Скрита камера" + } + }, + { + "name": "Gametrailers", + "url": "http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer", + "file": "zbvr8i.flv", + "md5": "c3edbc995ab4081976e16779bd96a878", + "info_dict": { + "title": "E3 2013: Debut Trailer" + }, + "skip": "Requires rtmpdump" } ] diff --git a/youtube-dl b/youtube-dl index e3eb877..ea6d889 100755 Binary files a/youtube-dl and b/youtube-dl differ diff --git a/youtube-dl.1 b/youtube-dl.1 new file mode 100644 index 0000000..28e8311 --- /dev/null +++ b/youtube-dl.1 @@ -0,0 +1,361 @@ +.TH YOUTUBE\-DL 1 "" +.SH NAME +.PP +youtube\-dl +.SH SYNOPSIS +.PP +\f[B]youtube\-dl\f[] OPTIONS (#options) URL [URL...] +.SH DESCRIPTION +.PP +\f[B]youtube\-dl\f[] is a small command\-line program to download videos +from YouTube.com and a few more sites. +It requires the Python interpreter, version 2.6, 2.7, or 3.3+, and it is +not platform specific. +It should work on your Unix box, on Windows or on Mac OS X. +It is released to the public domain, which means you can modify it, +redistribute it or use it however you like. 
+.SH OPTIONS +.IP +.nf +\f[C] +\-h,\ \-\-help\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ print\ this\ help\ text\ and\ exit +\-\-version\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ print\ program\ version\ and\ exit +\-U,\ \-\-update\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ update\ this\ program\ to\ latest\ version +\-i,\ \-\-ignore\-errors\ \ \ \ \ \ \ \ continue\ on\ download\ errors +\-r,\ \-\-rate\-limit\ LIMIT\ \ \ \ \ maximum\ download\ rate\ (e.g.\ 50k\ or\ 44.6m) +\-R,\ \-\-retries\ RETRIES\ \ \ \ \ \ number\ of\ retries\ (default\ is\ 10) +\-\-buffer\-size\ SIZE\ \ \ \ \ \ \ \ \ size\ of\ download\ buffer\ (e.g.\ 1024\ or\ 16k) +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ (default\ is\ 1024) +\-\-no\-resize\-buffer\ \ \ \ \ \ \ \ \ do\ not\ automatically\ adjust\ the\ buffer\ size.\ By +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ default,\ the\ buffer\ size\ is\ automatically\ resized +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ from\ an\ initial\ value\ of\ SIZE. +\-\-dump\-user\-agent\ \ \ \ \ \ \ \ \ \ display\ the\ current\ browser\ identification +\-\-user\-agent\ UA\ \ \ \ \ \ \ \ \ \ \ \ specify\ a\ custom\ user\ agent +\-\-referer\ REF\ \ \ \ \ \ \ \ \ \ \ \ \ \ specify\ a\ custom\ referer,\ use\ if\ the\ video\ access +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ is\ restricted\ to\ one\ domain +\-\-list\-extractors\ \ \ \ \ \ \ \ \ \ List\ all\ supported\ extractors\ and\ the\ URLs\ they +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ would\ handle +\-\-proxy\ URL\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ Use\ the\ specified\ HTTP/HTTPS\ proxy +\-\-no\-check\-certificate\ \ \ \ \ Suppress\ HTTPS\ certificate\ validation. 
+\f[] +.fi +.SS Video Selection: +.IP +.nf +\f[C] +\-\-playlist\-start\ NUMBER\ \ \ \ playlist\ video\ to\ start\ at\ (default\ is\ 1) +\-\-playlist\-end\ NUMBER\ \ \ \ \ \ playlist\ video\ to\ end\ at\ (default\ is\ last) +\-\-match\-title\ REGEX\ \ \ \ \ \ \ \ download\ only\ matching\ titles\ (regex\ or\ caseless +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ sub\-string) +\-\-reject\-title\ REGEX\ \ \ \ \ \ \ skip\ download\ for\ matching\ titles\ (regex\ or +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ caseless\ sub\-string) +\-\-max\-downloads\ NUMBER\ \ \ \ \ Abort\ after\ downloading\ NUMBER\ files +\-\-min\-filesize\ SIZE\ \ \ \ \ \ \ \ Do\ not\ download\ any\ videos\ smaller\ than\ SIZE +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ (e.g.\ 50k\ or\ 44.6m) +\-\-max\-filesize\ SIZE\ \ \ \ \ \ \ \ Do\ not\ download\ any\ videos\ larger\ than\ SIZE\ (e.g. +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ 50k\ or\ 44.6m) +\-\-date\ DATE\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ download\ only\ videos\ uploaded\ in\ this\ date +\-\-datebefore\ DATE\ \ \ \ \ \ \ \ \ \ download\ only\ videos\ uploaded\ before\ this\ date +\-\-dateafter\ DATE\ \ \ \ \ \ \ \ \ \ \ download\ only\ videos\ uploaded\ after\ this\ date +\f[] +.fi +.SS Filesystem Options: +.IP +.nf +\f[C] +\-t,\ \-\-title\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ use\ title\ in\ file\ name\ (default) +\-\-id\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ use\ only\ video\ ID\ in\ file\ name +\-l,\ \-\-literal\ \ \ \ \ \ \ \ \ \ \ \ \ \ [deprecated]\ alias\ of\ \-\-title +\-A,\ \-\-auto\-number\ \ \ \ \ \ \ \ \ \ number\ downloaded\ files\ starting\ from\ 00000 +\-o,\ \-\-output\ TEMPLATE\ \ \ \ \ \ output\ filename\ template.\ Use\ %(title)s\ to\ get +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ the\ title,\ %(uploader)s\ for\ the\ uploader\ name, +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ %(uploader_id)s\ for\ the\ uploader\ nickname\ if +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ 
\ \ \ \ \ \ \ \ different,\ %(autonumber)s\ to\ get\ an\ automatically +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ incremented\ number,\ %(ext)s\ for\ the\ filename +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ extension,\ %(upload_date)s\ for\ the\ upload\ date +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ (YYYYMMDD),\ %(extractor)s\ for\ the\ provider +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ (youtube,\ metacafe,\ etc),\ %(id)s\ for\ the\ video\ id +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ ,\ %(playlist)s\ for\ the\ playlist\ the\ video\ is\ in, +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ %(playlist_index)s\ for\ the\ position\ in\ the +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ playlist\ and\ %%\ for\ a\ literal\ percent.\ Use\ \-\ to +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ output\ to\ stdout.\ Can\ also\ be\ used\ to\ download\ to +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ a\ different\ directory,\ for\ example\ with\ \-o\ \[aq]/my/d +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ ownloads/%(uploader)s/%(title)s\-%(id)s.%(ext)s\[aq]\ . 
+\-\-autonumber\-size\ NUMBER\ \ \ Specifies\ the\ number\ of\ digits\ in\ %(autonumber)s +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ when\ it\ is\ present\ in\ output\ filename\ template\ or +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \-\-autonumber\ option\ is\ given +\-\-restrict\-filenames\ \ \ \ \ \ \ Restrict\ filenames\ to\ only\ ASCII\ characters,\ and +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ avoid\ "&"\ and\ spaces\ in\ filenames +\-a,\ \-\-batch\-file\ FILE\ \ \ \ \ \ file\ containing\ URLs\ to\ download\ (\[aq]\-\[aq]\ for\ stdin) +\-w,\ \-\-no\-overwrites\ \ \ \ \ \ \ \ do\ not\ overwrite\ files +\-c,\ \-\-continue\ \ \ \ \ \ \ \ \ \ \ \ \ resume\ partially\ downloaded\ files +\-\-no\-continue\ \ \ \ \ \ \ \ \ \ \ \ \ \ do\ not\ resume\ partially\ downloaded\ files\ (restart +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ from\ beginning) +\-\-cookies\ FILE\ \ \ \ \ \ \ \ \ \ \ \ \ file\ to\ read\ cookies\ from\ and\ dump\ cookie\ jar\ in +\-\-no\-part\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ do\ not\ use\ .part\ files +\-\-no\-mtime\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ do\ not\ use\ the\ Last\-modified\ header\ to\ set\ the +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ file\ modification\ time +\-\-write\-description\ \ \ \ \ \ \ \ write\ video\ description\ to\ a\ .description\ file +\-\-write\-info\-json\ \ \ \ \ \ \ \ \ \ write\ video\ metadata\ to\ a\ .info.json\ file +\-\-write\-thumbnail\ \ \ \ \ \ \ \ \ \ write\ thumbnail\ image\ to\ disk +\f[] +.fi +.SS Verbosity / Simulation Options: +.IP +.nf +\f[C] +\-q,\ \-\-quiet\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ activates\ quiet\ mode +\-s,\ \-\-simulate\ \ \ \ \ \ \ \ \ \ \ \ \ do\ not\ download\ the\ video\ and\ do\ not\ write +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ anything\ to\ disk +\-\-skip\-download\ \ \ \ \ \ \ \ \ \ \ \ do\ not\ download\ the\ video +\-g,\ \-\-get\-url\ \ \ \ \ \ \ \ \ \ \ \ \ \ simulate,\ quiet\ but\ print\ URL +\-e,\ 
\-\-get\-title\ \ \ \ \ \ \ \ \ \ \ \ simulate,\ quiet\ but\ print\ title +\-\-get\-id\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ simulate,\ quiet\ but\ print\ id +\-\-get\-thumbnail\ \ \ \ \ \ \ \ \ \ \ \ simulate,\ quiet\ but\ print\ thumbnail\ URL +\-\-get\-description\ \ \ \ \ \ \ \ \ \ simulate,\ quiet\ but\ print\ video\ description +\-\-get\-filename\ \ \ \ \ \ \ \ \ \ \ \ \ simulate,\ quiet\ but\ print\ output\ filename +\-\-get\-format\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ simulate,\ quiet\ but\ print\ output\ format +\-\-newline\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ output\ progress\ bar\ as\ new\ lines +\-\-no\-progress\ \ \ \ \ \ \ \ \ \ \ \ \ \ do\ not\ print\ progress\ bar +\-\-console\-title\ \ \ \ \ \ \ \ \ \ \ \ display\ progress\ in\ console\ titlebar +\-v,\ \-\-verbose\ \ \ \ \ \ \ \ \ \ \ \ \ \ print\ various\ debugging\ information +\-\-dump\-intermediate\-pages\ \ print\ downloaded\ pages\ to\ debug\ problems(very +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ verbose) +\f[] +.fi +.SS Video Format Options: +.IP +.nf +\f[C] +\-f,\ \-\-format\ FORMAT\ \ \ \ \ \ \ \ video\ format\ code,\ specifiy\ the\ order\ of +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ preference\ using\ slashes:\ "\-f\ 22/17/18" +\-\-all\-formats\ \ \ \ \ \ \ \ \ \ \ \ \ \ download\ all\ available\ video\ formats +\-\-prefer\-free\-formats\ \ \ \ \ \ prefer\ free\ video\ formats\ unless\ a\ specific\ one +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ is\ requested +\-\-max\-quality\ FORMAT\ \ \ \ \ \ \ highest\ quality\ format\ to\ download +\-F,\ \-\-list\-formats\ \ \ \ \ \ \ \ \ list\ all\ available\ formats\ (currently\ youtube +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ only) +\-\-write\-sub\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ write\ subtitle\ file\ (currently\ youtube\ only) +\-\-only\-sub\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ [deprecated]\ alias\ of\ \-\-skip\-download +\-\-all\-subs\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ downloads\ all\ the\ available\ subtitles\ 
of\ the +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ video\ (currently\ youtube\ only) +\-\-list\-subs\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ lists\ all\ available\ subtitles\ for\ the\ video +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ (currently\ youtube\ only) +\-\-sub\-format\ LANG\ \ \ \ \ \ \ \ \ \ subtitle\ format\ [srt/sbv]\ (default=srt) +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ (currently\ youtube\ only) +\-\-sub\-lang\ LANG\ \ \ \ \ \ \ \ \ \ \ \ language\ of\ the\ subtitles\ to\ download\ (optional) +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ use\ IETF\ language\ tags\ like\ \[aq]en\[aq] +\f[] +.fi +.SS Authentication Options: +.IP +.nf +\f[C] +\-u,\ \-\-username\ USERNAME\ \ \ \ account\ username +\-p,\ \-\-password\ PASSWORD\ \ \ \ account\ password +\-n,\ \-\-netrc\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ use\ .netrc\ authentication\ data +\f[] +.fi +.SS Post\-processing Options: +.IP +.nf +\f[C] +\-x,\ \-\-extract\-audio\ \ \ \ \ \ \ \ convert\ video\ files\ to\ audio\-only\ files\ (requires +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ ffmpeg\ or\ avconv\ and\ ffprobe\ or\ avprobe) +\-\-audio\-format\ FORMAT\ \ \ \ \ \ "best",\ "aac",\ "vorbis",\ "mp3",\ "m4a",\ "opus",\ or +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ "wav";\ best\ by\ default +\-\-audio\-quality\ QUALITY\ \ \ \ ffmpeg/avconv\ audio\ quality\ specification,\ insert +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ a\ value\ between\ 0\ (better)\ and\ 9\ (worse)\ for\ VBR +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ or\ a\ specific\ bitrate\ like\ 128K\ (default\ 5) +\-\-recode\-video\ FORMAT\ \ \ \ \ \ Encode\ the\ video\ to\ another\ format\ if\ necessary +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ (currently\ supported:\ mp4|flv|ogg|webm) +\-k,\ \-\-keep\-video\ \ \ \ \ \ \ \ \ \ \ keeps\ the\ video\ file\ on\ disk\ after\ the\ post\- +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ processing;\ the\ 
video\ is\ erased\ by\ default +\-\-no\-post\-overwrites\ \ \ \ \ \ \ do\ not\ overwrite\ post\-processed\ files;\ the\ post\- +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ processed\ files\ are\ overwritten\ by\ default +\f[] +.fi +.SH CONFIGURATION +.PP +You can configure youtube\-dl by placing default arguments (such as +\f[C]\-\-extract\-audio\ \-\-no\-mtime\f[] to always extract the audio +and not copy the mtime) into \f[C]/etc/youtube\-dl.conf\f[] and/or +\f[C]~/.config/youtube\-dl.conf\f[]. +.SH OUTPUT TEMPLATE +.PP +The \f[C]\-o\f[] option allows users to indicate a template for the +output file names. +The basic usage is not to set any template arguments when downloading a +single file, like in +\f[C]youtube\-dl\ \-o\ funny_video.flv\ "http://some/video"\f[]. +However, it may contain special sequences that will be replaced when +downloading each video. +The special sequences have the format \f[C]%(NAME)s\f[]. +To clarify, that is a percent symbol followed by a name in parenthesis, +followed by a lowercase S. +Allowed names are: +.IP \[bu] 2 +\f[C]id\f[]: The sequence will be replaced by the video identifier. +.IP \[bu] 2 +\f[C]url\f[]: The sequence will be replaced by the video URL. +.IP \[bu] 2 +\f[C]uploader\f[]: The sequence will be replaced by the nickname of the +person who uploaded the video. +.IP \[bu] 2 +\f[C]upload_date\f[]: The sequence will be replaced by the upload date +in YYYYMMDD format. +.IP \[bu] 2 +\f[C]title\f[]: The sequence will be replaced by the video title. +.IP \[bu] 2 +\f[C]ext\f[]: The sequence will be replaced by the appropriate extension +(like flv or mp4). +.IP \[bu] 2 +\f[C]epoch\f[]: The sequence will be replaced by the Unix epoch when +creating the file. +.IP \[bu] 2 +\f[C]autonumber\f[]: The sequence will be replaced by a five\-digit +number that will be increased with each download, starting at zero. +.IP \[bu] 2 +\f[C]playlist\f[]: The name or the id of the playlist that contains the +video. 
+.IP \[bu] 2 +\f[C]playlist_index\f[]: The index of the video in the playlist, a +five\-digit number. +.PP +The current default template is \f[C]%(id)s.%(ext)s\f[], but that will +be switchted to \f[C]%(title)s\-%(id)s.%(ext)s\f[] (which can be +requested with \f[C]\-t\f[] at the moment). +.PP +In some cases, you don\[aq]t want special characters such as 中, spaces, +or &, such as when transferring the downloaded filename to a Windows +system or the filename through an 8bit\-unsafe channel. +In these cases, add the \f[C]\-\-restrict\-filenames\f[] flag to get a +shorter title: +.IP +.nf +\f[C] +$\ youtube\-dl\ \-\-get\-filename\ \-o\ "%(title)s.%(ext)s"\ BaW_jenozKc +youtube\-dl\ test\ video\ \[aq]\[aq]_ä↭𝕐.mp4\ \ \ \ #\ All\ kinds\ of\ weird\ characters +$\ youtube\-dl\ \-\-get\-filename\ \-o\ "%(title)s.%(ext)s"\ BaW_jenozKc\ \-\-restrict\-filenames +youtube\-dl_test_video_.mp4\ \ \ \ \ \ \ \ \ \ #\ A\ simple\ file\ name +\f[] +.fi +.SH VIDEO SELECTION +.PP +Videos can be filtered by their upload date using the options +\f[C]\-\-date\f[], \f[C]\-\-datebefore\f[] or \f[C]\-\-dateafter\f[], +they accept dates in two formats: +.IP \[bu] 2 +Absolute dates: Dates in the format \f[C]YYYYMMDD\f[]. +.IP \[bu] 2 +Relative dates: Dates in the format +\f[C](now|today)[+\-][0\-9](day|week|month|year)(s)?\f[] +.PP +Examples: +.IP +.nf +\f[C] +$\ youtube\-dl\ \-\-dateafter\ now\-6months\ #will\ only\ download\ the\ videos\ uploaded\ in\ the\ last\ 6\ months +$\ youtube\-dl\ \-\-date\ 19700101\ #will\ only\ download\ the\ videos\ uploaded\ in\ January\ 1,\ 1970 +$\ youtube\-dl\ \-\-dateafter\ 20000101\ \-\-datebefore\ 20100101\ #will\ only\ download\ the\ videos\ uploaded\ between\ 2000\ and\ 2010 +\f[] +.fi +.SH FAQ +.SS Can you please put the \-b option back? 
+.PP +Most people asking this question are not aware that youtube\-dl now +defaults to downloading the highest available quality as reported by +YouTube, which will be 1080p or 720p in some cases, so you no longer +need the \-b option. +For some specific videos, maybe YouTube does not report them to be +available in a specific high quality format you\[aq]\[aq]re interested +in. +In that case, simply request it with the \-f option and youtube\-dl will +try to download it. +.SS I get HTTP error 402 when trying to download a video. What\[aq]s +this? +.PP +Apparently YouTube requires you to pass a CAPTCHA test if you download +too much. +We\[aq]\[aq]re considering to provide a way to let you solve the +CAPTCHA (https://github.com/rg3/youtube-dl/issues/154), but at the +moment, your best course of action is pointing a webbrowser to the +youtube URL, solving the CAPTCHA, and restart youtube\-dl. +.SS I have downloaded a video but how can I play it? +.PP +Once the video is fully downloaded, use any video player, such as +vlc (http://www.videolan.org) or mplayer (http://www.mplayerhq.hu/). +.SS The links provided by youtube\-dl \-g are not working anymore +.PP +The URLs youtube\-dl outputs require the downloader to have the correct +cookies. +Use the \f[C]\-\-cookies\f[] option to write the required cookies into a +file, and advise your downloader to read cookies from that file. +Some sites also require a common user agent to be used, use +\f[C]\-\-dump\-user\-agent\f[] to see the one in use by youtube\-dl. +.SS ERROR: no fmt_url_map or conn information found in video info +.PP +youtube has switched to a new video info format in July 2011 which is +not supported by old versions of youtube\-dl. +You can update youtube\-dl with \f[C]sudo\ youtube\-dl\ \-\-update\f[]. +.SS ERROR: unable to download video +.PP +youtube requires an additional signature since September 2012 which is +not supported by old versions of youtube\-dl. 
+You can update youtube\-dl with \f[C]sudo\ youtube\-dl\ \-\-update\f[]. +.SS SyntaxError: Non\-ASCII character +.PP +The error +.IP +.nf +\f[C] +File\ "youtube\-dl",\ line\ 2 +SyntaxError:\ Non\-ASCII\ character\ \[aq]\\x93\[aq]\ ... +\f[] +.fi +.PP +means you\[aq]re using an outdated version of Python. +Please update to Python 2.6 or 2.7. +.SS What is this binary file? Where has the code gone? +.PP +Since June 2012 (#342) youtube\-dl is packed as an executable zipfile, +simply unzip it (might need renaming to \f[C]youtube\-dl.zip\f[] first +on some systems) or clone the git repository, as laid out above. +If you modify the code, you can run it by executing the +\f[C]__main__.py\f[] file. +To recompile the executable, run \f[C]make\ youtube\-dl\f[]. +.SS The exe throws a \f[I]Runtime error from Visual C++\f[] +.PP +To run the exe you need to install first the Microsoft Visual C++ 2008 +Redistributable +Package (http://www.microsoft.com/en-us/download/details.aspx?id=29). +.SH COPYRIGHT +.PP +youtube\-dl is released into the public domain by the copyright holders. +.PP +This README file was originally written by Daniel Bolton +() and is likewise released into the public +domain. +.SH BUGS +.PP +Bugs and suggestions should be reported at: + +.PP +Please include: +.IP \[bu] 2 +Your exact command line, like +\f[C]youtube\-dl\ \-t\ "http://www.youtube.com/watch?v=uHlDtZ6Oc3s&feature=channel_video_title"\f[]. +A common mistake is not to escape the \f[C]&\f[]. +Putting URLs in quotes should solve this problem. +.IP \[bu] 2 +If possible re\-run the command with \f[C]\-\-verbose\f[], and include +the full output, it is really helpful to us. +.IP \[bu] 2 +The output of \f[C]youtube\-dl\ \-\-version\f[] +.IP \[bu] 2 +The output of \f[C]python\ \-\-version\f[] +.IP \[bu] 2 +The name and version of your Operating System ("Ubuntu 11.04 x64" or +"Windows 7 x64" is usually enough). +.PP +For discussions, join us in the irc channel #youtube\-dl on freenode. 
diff --git a/youtube-dl.bash-completion b/youtube-dl.bash-completion new file mode 100644 index 0000000..e4f73e3 --- /dev/null +++ b/youtube-dl.bash-completion @@ -0,0 +1,14 @@ +__youtube-dl() +{ + local cur prev opts + COMPREPLY=() + cur="${COMP_WORDS[COMP_CWORD]}" + opts="--help --version --update --ignore-errors --rate-limit --retries --buffer-size --no-resize-buffer --dump-user-agent --user-agent --referer --list-extractors --proxy --no-check-certificate --test --playlist-start --playlist-end --match-title --reject-title --max-downloads --min-filesize --max-filesize --date --datebefore --dateafter --title --id --literal --auto-number --output --autonumber-size --restrict-filenames --batch-file --no-overwrites --continue --no-continue --cookies --no-part --no-mtime --write-description --write-info-json --write-thumbnail --quiet --simulate --skip-download --get-url --get-title --get-id --get-thumbnail --get-description --get-filename --get-format --newline --no-progress --console-title --verbose --dump-intermediate-pages --format --all-formats --prefer-free-formats --max-quality --list-formats --write-sub --only-sub --all-subs --list-subs --sub-format --sub-lang --username --password --netrc --extract-audio --audio-format --audio-quality --recode-video --keep-video --no-post-overwrites" + + if [[ ${cur} == * ]] ; then + COMPREPLY=( $(compgen -W "${opts}" -- ${cur}) ) + return 0 + fi +} + +complete -F __youtube-dl youtube-dl diff --git a/youtube-dl.exe b/youtube-dl.exe deleted file mode 100644 index 45eee04..0000000 Binary files a/youtube-dl.exe and /dev/null differ diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py index 49f3a87..f4ce480 100644 --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@ -322,6 +322,9 @@ class FileDownloader(object): filetime = timeconvert(timestr) if filetime is None: return filetime + # Ignore obviously invalid dates + if filetime == 0: + return try: os.utime(filename, (time.time(), filetime)) 
except: @@ -539,6 +542,11 @@ class FileDownloader(object): 'playlist': playlist, 'playlist_index': i + playliststart, } + if not 'extractor' in entry: + # We set the extractor, if it's an url it will be set then to + # the new extractor, but if it's already a video we must make + # sure it's present: see issue #877 + entry['extractor'] = ie_result['extractor'] entry_result = self.process_ie_result(entry, download=download, extra_info=extra) @@ -758,21 +766,21 @@ class FileDownloader(object): except (OSError, IOError): self.report_error(u'RTMP download detected but "rtmpdump" could not be run') return False + verbosity_option = '--verbose' if self.params.get('verbose', False) else '--quiet' # Download using rtmpdump. rtmpdump returns exit code 2 when # the connection was interrumpted and resuming appears to be # possible. This is part of rtmpdump's normal usage, AFAIK. - basic_args = ['rtmpdump', '-q', '-r', url, '-o', tmpfilename] - if self.params.get('verbose', False): basic_args[1] = '-v' + basic_args = ['rtmpdump', verbosity_option, '-r', url, '-o', tmpfilename] if player_url is not None: - basic_args += ['-W', player_url] + basic_args += ['--swfVfy', player_url] if page_url is not None: basic_args += ['--pageUrl', page_url] if play_path is not None: - basic_args += ['-y', play_path] + basic_args += ['--playpath', play_path] if tc_url is not None: basic_args += ['--tcUrl', url] - args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)] + args = basic_args + [[], ['--resume', '--skip', '1']][self.params.get('continuedl', False)] if self.params.get('verbose', False): try: import pipes @@ -810,6 +818,37 @@ class FileDownloader(object): self.report_error(u'rtmpdump exited with code %d' % retval) return False + def _download_with_mplayer(self, filename, url): + self.report_destination(filename) + tmpfilename = self.temp_name(filename) + + args = ['mplayer', '-really-quiet', '-vo', 'null', '-vc', 'dummy', '-dumpstream', '-dumpfile', 
tmpfilename, url] + # Check for mplayer first + try: + subprocess.call(['mplayer', '-h'], stdout=(open(os.path.devnull, 'w')), stderr=subprocess.STDOUT) + except (OSError, IOError): + self.report_error(u'MMS or RTSP download detected but "%s" could not be run' % args[0] ) + return False + + # Download using mplayer. + retval = subprocess.call(args) + if retval == 0: + fsize = os.path.getsize(encodeFilename(tmpfilename)) + self.to_screen(u'\r[%s] %s bytes' % (args[0], fsize)) + self.try_rename(tmpfilename, filename) + self._hook_progress({ + 'downloaded_bytes': fsize, + 'total_bytes': fsize, + 'filename': filename, + 'status': 'finished', + }) + return True + else: + self.to_stderr(u"\n") + self.report_error(u'mplayer exited with code %d' % retval) + return False + + def _do_download(self, filename, info_dict): url = info_dict['url'] @@ -830,6 +869,10 @@ class FileDownloader(object): info_dict.get('play_path', None), info_dict.get('tc_url', None)) + # Attempt to download using mplayer + if url.startswith('mms') or url.startswith('rtsp'): + return self._download_with_mplayer(filename, url) + tmpfilename = self.temp_name(filename) stream = None diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index d318b4b..17e0f83 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -191,6 +191,47 @@ class InfoExtractor(object): video_info['title'] = playlist_title return video_info + def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0): + """ + Perform a regex search on the given string, using a single or a list of + patterns returning the first matching group. + In case of failure return a default value or raise a WARNING or a + ExtractorError, depending on fatal, specifying the field name. 
+ """ + if isinstance(pattern, (str, compat_str, compiled_regex_type)): + mobj = re.search(pattern, string, flags) + else: + for p in pattern: + mobj = re.search(p, string, flags) + if mobj: break + + if sys.stderr.isatty() and os.name != 'nt': + _name = u'\033[0;34m%s\033[0m' % name + else: + _name = name + + if mobj: + # return the first matching group + return next(g for g in mobj.groups() if g is not None) + elif default is not None: + return default + elif fatal: + raise ExtractorError(u'Unable to extract %s' % _name) + else: + self._downloader.report_warning(u'unable to extract %s; ' + u'please report this issue on GitHub.' % _name) + return None + + def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0): + """ + Like _search_regex, but strips HTML tags and unescapes entities. + """ + res = self._search_regex(pattern, string, name, default, fatal, flags) + if res: + return clean_html(res).strip() + else: + return res + class SearchInfoExtractor(InfoExtractor): """ Base class for paged search queries extractors. 
@@ -376,6 +417,34 @@ class YoutubeIE(InfoExtractor): return (u'Did not fetch video subtitles', None, None) return (None, sub_lang, sub) + def _request_automatic_caption(self, video_id, webpage): + """We need the webpage for getting the captions url, pass it as an + argument to speed up the process.""" + sub_lang = self._downloader.params.get('subtitleslang') + sub_format = self._downloader.params.get('subtitlesformat') + self.to_screen(u'%s: Looking for automatic captions' % video_id) + mobj = re.search(r';ytplayer.config = ({.*?});', webpage) + err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang + if mobj is None: + return [(err_msg, None, None)] + player_config = json.loads(mobj.group(1)) + try: + args = player_config[u'args'] + caption_url = args[u'ttsurl'] + timestamp = args[u'timestamp'] + params = compat_urllib_parse.urlencode({ + 'lang': 'en', + 'tlang': sub_lang, + 'fmt': sub_format, + 'ts': timestamp, + 'kind': 'asr', + }) + subtitles_url = caption_url + '&' + params + sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions') + return [(None, sub_lang, sub)] + except KeyError: + return [(err_msg, None, None)] + def _extract_subtitle(self, video_id): """ Return a list with a tuple: @@ -623,7 +692,14 @@ class YoutubeIE(InfoExtractor): if video_subtitles: (sub_error, sub_lang, sub) = video_subtitles[0] if sub_error: - self._downloader.report_error(sub_error) + # We try with the automatic captions + video_subtitles = self._request_automatic_caption(video_id, video_webpage) + (sub_error_auto, sub_lang, sub) = video_subtitles[0] + if sub is not None: + pass + else: + # We report the original error + self._downloader.report_error(sub_error) if self._downloader.params.get('allsubtitles', False): video_subtitles = self._extract_all_subtitles(video_id) @@ -864,16 +940,10 @@ class DailymotionIE(InfoExtractor): video_title = unescapeHTML(mobj.group('title')) video_uploader = None - mobj = 
re.search(r'(?im)[^<]+?]+?>([^<]+?)', webpage) - if mobj is None: - # lookin for official user - mobj_official = re.search(r'', webpage) - if mobj_official is None: - self._downloader.report_warning(u'unable to extract uploader nickname') - else: - video_uploader = mobj_official.group(1) - else: - video_uploader = mobj.group(1) + video_uploader = self._search_regex([r'(?im)[^<]+?]+?>([^<]+?)', + # Looking for official user + r'<(?:span|a) .*?rel="author".*?>([^<]+?)([0-9]{2})-([0-9]{2})-([0-9]{4})', webpage) @@ -929,18 +999,13 @@ class PhotobucketIE(InfoExtractor): }] # We try looking in other parts of the webpage - mobj = re.search(r'', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract media URL') - mediaURL = compat_urllib_parse.unquote(mobj.group(1)) - - video_url = mediaURL + video_url = self._search_regex(r'', + webpage, u'video URL') mobj = re.search(r'(.*) video by (.*) - Photobucket', webpage) if mobj is None: raise ExtractorError(u'Unable to extract title') video_title = mobj.group(1).decode('utf-8') - video_uploader = mobj.group(2).decode('utf-8') return [{ @@ -1025,7 +1090,7 @@ class VimeoIE(InfoExtractor): """Information extractor for vimeo.com.""" # _VALID_URL matches Vimeo URLs - _VALID_URL = r'(?Phttps?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?Pplay_redirect_hls\?clip_id=)?(?:videos?/)?(?P[0-9]+)' + _VALID_URL = r'(?Phttps?://)?(?:(?:www|player)\.)?vimeo(?Ppro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?Pplay_redirect_hls\?clip_id=)?(?:videos?/)?(?P[0-9]+)' IE_NAME = u'vimeo' def _real_extract(self, url, new_video=True): @@ -1037,7 +1102,7 @@ class VimeoIE(InfoExtractor): video_id = mobj.group('id') if not mobj.group('proto'): url = 'https://' + url - if mobj.group('direct_link'): + if mobj.group('direct_link') or mobj.group('pro'): url = 'https://vimeo.com/' + video_id # Retrieve video webpage to extract further information @@ -1064,7 +1129,7 @@ class VimeoIE(InfoExtractor): # Extract uploader and 
uploader_id video_uploader = config["video"]["owner"]["name"] - video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] + video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None # Extract video thumbnail video_thumbnail = config["video"]["thumbnail"] @@ -1338,6 +1403,9 @@ class GenericIE(InfoExtractor): if mobj is None: # Broaden the search a little bit: JWPlayer JS loader mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage) + if mobj is None: + # Try to find twitter cards info + mobj = re.search(r'(.*)', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = mobj.group(1) + video_title = self._html_search_regex(r'(.*)', + webpage, u'video title') # video uploader is domain name - mobj = re.match(r'(?:https?://)?([^/]*)/.*', url) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_uploader = mobj.group(1) + video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*', + url, u'video uploader') return [{ 'id': video_id, @@ -1389,7 +1453,6 @@ class YoutubeSearchIE(SearchInfoExtractor): def report_download_page(self, query, pagenum): """Report attempt to download search page with given number.""" - query = query.decode(preferredencoding()) self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum)) def _get_n_results(self, query, n): @@ -1507,7 +1570,7 @@ class YoutubePlaylistIE(InfoExtractor): | ((?:PL|EC|UU)[0-9A-Za-z-_]{10,}) )""" - _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json' + _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none' _MAX_RESULTS = 50 IE_NAME = u'youtube:playlist' @@ -1768,10 +1831,7 @@ class DepositFilesIE(InfoExtractor): file_extension = os.path.splitext(file_url)[1][1:] # Search for file title - mobj = re.search(r'', webpage) - if 
mobj is None: - raise ExtractorError(u'Unable to extract title') - file_title = mobj.group(1).decode('utf-8') + file_title = self._search_regex(r'', webpage, u'title') return [{ 'id': file_id.decode('utf-8'), @@ -1865,10 +1925,8 @@ class FacebookIE(InfoExtractor): video_duration = int(video_data['video_duration']) thumbnail = video_data['thumbnail_src'] - m = re.search('

([^<]+)

', webpage) - if not m: - raise ExtractorError(u'Cannot find title in webpage') - video_title = unescapeHTML(m.group(1)) + video_title = self._html_search_regex('

([^<]+)

', + webpage, u'title') info = { 'id': video_id, @@ -1884,7 +1942,7 @@ class FacebookIE(InfoExtractor): class BlipTVIE(InfoExtractor): """Information extractor for blip.tv""" - _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$' + _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$' _URL_EXT = r'^.*\.([a-z0-9]+)$' IE_NAME = u'blip.tv' @@ -1897,6 +1955,10 @@ class BlipTVIE(InfoExtractor): if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) + # See https://github.com/rg3/youtube-dl/issues/857 + api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P[\d\w]+)', url) + if api_mobj is not None: + url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id') urlp = compat_urllib_parse_urlparse(url) if urlp.path.startswith('/play/'): request = compat_urllib_request.Request(url) @@ -2026,15 +2088,10 @@ class MyVideoIE(InfoExtractor): self.report_extraction(video_id) video_url = mobj.group(1) + '.flv' - mobj = re.search('([^<]+)', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = mobj.group(1) + video_title = self._html_search_regex('([^<]+)', + webpage, u'title') - mobj = re.search('[.](.+?)$', video_url) - if mobj is None: - raise ExtractorError(u'Unable to extract extention') - video_ext = mobj.group(1) + video_ext = self._search_regex('[.](.+?)$', video_url, u'extension') return [{ 'id': video_id, @@ -2082,25 +2139,23 @@ class MyVideoIE(InfoExtractor): # extracting infos self.report_extraction(video_id) + video_url = None mobj = re.search('connectionurl=\'(.*?)\'', dec_data) - if mobj is None: - raise ExtractorError(u'unable to extract rtmpurl') - video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1)) - if 'myvideo2flash' in video_rtmpurl: - self._downloader.report_warning(u'forcing RTMPT ...') - video_rtmpurl = video_rtmpurl.replace('rtmpe://', 'rtmpt://') - - # extract non rtmp videos - if (video_rtmpurl is None) or (video_rtmpurl == ''): + if mobj: + video_url = 
compat_urllib_parse.unquote(mobj.group(1)) + if 'myvideo2flash' in video_url: + self._downloader.report_warning(u'forcing RTMPT ...') + video_url = video_url.replace('rtmpe://', 'rtmpt://') + + if not video_url: + # extract non rtmp videos mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data) if mobj is None: raise ExtractorError(u'unable to extract url') - video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2)) + video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2)) - mobj = re.search('source=\'(.*?)\'', dec_data) - if mobj is None: - raise ExtractorError(u'unable to extract swfobj') - video_file = compat_urllib_parse.unquote(mobj.group(1)) + video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file') + video_file = compat_urllib_parse.unquote(video_file) if not video_file.endswith('f4m'): ppath, prefix = video_file.split('.') @@ -2112,20 +2167,16 @@ class MyVideoIE(InfoExtractor): video_filepath + video_file ).replace('.f4m', '.m3u8') - mobj = re.search('swfobject.embedSWF\(\'(.+?)\'', webpage) - if mobj is None: - raise ExtractorError(u'unable to extract swfobj') - video_swfobj = compat_urllib_parse.unquote(mobj.group(1)) + video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj') + video_swfobj = compat_urllib_parse.unquote(video_swfobj) - mobj = re.search("(.*?)", webpage) - if mobj is None: - raise ExtractorError(u'unable to extract title') - video_title = mobj.group(1) + video_title = self._html_search_regex("(.*?)", + webpage, u'title') return [{ 'id': video_id, - 'url': video_rtmpurl, - 'tc_url': video_rtmpurl, + 'url': video_url, + 'tc_url': video_url, 'uploader': None, 'upload_date': None, 'title': video_title, @@ -2136,6 +2187,7 @@ class MyVideoIE(InfoExtractor): 'player_url': video_swfobj, }] + class ComedyCentralIE(InfoExtractor): """Information extractor for The Daily Show and Colbert Report """ @@ 
-2317,19 +2369,25 @@ class EscapistIE(InfoExtractor): showName = mobj.group('showname') videoId = mobj.group('episode') - self.report_extraction(showName) - webPage = self._download_webpage(url, showName) + self.report_extraction(videoId) + webpage = self._download_webpage(url, videoId) + + videoDesc = self._html_search_regex('(.*?)\s+-\s+XVID', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video title') - video_title = mobj.group(1) - + video_title = self._html_search_regex(r'(.*?)\s+-\s+XVID', + webpage, u'title') # Extract video thumbnail - mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video thumbnail') - video_thumbnail = mobj.group(0) + video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', + webpage, u'thumbnail', fatal=False) info = { 'id': video_id, @@ -2613,16 +2662,12 @@ class InfoQIE(InfoExtractor): video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id # Extract title - mobj = re.search(r'contentTitle = "(.*?)";', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video title') - video_title = mobj.group(1) + video_title = self._search_regex(r'contentTitle = "(.*?)";', + webpage, u'title') # Extract description - video_description = u'No description available.' 
- mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage) - if mobj is not None: - video_description = mobj.group(1) + video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>', + webpage, u'description', fatal=False) video_filename = video_url.split('/')[-1] video_id, extension = video_filename.split('.') @@ -2793,15 +2838,10 @@ class StanfordOpenClassroomIE(InfoExtractor): note='Downloading course info page', errnote='Unable to download course info page') - m = re.search('<h1>([^<]+)</h1>', coursepage) - if m: - info['title'] = unescapeHTML(m.group(1)) - else: - info['title'] = info['id'] + info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id']) - m = re.search('<description>([^<]+)</description>', coursepage) - if m: - info['description'] = unescapeHTML(m.group(1)) + info['description'] = self._html_search_regex('<description>([^<]+)</description>', + coursepage, u'description', fatal=False) links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage)) info['list'] = [ @@ -2862,25 +2902,17 @@ class MTVIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract song name') - song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1')) - mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract performer') - performer = unescapeHTML(mobj.group(1).decode('iso-8859-1')) - video_title = performer + ' - ' + song_name + song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>', + webpage, u'song name', fatal=False) - mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage) - if mobj is None: - raise ExtractorError(u'Unable to mtvn_uri') - mtvn_uri = mobj.group(1) + video_title = 
self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>', + webpage, u'title') - mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract content id') - content_id = mobj.group(1) + mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>', + webpage, u'mtvn_uri', fatal=False) + + content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', + webpage, u'content id', fatal=False) videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri self.report_extraction(video_id) @@ -3028,20 +3060,15 @@ class XNXXIE(InfoExtractor): # Get webpage content webpage = self._download_webpage(url, video_id) - result = re.search(self.VIDEO_URL_RE, webpage) - if result is None: - raise ExtractorError(u'Unable to extract video url') - video_url = compat_urllib_parse.unquote(result.group(1)) + video_url = self._search_regex(self.VIDEO_URL_RE, + webpage, u'video URL') + video_url = compat_urllib_parse.unquote(video_url) - result = re.search(self.VIDEO_TITLE_RE, webpage) - if result is None: - raise ExtractorError(u'Unable to extract video title') - video_title = result.group(1) + video_title = self._html_search_regex(self.VIDEO_TITLE_RE, + webpage, u'title') - result = re.search(self.VIDEO_THUMB_RE, webpage) - if result is None: - raise ExtractorError(u'Unable to extract video thumbnail') - video_thumbnail = result.group(1) + video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE, + webpage, u'thumbnail', fatal=False) return [{ 'id': video_id, @@ -3061,26 +3088,6 @@ class GooglePlusIE(InfoExtractor): _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)' IE_NAME = u'plus.google' - def report_extract_entry(self, url): - """Report downloading extry""" - self.to_screen(u'Downloading entry: %s' % url) - - def report_date(self, upload_date): - 
"""Report downloading extry""" - self.to_screen(u'Entry date: %s' % upload_date) - - def report_uploader(self, uploader): - """Report downloading extry""" - self.to_screen(u'Uploader: %s' % uploader) - - def report_title(self, video_title): - """Report downloading extry""" - self.to_screen(u'Title: %s' % video_title) - - def report_extract_vid_page(self, video_page): - """Report information extraction.""" - self.to_screen(u'Extracting video page: %s' % video_page) - def _real_extract(self, url): # Extract id from URL mobj = re.match(self._VALID_URL, url) @@ -3093,47 +3100,31 @@ class GooglePlusIE(InfoExtractor): video_extension = 'flv' # Step 1, Retrieve post webpage to extract further information - self.report_extract_entry(post_url) webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage') + self.report_extraction(video_id) + # Extract update date - upload_date = None - pattern = 'title="Timestamp">(.*?)</a>' - mobj = re.search(pattern, webpage) - if mobj: - upload_date = mobj.group(1) + upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>', + webpage, u'upload date', fatal=False) + if upload_date: # Convert timestring to a format suitable for filename upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d") upload_date = upload_date.strftime('%Y%m%d') - self.report_date(upload_date) # Extract uploader - uploader = None - pattern = r'rel\="author".*?>(.*?)</a>' - mobj = re.search(pattern, webpage) - if mobj: - uploader = mobj.group(1) - self.report_uploader(uploader) + uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>', + webpage, u'uploader', fatal=False) # Extract title # Get the first line for title - video_title = u'NA' - pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]' - mobj = re.search(pattern, webpage) - if mobj: - video_title = mobj.group(1) - self.report_title(video_title) + video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]', + webpage, 
'title', default=u'NA') # Step 2, Stimulate clicking the image box to launch video - pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]' - mobj = re.search(pattern, webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video page URL') - - video_page = mobj.group(1) + video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]', + webpage, u'video page URL') webpage = self._download_webpage(video_page, video_id, u'Downloading video page') - self.report_extract_vid_page(video_page) - # Extract video links on video page """Extract video links of all sizes""" @@ -3166,7 +3157,7 @@ class GooglePlusIE(InfoExtractor): }] class NBAIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$' + _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$' IE_NAME = u'nba' def _real_extract(self, url): @@ -3175,28 +3166,27 @@ class NBAIE(InfoExtractor): raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group(1) - if video_id.endswith('/index.html'): - video_id = video_id[:-len('/index.html')] webpage = self._download_webpage(url, video_id) video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4' - def _findProp(rexp, default=None): - m = re.search(rexp, webpage) - if m: - return unescapeHTML(m.group(1)) - else: - return default shortened_video_id = video_id.rpartition('/')[2] - title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '') + title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"', + webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '') + + # It isn't there in the HTML it returns to us + # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False) + + description = self._html_search_regex(r'<meta 
name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False) + info = { 'id': shortened_video_id, 'url': video_url, 'ext': 'mp4', 'title': title, - 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'), - 'description': _findProp(r'<div class="description">(.*?)</h1>'), + # 'uploader_date': uploader_date, + 'description': description, } return [info] @@ -3344,30 +3334,21 @@ class FunnyOrDieIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL) - if not m: - raise ExtractorError(u'Unable to find video information') - video_url = unescapeHTML(m.group('url')) + video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', + webpage, u'video URL', flags=re.DOTALL) - m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL) - if not m: - m = re.search(r'<title>(?P<title>[^<]+?)', webpage) - if not m: - raise ExtractorError(u'Cannot find video title') - title = clean_html(m.group('title')) + title = self._html_search_regex((r"

(?P.*?)</h1>", + r'<title>(?P<title>[^<]+?)'), webpage, 'title', flags=re.DOTALL) - m = re.search(r'\d+)/? (?P\d*)(?P\??) #For urltype == video we sometimes get the videoID """ + _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/' + _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' @classmethod def suitable(cls, url): @@ -3387,11 +3370,19 @@ class SteamIE(InfoExtractor): def _real_extract(self, url): m = re.match(self._VALID_URL, url, re.VERBOSE) gameID = m.group('gameID') - videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID - self.report_age_confirmation() + + videourl = self._VIDEO_PAGE_TEMPLATE % gameID webpage = self._download_webpage(videourl, gameID) - game_title = re.search(r'', webpage).group('game_title') - + + if re.search('

Please enter your birth date to continue:

', webpage) is not None: + videourl = self._AGECHECK_TEMPLATE % gameID + self.report_age_confirmation() + webpage = self._download_webpage(videourl, gameID) + + self.report_extraction(gameID) + game_title = self._html_search_regex(r'', + webpage, 'game title') + urlRE = r"'movie_(?P\d+)': \{\s*FILENAME: \"(?P[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P[\w:/\.\?=\+-]+)\")?\s*\}," mweb = re.finditer(urlRE, webpage) namesRE = r'(?P.+?)' @@ -3423,27 +3414,29 @@ class UstreamIE(InfoExtractor): def _real_extract(self, url): m = re.match(self._VALID_URL, url) video_id = m.group('videoID') + video_url = u'http://tcdn.ustream.tv/video/%s' % video_id webpage = self._download_webpage(url, video_id) + self.report_extraction(video_id) - try: - m = re.search(r'data-title="(?P.+)"',webpage) - title = m.group('title') - m = re.search(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>', - webpage, re.DOTALL) - uploader = unescapeHTML(m.group('uploader').strip()) - m = re.search(r'<link rel="image_src" href="(?P<thumb>.*?)"', webpage) - thumb = m.group('thumb') - except AttributeError: - raise ExtractorError(u'Unable to extract info') + + video_title = self._html_search_regex(r'data-title="(?P<title>.+)"', + webpage, u'title') + + uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>', + webpage, u'uploader', fatal=False, flags=re.DOTALL) + + thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"', + webpage, u'thumbnail', fatal=False) + info = { - 'id':video_id, - 'url':video_url, + 'id': video_id, + 'url': video_url, 'ext': 'flv', - 'title': title, + 'title': video_title, 'uploader': uploader, - 'thumbnail': thumb, - } + 'thumbnail': thumbnail, + } return info class WorldStarHipHopIE(InfoExtractor): @@ -3451,45 +3444,36 @@ class WorldStarHipHopIE(InfoExtractor): IE_NAME = u'WorldStarHipHop' def _real_extract(self, url): - _src_url = r'so\.addVariable\("file","(.*?)"\)' - m = re.match(self._VALID_URL, url) video_id = 
m.group('id') - webpage_src = self._download_webpage(url, video_id) + webpage_src = self._download_webpage(url, video_id) - mobj = re.search(_src_url, webpage_src) + video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)', + webpage_src, u'video URL') - if mobj is not None: - video_url = mobj.group(1) - if 'mp4' in video_url: - ext = 'mp4' - else: - ext = 'flv' + if 'mp4' in video_url: + ext = 'mp4' else: - raise ExtractorError(u'Cannot find video url for %s' % video_id) + ext = 'flv' - mobj = re.search(r"<title>(.*)", webpage_src) + video_title = self._html_search_regex(r"(.*)", + webpage_src, u'title') - if mobj is None: - raise ExtractorError(u'Cannot determine title') - title = mobj.group(1) - - mobj = re.search(r'rel="image_src" href="(.*)" />', webpage_src) # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video. - if mobj is not None: - thumbnail = mobj.group(1) - else: + thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />', + webpage_src, u'thumbnail', fatal=False) + + if not thumbnail: _title = r"""candytitles.*>(.*)""" mobj = re.search(_title, webpage_src) if mobj is not None: - title = mobj.group(1) - thumbnail = None + video_title = mobj.group(1) results = [{ 'id': video_id, 'url' : video_url, - 'title' : title, + 'title' : video_title, 'thumbnail' : thumbnail, 'ext' : ext, }] @@ -3503,10 +3487,9 @@ class RBMARadioIE(InfoExtractor): video_id = m.group('videoID') webpage = self._download_webpage(url, video_id) - m = re.search(r'', webpage) - if not m: - raise ExtractorError(u'Cannot find metadata') - json_data = m.group(1) + + json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$', + webpage, u'json data', flags=re.MULTILINE) try: data = json.loads(json_data) @@ -3553,42 +3536,33 @@ class YouPornIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) - video_id = mobj.group('videoid') req = compat_urllib_request.Request(url) 
req.add_header('Cookie', 'age_verified=1') webpage = self._download_webpage(req, video_id) - # Get the video title - result = re.search(r'(?P.*)</h1>', webpage) - if result is None: - raise ExtractorError(u'Unable to extract video title') - video_title = result.group('title').strip() - - # Get the video date - result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage) - if result is None: - self._downloader.report_warning(u'unable to extract video date') - upload_date = None - else: - upload_date = unified_strdate(result.group('date').strip()) + # Get JSON parameters + json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters') + try: + params = json.loads(json_params) + except: + raise ExtractorError(u'Invalid JSON') - # Get the video uploader - result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage) - if result is None: - self._downloader.report_warning(u'unable to extract uploader') - video_uploader = None - else: - video_uploader = result.group('uploader').strip() - video_uploader = clean_html( video_uploader ) + self.report_extraction(video_id) + try: + video_title = params['title'] + upload_date = unified_strdate(params['release_date_f']) + video_description = params['description'] + video_uploader = params['submitted_by'] + thumbnail = params['thumbnails'][0]['image'] + except KeyError: + raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1]) # Get all of the formats available DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>' - result = re.search(DOWNLOAD_LIST_RE, webpage) - if result is None: - raise ExtractorError(u'Unable to extract download list') - download_list_html = result.group('download_list').strip() + download_list_html = self._search_regex(DOWNLOAD_LIST_RE, + webpage, u'download list').strip() # Get all of the links from the page LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">' @@ -3612,19 +3586,18 @@ class YouPornIE(InfoExtractor): size = 
format[0] bitrate = format[1] format = "-".join( format ) - title = u'%s-%s-%s' % (video_title, size, bitrate) + # title = u'%s-%s-%s' % (video_title, size, bitrate) formats.append({ 'id': video_id, 'url': video_url, 'uploader': video_uploader, 'upload_date': upload_date, - 'title': title, + 'title': video_title, 'ext': extension, 'format': format, - 'thumbnail': None, - 'description': None, - 'player_url': None + 'thumbnail': thumbnail, + 'description': video_description }) if self._downloader.params.get('listformats', None): @@ -3665,17 +3638,13 @@ class PornotubeIE(InfoExtractor): # Get the video URL VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",' - result = re.search(VIDEO_URL_RE, webpage) - if result is None: - raise ExtractorError(u'Unable to extract video url') - video_url = compat_urllib_parse.unquote(result.group('url')) + video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url') + video_url = compat_urllib_parse.unquote(video_url) #Get the uploaded date VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by' - result = re.search(VIDEO_UPLOADED_RE, webpage) - if result is None: - raise ExtractorError(u'Unable to extract video title') - upload_date = unified_strdate(result.group('date')) + upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False) + if upload_date: upload_date = unified_strdate(upload_date) info = {'id': video_id, 'url': video_url, @@ -3702,10 +3671,8 @@ class YouJizzIE(InfoExtractor): webpage = self._download_webpage(url, video_id) # Get the video title - result = re.search(r'<title>(?P<title>.*)', webpage) - if result is None: - raise ExtractorError(u'ERROR: unable to extract video title') - video_title = result.group('title').strip() + video_title = self._html_search_regex(r'(?P<title>.*)', + webpage, u'title').strip() # Get the embed page result = re.search(r'https?://www.youjizz.com/videos/embed/(?P[0-9]+)', webpage) @@ -3718,10 +3685,8 @@ 
class YouJizzIE(InfoExtractor): webpage = self._download_webpage(embed_page_url, video_id) # Get the video URL - result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P[^"]+)"\)\);', webpage) - if result is None: - raise ExtractorError(u'ERROR: unable to extract video url') - video_url = result.group('source') + video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P[^"]+)"\)\);', + webpage, u'video URL') info = {'id': video_id, 'url': video_url, @@ -3744,10 +3709,7 @@ class EightTracksIE(InfoExtractor): webpage = self._download_webpage(url, playlist_id) - m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL) - if not m: - raise ExtractorError(u'Cannot find trax information') - json_like = m.group(1) + json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL) data = json.loads(json_like) session = str(random.randint(0, 1000000000)) @@ -3783,18 +3745,22 @@ class KeekIE(InfoExtractor): def _real_extract(self, url): m = re.match(self._VALID_URL, url) video_id = m.group('videoID') + video_url = u'http://cdn.keek.com/keek/video/%s' % video_id thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id webpage = self._download_webpage(url, video_id) - m = re.search(r'[\S\s]+?

(?P.+?)

', webpage) - uploader = clean_html(m.group('uploader')) + + video_title = self._html_search_regex(r'[\S\s]+?

(?P.+?)

', + webpage, u'uploader', fatal=False) + info = { 'id': video_id, 'url': video_url, 'ext': 'mp4', - 'title': title, + 'title': video_title, 'thumbnail': thumbnail, 'uploader': uploader } @@ -3826,10 +3792,6 @@ class TEDIE(InfoExtractor): self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name)) return [self._playlist_videos_info(url,name,playlist_id)] - def _talk_video_link(self,mediaSlug): - '''Returns the video link for that mediaSlug''' - return 'http://download.ted.com/talks/%s.mp4' % mediaSlug - def _playlist_videos_info(self,url,name,playlist_id=0): '''Returns the videos of the playlist''' video_RE=r''' @@ -3842,9 +3804,8 @@ class TEDIE(InfoExtractor): m_videos=re.finditer(video_RE,webpage,re.VERBOSE) m_names=re.finditer(video_name_RE,webpage) - playlist_RE = r'div class="headline">(\s*?)

(\s*?)(?P.*?)' - m_playlist = re.search(playlist_RE, webpage) - playlist_title = m_playlist.group('playlist_title') + playlist_title = self._html_search_regex(r'div class="headline">\s*?

\s*?(.*?)', + webpage, 'playlist title') playlist_entries = [] for m_video, m_name in zip(m_videos,m_names): @@ -3855,27 +3816,28 @@ class TEDIE(InfoExtractor): def _talk_info(self, url, video_id=0): """Return the video for the talk in the url""" - m=re.match(self._VALID_URL, url,re.VERBOSE) - videoName=m.group('name') - webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName) + m = re.match(self._VALID_URL, url,re.VERBOSE) + video_name = m.group('name') + webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name) + self.report_extraction(video_name) # If the url includes the language we get the title translated - title_RE=r'(?P.*)</span>' - title=re.search(title_RE, webpage).group('title') - info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?) - "id":(?P<videoID>[\d]+).*? - "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"''' - thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"' - thumb_match=re.search(thumb_RE,webpage) - info_match=re.search(info_RE,webpage,re.VERBOSE) - video_id=info_match.group('videoID') - mediaSlug=info_match.group('mediaSlug') - video_url=self._talk_video_link(mediaSlug) + title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>', + webpage, 'title') + json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>', + webpage, 'json data') + info = json.loads(json_data) + desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>', + webpage, 'description', flags = re.DOTALL) + + thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"', + webpage, 'thumbnail') info = { - 'id': video_id, - 'url': video_url, + 'id': info['id'], + 'url': info['htmlStreams'][-1]['file'], 'ext': 'mp4', 'title': title, - 'thumbnail': thumb_match.group('thumbnail') + 'thumbnail': thumbnail, + 'description': desc, } return info @@ -3941,10 +3903,9 @@ class SpiegelIE(InfoExtractor): video_id = m.group('videoID') webpage 
= self._download_webpage(url, video_id) - m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage) - if not m: - raise ExtractorError(u'Cannot find title') - video_title = unescapeHTML(m.group(1)) + + video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>', + webpage, u'title') xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml' xml_code = self._download_webpage(xml_url, video_id, @@ -3980,35 +3941,25 @@ class LiveLeakIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - m = re.search(r'file: "(.*?)",', webpage) - if not m: - raise ExtractorError(u'Unable to find video url') - video_url = m.group(1) + video_url = self._search_regex(r'file: "(.*?)",', + webpage, u'video URL') - m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage) - if not m: - raise ExtractorError(u'Cannot find video title') - title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip() + video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"', + webpage, u'title').replace('LiveLeak.com -', '').strip() - m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage) - if m: - desc = unescapeHTML(m.group('desc')) - else: - desc = None + video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"', + webpage, u'description', fatal=False) - m = re.search(r'By:.*?(\w+)</a>', webpage) - if m: - uploader = clean_html(m.group(1)) - else: - uploader = None + video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>', + webpage, u'uploader', fatal=False) info = { 'id': video_id, 'url': video_url, 'ext': 'mp4', - 'title': title, - 'description': desc, - 'uploader': uploader + 'title': video_title, + 'description': video_description, + 'uploader': video_uploader } return [info] @@ -4052,6 +4003,64 @@ class ARDIE(InfoExtractor): info["url"] = stream["video_url"] return [info] +class ZDFIE(InfoExtractor): + 
_VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?' + _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>' + _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>' + _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"' + _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError(u'Invalid URL: %s' % url) + video_id = mobj.group('video_id') + + html = self._download_webpage(url, video_id) + streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)] + if streams is None: + raise ExtractorError(u'No media url found.') + + # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url + # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url + # choose first/default media type and highest quality for now + for s in streams: #find 300 - dsl1000mbit + if s['quality'] == '300' and s['media_type'] == 'wstreaming': + stream_=s + break + for s in streams: #find veryhigh - dsl2000mbit + if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working + stream_=s + break + if stream_ is None: + raise ExtractorError(u'No stream found.') + + media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL') + + self.report_extraction(video_id) + mobj = re.search(self._TITLE, html) + if mobj is None: + raise ExtractorError(u'Cannot extract title') + title = unescapeHTML(mobj.group('title')) + + mobj = re.search(self._MMS_STREAM, media_link) + if mobj is None: + mobj = re.search(self._RTSP_STREAM, media_link) + if mobj is None: + raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL') + mms_url = mobj.group('video_url') + + mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url) + if mobj is None: + raise ExtractorError(u'Cannot extract extention') + ext = 
mobj.group('ext') + + return [{'id': video_id, + 'url': mms_url, + 'title': title, + 'ext': ext + }] + class TumblrIE(InfoExtractor): _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)' @@ -4066,23 +4075,23 @@ class TumblrIE(InfoExtractor): re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id) video = re.search(re_video, webpage) if video is None: - self.to_screen("No video found") - return [] + raise ExtractorError(u'Unable to extract video') video_url = video.group('video_url') ext = video.group('ext') - re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22' # We pick the first poster - thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '') + video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22', + webpage, u'thumbnail', fatal=False) # We pick the first poster + if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '') # The only place where you can get a title, it's not complete, # but searching in other places doesn't work for all videos - re_title = r'<title>(?P<title>.*?)' - title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title')) + video_title = self._html_search_regex(r'(?P<title>.*?)', + webpage, u'title', flags=re.DOTALL) return [{'id': video_id, 'url': video_url, - 'title': title, - 'thumbnail': thumb, + 'title': video_title, + 'thumbnail': video_thumbnail, 'ext': ext }] @@ -4096,7 +4105,7 @@ class BandcampIE(InfoExtractor): # We get the link to the free download page m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage) if m_download is None: - raise ExtractorError(u'No free songs founded') + raise ExtractorError(u'No free songs found') download_link = m_download.group(1) id = re.search(r'var TralbumData = {(.*?)id: (?P\d*?)$', @@ -4124,10 +4133,10 @@ class BandcampIE(InfoExtractor): track_info = {'id':id, 'title' : info[u'title'], - 'ext' : 'mp3', - 
'url' : final_url, + 'ext' : 'mp3', + 'url' : final_url, 'thumbnail' : info[u'thumb_url'], - 'uploader' : info[u'artist'] + 'uploader' : info[u'artist'] } return [track_info] @@ -4144,17 +4153,14 @@ class RedTubeIE(InfoExtractor): video_id = mobj.group('id') video_extension = 'mp4' webpage = self._download_webpage(url, video_id) + self.report_extraction(video_id) - mobj = re.search(r'',webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract media URL') + video_url = self._html_search_regex(r'', + webpage, u'video URL') - video_url = mobj.group(1) - mobj = re.search('

(.+)

',webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = mobj.group(1) + video_title = self._html_search_regex('

(.+?)

', + webpage, u'title') return [{ 'id': video_id, @@ -4175,15 +4181,13 @@ class InaIE(InfoExtractor): video_extension = 'mp4' webpage = self._download_webpage(mrss_url, video_id) - mobj = re.search(r'.*?)]]>', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = mobj.group(1) + video_url = self._html_search_regex(r'.*?)]]>', + webpage, u'title') return [{ 'id': video_id, @@ -4205,27 +4209,17 @@ class HowcastIE(InfoExtractor): self.report_extraction(video_id) - mobj = re.search(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)"', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video URL') - video_url = mobj.group(1) + video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)', + webpage, u'video URL') - mobj = re.search(r'\w+)' def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') @@ -4250,25 +4243,17 @@ class VineIE(InfoExtractor): self.report_extraction(video_id) - mobj = re.search(r'.*?

(.+?)

', webpage, re.DOTALL) - if mobj is None: - raise ExtractorError(u'Unable to extract uploader') - uploader = mobj.group(1) + uploader = self._html_search_regex(r'
.*?

(.+?)

', + webpage, u'uploader', fatal=False, flags=re.DOTALL) return [{ 'id': video_id, @@ -4291,18 +4276,13 @@ class FlickrIE(InfoExtractor): webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id webpage = self._download_webpage(webpage_url, video_id) - mobj = re.search(r"photo_secret: '(\w+)'", webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video secret') - secret = mobj.group(1) + secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret') first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self' first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage') - mobj = re.search(r'(\d+-\d+)', first_xml) - if mobj is None: - raise ExtractorError(u'Unable to extract node_id') - node_id = mobj.group(1) + node_id = self._html_search_regex(r'(\d+-\d+)', + first_xml, u'node_id') second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1' second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage') @@ -4314,22 +4294,14 @@ class FlickrIE(InfoExtractor): raise ExtractorError(u'Unable to extract video url') video_url = mobj.group(1) + unescapeHTML(mobj.group(2)) - mobj = re.search(r'(.*?)', data) - if mobj is None: - raise ExtractorError(u'Unable to extract video url') - video_url = mobj.group(1) + + video_url = self._html_search_regex(r'(.*?)', + data, u'video URL') return [{ 'id': video_id, @@ -4384,9 +4349,198 @@ class TeamcocoIE(InfoExtractor): 'ext': 'mp4', 'title': video_title, 'thumbnail': thumbnail, - 'description': description, + 'description': video_description, + }] + +class XHamsterIE(InfoExtractor): + """Information Extractor for xHamster""" + _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P[0-9]+)/.*\.html' + + def 
_real_extract(self,url): + mobj = re.match(self._VALID_URL, url) + + video_id = mobj.group('id') + mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id + webpage = self._download_webpage(mrss_url, video_id) + + mobj = re.search(r'\'srv\': \'(?P[^\']*)\',\s*\'file\': \'(?P[^\']+)\',', webpage) + if mobj is None: + raise ExtractorError(u'Unable to extract media URL') + if len(mobj.group('server')) == 0: + video_url = compat_urllib_parse.unquote(mobj.group('file')) + else: + video_url = mobj.group('server')+'/key='+mobj.group('file') + video_extension = video_url.split('.')[-1] + + video_title = self._html_search_regex(r'(?P<title>.+?) - xHamster\.com', + webpage, u'title') + + # Can't see the description anywhere in the UI + # video_description = self._html_search_regex(r'Description: (?P[^<]+)', + # webpage, u'description', fatal=False) + # if video_description: video_description = unescapeHTML(video_description) + + mobj = re.search(r'hint=\'(?P[0-9]{4})-(?P[0-9]{2})-(?P[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage) + if mobj: + video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d') + else: + video_upload_date = None + self._downloader.report_warning(u'Unable to extract upload date') + + video_uploader_id = self._html_search_regex(r']+>(?P[^<]+)', + webpage, u'uploader id', default=u'anonymous') + + video_thumbnail = self._search_regex(r'\'image\':\'(?P[^\']+)\'', + webpage, u'thumbnail', fatal=False) + + return [{ + 'id': video_id, + 'url': video_url, + 'ext': video_extension, + 'title': video_title, + # 'description': video_description, + 'upload_date': video_upload_date, + 'uploader_id': video_uploader_id, + 'thumbnail': video_thumbnail + }] + +class HypemIE(InfoExtractor): + """Information Extractor for hypem""" + _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise 
ExtractorError(u'Invalid URL: %s' % url) + track_id = mobj.group(1) + + data = { 'ax': 1, 'ts': time.time() } + data_encoded = compat_urllib_parse.urlencode(data) + complete_url = url + "?" + data_encoded + request = compat_urllib_request.Request(complete_url) + response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url') + cookie = urlh.headers.get('Set-Cookie', '') + + self.report_extraction(track_id) + + html_tracks = self._html_search_regex(r'', + response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip() + try: + track_list = json.loads(html_tracks) + track = track_list[u'tracks'][0] + except ValueError: + raise ExtractorError(u'Hypemachine contained invalid JSON.') + + key = track[u"key"] + track_id = track[u"id"] + artist = track[u"artist"] + title = track[u"song"] + + serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key)) + request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'}) + request.add_header('cookie', cookie) + song_data_json = self._download_webpage(request, track_id, u'Downloading metadata') + try: + song_data = json.loads(song_data_json) + except ValueError: + raise ExtractorError(u'Hypemachine contained invalid JSON.') + final_url = song_data[u"url"] + + return [{ + 'id': track_id, + 'url': final_url, + 'ext': "mp3", + 'title': title, + 'artist': artist, + }] + +class Vbox7IE(InfoExtractor): + """Information Extractor for Vbox7""" + _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)' + + def _real_extract(self,url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError(u'Invalid URL: %s' % url) + video_id = mobj.group(1) + + redirect_page, urlh = self._download_webpage_handle(url, video_id) + new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location') + redirect_url = urlh.geturl() + new_location + webpage = self._download_webpage(redirect_url, 
video_id, u'Downloading redirect page') + + title = self._html_search_regex(r'(.*)', + webpage, u'title').split('/')[0].strip() + + ext = "flv" + info_url = "http://vbox7.com/play/magare.do" + data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id}) + info_request = compat_urllib_request.Request(info_url, data) + info_request.add_header('Content-Type', 'application/x-www-form-urlencoded') + info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage') + if info_response is None: + raise ExtractorError(u'Unable to extract the media url') + (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&')) + + return [{ + 'id': video_id, + 'url': final_url, + 'ext': ext, + 'title': title, + 'thumbnail': thumbnail_url, }] +class GametrailersIE(InfoExtractor): + _VALID_URL = r'http://www.gametrailers.com/(?Pvideos|reviews|full-episodes)/(?P.*?)/(?P.*)' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError(u'Invalid URL: %s' % url) + video_id = mobj.group('id') + video_type = mobj.group('type') + webpage = self._download_webpage(url, video_id) + if video_type == 'full-episodes': + mgid_re = r'data-video="(?P<mgid>mgid:.*?)"' + else: + mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\'' + mgid = self._search_regex(mgid_re, webpage, u'mgid') + data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'}) + + info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data, + video_id, u'Downloading video info') + links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' 
+ data, + video_id, u'Downloading video urls info') + + self.report_extraction(video_id) + info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]>.* + .*?)\]\]>.* + .* + (?P.*?).* + ''' + + m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL) + if m_info is None: + raise ExtractorError(u'Unable to extract video info') + video_title = m_info.group('title') + video_description = m_info.group('description') + video_thumb = m_info.group('thumb') + + m_urls = list(re.finditer(r'(?P.*)', links_webpage)) + if m_urls is None or len(m_urls) == 0: + raise ExtractError(u'Unable to extrat video url') + # They are sorted from worst to best quality + video_url = m_urls[-1].group('url') + + return {'url': video_url, + 'id': video_id, + 'title': video_title, + # Videos are actually flv not mp4 + 'ext': 'flv', + 'thumbnail': video_thumb, + 'description': video_description, + } + def gen_extractors(): """ Return a list of an instance of every supported extractor. The order does matter; the first extractor matched is the one handling the URL. 
@@ -4405,8 +4559,8 @@ def gen_extractors(): YahooSearchIE(), DepositFilesIE(), FacebookIE(), - BlipTVUserIE(), BlipTVIE(), + BlipTVUserIE(), VimeoIE(), MyVideoIE(), ComedyCentralIE(), @@ -4440,6 +4594,7 @@ def gen_extractors(): SpiegelIE(), LiveLeakIE(), ARDIE(), + ZDFIE(), TumblrIE(), BandcampIE(), RedTubeIE(), @@ -4448,6 +4603,10 @@ def gen_extractors(): VineIE(), FlickrIE(), TeamcocoIE(), + XHamsterIE(), + HypemIE(), + Vbox7IE(), + GametrailersIE(), GenericIE() ] diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 308c48f..9279ce7 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -392,8 +392,11 @@ def _real_main(argv=None): # General configuration cookie_processor = compat_urllib_request.HTTPCookieProcessor(jar) - if opts.proxy: - proxies = {'http': opts.proxy, 'https': opts.proxy} + if opts.proxy is not None: + if opts.proxy == '': + proxies = {} + else: + proxies = {'http': opts.proxy, 'https': opts.proxy} else: proxies = compat_urllib_request.getproxies() # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 63d9d0a..66ae41e 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -12,7 +12,7 @@ import sys import traceback import zlib import email.utils -import json +import socket import datetime try: @@ -154,6 +154,9 @@ def compat_ord(c): if type(c) is int: return c else: return ord(c) +# This is not clearly defined otherwise +compiled_regex_type = type(re.compile('')) + std_headers = { 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', @@ -469,7 +472,11 @@ class ExtractorError(Exception): """Error during info extraction.""" def __init__(self, msg, tb=None): """ tb, if given, is the original traceback (so that it can be printed out). 
""" + + if not sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError): + msg = msg + u'; please report this issue on GitHub.' super(ExtractorError, self).__init__(msg) + self.traceback = tb self.exc_info = sys.exc_info() # preserve original exception diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 1cda7fa..7c6757e 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.05.23' +__version__ = '2013.06.21'