From: Rogério Brito Date: Sun, 8 Jun 2014 13:58:42 +0000 (-0300) Subject: Imported Upstream version 2014.06.07 X-Git-Url: https://git.rapsys.eu/youtubedl/commitdiff_plain/c512650955de0b16d37e7fa7fb29ea0985e415bb Imported Upstream version 2014.06.07 --- diff --git a/CHANGELOG b/CHANGELOG deleted file mode 100644 index 3fa1167..0000000 --- a/CHANGELOG +++ /dev/null @@ -1,14 +0,0 @@ -2013.01.02 Codename: GIULIA - - * Add support for ComedyCentral clips - * Corrected Vimeo description fetching - * Added the --no-post-overwrites argument - * --verbose offers more environment info - * New info_dict field: uploader_id - * New updates system, with signature checking - * New IEs: NBA, JustinTV, FunnyOrDie, TweetReel, Steam, Ustream - * Fixed IEs: BlipTv - * Fixed for Python 3 IEs: Xvideo, Youku, XNXX, Dailymotion, Vimeo, InfoQ - * Simplified IEs and test code - * Various (Python 3 and other) fixes - * Revamped and expanded tests diff --git a/MANIFEST.in b/MANIFEST.in index 8f8af7a..d43cc1f 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,3 +3,4 @@ include test/*.py include test/*.json include youtube-dl.bash-completion include youtube-dl.1 +recursive-include docs Makefile conf.py *.rst diff --git a/Makefile b/Makefile index c6d0993..c079761 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ all: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash-completion clean: - rm -rf youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz + rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz cleanall: clean rm -f youtube-dl youtube-dl.exe @@ -55,7 +55,9 @@ README.txt: README.md pandoc -f markdown -t plain README.md -o README.txt youtube-dl.1: README.md - pandoc -s -f markdown -t man README.md -o youtube-dl.1 + python devscripts/prepare_manpage.py >youtube-dl.1.temp.md + pandoc -s -f markdown -t man youtube-dl.1.temp.md -o youtube-dl.1 + rm 
-f youtube-dl.1.temp.md youtube-dl.bash-completion: youtube_dl/*.py youtube_dl/*/*.py devscripts/bash-completion.in python devscripts/bash-completion.py @@ -72,8 +74,9 @@ youtube-dl.tar.gz: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash- --exclude '__pycache' \ --exclude '.git' \ --exclude 'testdata' \ + --exclude 'docs/_build' \ -- \ - bin devscripts test youtube_dl \ - CHANGELOG LICENSE README.md README.txt \ + bin devscripts test youtube_dl docs \ + LICENSE README.md README.txt \ Makefile MANIFEST.in youtube-dl.1 youtube-dl.bash-completion setup.py \ youtube-dl diff --git a/README.md b/README.md index 35876d9..2bea609 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,24 @@ -% YOUTUBE-DL(1) - -# NAME youtube-dl - download videos from youtube.com or other video platforms # SYNOPSIS **youtube-dl** [OPTIONS] URL [URL...] +# INSTALLATION + +To install it right away for all UNIX users (Linux, OS X, etc.), type: + + sudo curl https://yt-dl.org/latest/youtube-dl -o /usr/local/bin/youtube-dl + sudo chmod a+x /usr/local/bin/youtube-dl + +If you do not have curl, you can alternatively use a recent wget: + + sudo wget https://yt-dl.org/downloads/2014.05.13/youtube-dl -O /usr/local/bin/youtube-dl + sudo chmod a+x /usr/local/bin/youtube-dl + +Windows users can [download a .exe file](https://yt-dl.org/latest/youtube-dl.exe) and place it in their home directory or any other location on their [PATH](http://en.wikipedia.org/wiki/PATH_%28variable%29). + +Alternatively, refer to the developer instructions below for how to check out and work with the git repository. For further options, including PGP signatures, see https://rg3.github.io/youtube-dl/download.html . + # DESCRIPTION **youtube-dl** is a small command-line program to download videos from YouTube.com and a few more sites. It requires the Python interpreter, version @@ -20,7 +33,7 @@ which means you can modify it, redistribute it or use it however you like. 
sure that you have sufficient permissions (run with sudo if needed) -i, --ignore-errors continue on download errors, for example to - to skip unavailable videos in a playlist + skip unavailable videos in a playlist --abort-on-error Abort downloading of further videos (in the playlist or the command line) if an error occurs @@ -28,6 +41,9 @@ which means you can modify it, redistribute it or use it however you like. --user-agent UA specify a custom user agent --referer REF specify a custom referer, use if the video access is restricted to one domain + --add-header FIELD:VALUE specify a custom HTTP header and its value, + separated by a colon ':'. You can use this + option multiple times --list-extractors List all supported extractors and the URLs they would handle --extractor-descriptions Output descriptions of all supported @@ -36,6 +52,9 @@ which means you can modify it, redistribute it or use it however you like. an empty string (--proxy "") for direct connection --no-check-certificate Suppress HTTPS certificate validation. + --prefer-insecure Use an unencrypted connection to retrieve + information about the video. (Currently + supported only for YouTube) --cache-dir DIR Location in the filesystem where youtube-dl can store some downloaded information permanently. By default $XDG_CACHE_HOME @@ -59,6 +78,7 @@ which means you can modify it, redistribute it or use it however you like. configuration in ~/.config/youtube-dl.conf (%APPDATA%/youtube-dl/config.txt on Windows) + --encoding ENCODING Force the specified encoding (experimental) ## Video Selection: --playlist-start NUMBER playlist video to start at (default is 1) @@ -124,8 +144,12 @@ which means you can modify it, redistribute it or use it however you like. video id, %(playlist)s for the playlist the video is in, %(playlist_index)s for the position in the playlist and %% for a - literal percent. Use - to output to stdout. - Can also be used to download to a different + literal percent. 
%(height)s and %(width)s + for the width and height of the video + format. %(resolution)s for a textual + description of the resolution of the video + format. Use - to output to stdout. Can also + be used to download to a different directory, for example with -o '/my/downloa ds/%(uploader)s/%(title)s-%(id)s.%(ext)s' . --autonumber-size NUMBER Specifies the number of digits in @@ -159,6 +183,7 @@ which means you can modify it, redistribute it or use it however you like. ## Verbosity / Simulation Options: -q, --quiet activates quiet mode + --no-warnings Ignore warnings -s, --simulate do not download the video and do not write anything to disk --skip-download do not download the video @@ -170,7 +195,9 @@ which means you can modify it, redistribute it or use it however you like. --get-duration simulate, quiet but print video length --get-filename simulate, quiet but print output filename --get-format simulate, quiet but print output format - -j, --dump-json simulate, quiet but print JSON information + -j, --dump-json simulate, quiet but print JSON information. + See --output for a description of available + keys. --newline output progress bar as new lines --no-progress do not print progress bar --console-title display progress in console titlebar @@ -187,9 +214,9 @@ which means you can modify it, redistribute it or use it however you like. preference using slashes: "-f 22/17/18". "-f mp4" and "-f flv" are also supported. You can also use the special names "best", - "bestaudio", "worst", and "worstaudio". By - default, youtube-dl will pick the best - quality. + "bestvideo", "bestaudio", "worst", + "worstvideo" and "worstaudio". By default, + youtube-dl will pick the best quality. --all-formats download all available video formats --prefer-free-formats prefer free video formats unless a specific one is requested @@ -236,6 +263,7 @@ which means you can modify it, redistribute it or use it however you like. 
default --embed-subs embed subtitles in the video (only for mp4 videos) + --embed-thumbnail embed thumbnail in the audio as cover art --add-metadata write metadata to the video file --xattrs write metadata to the video file's xattrs (using dublin core and xdg standards) @@ -246,7 +274,7 @@ which means you can modify it, redistribute it or use it however you like. # CONFIGURATION -You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl.conf`. On Windows, the configuration file locations are `%APPDATA%\youtube-dl\config.txt` and `C:\Users\\youtube-dl.conf`. +You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl/config`. On Windows, the configuration file locations are `%APPDATA%\youtube-dl\config.txt` and `C:\Users\\youtube-dl.conf`. 
# OUTPUT TEMPLATE @@ -281,12 +309,14 @@ Videos can be filtered by their upload date using the options `--date`, `--dateb Examples: - $ # Download only the videos uploaded in the last 6 months - $ youtube-dl --dateafter now-6months - $ # Download only the videos uploaded on January 1, 1970 - $ youtube-dl --date 19700101 - $ # will only download the videos uploaded in the 200x decade - $ youtube-dl --dateafter 20000101 --datebefore 20091231 + # Download only the videos uploaded in the last 6 months + $ youtube-dl --dateafter now-6months + + # Download only the videos uploaded on January 1, 1970 + $ youtube-dl --date 19700101 + + $ # will only download the videos uploaded in the 200x decade + $ youtube-dl --dateafter 20000101 --datebefore 20091231 # FAQ @@ -355,7 +385,67 @@ If you want to create a build of youtube-dl yourself, you'll need ### Adding support for a new site -If you want to add support for a new site, copy *any* [recently modified](https://github.com/rg3/youtube-dl/commits/master/youtube_dl/extractor) file in `youtube_dl/extractor`, add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). Have a look at [`youtube_dl/common/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L38). Don't forget to run the tests with `python test/test_download.py Test_Download.test_YourExtractor`! For a detailed tutorial, refer to [this blog post](http://filippo.io/add-support-for-a-new-video-site-to-youtube-dl/). +If you want to add support for a new site, you can follow this quick list (assuming your service is called `yourextractor`): + +1. [Fork this repository](https://github.com/rg3/youtube-dl/fork) +2. 
Check out the source code with `git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git` +3. Start a new git branch with `cd youtube-dl; git checkout -b yourextractor` +4. Start with this simple template and save it to `youtube_dl/extractor/yourextractor.py`: + + # coding: utf-8 + from __future__ import unicode_literals + + import re + + from .common import InfoExtractor + + + class YourExtractorIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P[0-9]+)' + _TEST = { + 'url': 'http://yourextractor.com/watch/42', + 'md5': 'TODO: md5 sum of the first 10KiB of the video file', + 'info_dict': { + 'id': '42', + 'ext': 'mp4', + 'title': 'Video title goes here', + # TODO more properties, either as: + # * A value + # * MD5 checksum; start the string with md5: + # * A regular expression; start the string with re: + # * Any Python type (for example int or float) + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + # TODO more code goes here, for example ... + webpage = self._download_webpage(url, video_id) + title = self._html_search_regex(r'

<h1>(.*?)</h1>

', webpage, 'title') + + return { + 'id': video_id, + 'title': title, + # TODO more properties (see youtube_dl/extractor/common.py) + } + + +5. Add an import in [`youtube_dl/extractor/__init__.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). +6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. +7. Have a look at [`youtube_dl/common/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L38). Add tests and code for as many as you want. +8. If you can, check the code with [pyflakes](https://pypi.python.org/pypi/pyflakes) (a good idea) and [pep8](https://pypi.python.org/pypi/pep8) (optional, ignore E501). +9. When the tests pass, [add](https://www.kernel.org/pub/software/scm/git/docs/git-add.html) the new files and [commit](https://www.kernel.org/pub/software/scm/git/docs/git-commit.html) them and [push](https://www.kernel.org/pub/software/scm/git/docs/git-push.html) the result, like this: + + $ git add youtube_dl/extractor/__init__.py + $ git add youtube_dl/extractor/yourextractor.py + $ git commit -m '[yourextractor] Add new extractor' + $ git push origin yourextractor + +10. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it. + +In any case, thank you very much for your contributions! # BUGS @@ -381,7 +471,7 @@ If your report is shorter than two lines, it is almost certainly missing some of For bug reports, this means that your report should contain the *complete* output of youtube-dl when called with the -v flag. 
The error message you get for (most) bugs even says so, but you would not believe how many of our bug reports do not contain this information. -Site support requests must contain an example URL. An example URL is a URL you might want to download, like http://www.youtube.com/watch?v=BaW_jenozKc . There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. http://www.youtube.com/ ) is *not* an example URL. +Site support requests **must contain an example URL**. An example URL is a URL you might want to download, like http://www.youtube.com/watch?v=BaW_jenozKc . There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. http://www.youtube.com/ ) is *not* an example URL. ### Are you using the latest version? diff --git a/README.txt b/README.txt index 0015a74..4757a33 100644 --- a/README.txt +++ b/README.txt @@ -1,6 +1,3 @@ -NAME -==== - youtube-dl - download videos from youtube.com or other video platforms SYNOPSIS @@ -8,6 +5,27 @@ SYNOPSIS youtube-dl OPTIONS URL [URL...] +INSTALLATION +============ + +To install it right away for all UNIX users (Linux, OS X, etc.), type: + + sudo curl https://yt-dl.org/latest/youtube-dl -o /usr/local/bin/youtube-dl + sudo chmod a+x /usr/local/bin/youtube-dl + +If you do not have curl, you can alternatively use a recent wget: + + sudo wget https://yt-dl.org/downloads/2014.05.13/youtube-dl -O /usr/local/bin/youtube-dl + sudo chmod a+x /usr/local/bin/youtube-dl + +Windows users can download a .exe file and place it in their home +directory or any other location on their PATH. + +Alternatively, refer to the developer instructions below for how to +check out and work with the git repository. For further options, +including PGP signatures, see +https://rg3.github.io/youtube-dl/download.html . 
+ DESCRIPTION =========== @@ -27,7 +45,7 @@ OPTIONS sure that you have sufficient permissions (run with sudo if needed) -i, --ignore-errors continue on download errors, for example to - to skip unavailable videos in a playlist + skip unavailable videos in a playlist --abort-on-error Abort downloading of further videos (in the playlist or the command line) if an error occurs @@ -35,6 +53,9 @@ OPTIONS --user-agent UA specify a custom user agent --referer REF specify a custom referer, use if the video access is restricted to one domain + --add-header FIELD:VALUE specify a custom HTTP header and its value, + separated by a colon ':'. You can use this + option multiple times --list-extractors List all supported extractors and the URLs they would handle --extractor-descriptions Output descriptions of all supported @@ -43,6 +64,9 @@ OPTIONS an empty string (--proxy "") for direct connection --no-check-certificate Suppress HTTPS certificate validation. + --prefer-insecure Use an unencrypted connection to retrieve + information about the video. (Currently + supported only for YouTube) --cache-dir DIR Location in the filesystem where youtube-dl can store some downloaded information permanently. By default $XDG_CACHE_HOME @@ -66,6 +90,7 @@ OPTIONS configuration in ~/.config/youtube-dl.conf (%APPDATA%/youtube-dl/config.txt on Windows) + --encoding ENCODING Force the specified encoding (experimental) Video Selection: ---------------- @@ -137,8 +162,12 @@ Filesystem Options: video id, %(playlist)s for the playlist the video is in, %(playlist_index)s for the position in the playlist and %% for a - literal percent. Use - to output to stdout. - Can also be used to download to a different + literal percent. %(height)s and %(width)s + for the width and height of the video + format. %(resolution)s for a textual + description of the resolution of the video + format. Use - to output to stdout. 
Can also + be used to download to a different directory, for example with -o '/my/downloa ds/%(uploader)s/%(title)s-%(id)s.%(ext)s' . --autonumber-size NUMBER Specifies the number of digits in @@ -174,6 +203,7 @@ Verbosity / Simulation Options: ------------------------------- -q, --quiet activates quiet mode + --no-warnings Ignore warnings -s, --simulate do not download the video and do not write anything to disk --skip-download do not download the video @@ -185,7 +215,9 @@ Verbosity / Simulation Options: --get-duration simulate, quiet but print video length --get-filename simulate, quiet but print output filename --get-format simulate, quiet but print output format - -j, --dump-json simulate, quiet but print JSON information + -j, --dump-json simulate, quiet but print JSON information. + See --output for a description of available + keys. --newline output progress bar as new lines --no-progress do not print progress bar --console-title display progress in console titlebar @@ -204,9 +236,9 @@ Video Format Options: preference using slashes: "-f 22/17/18". "-f mp4" and "-f flv" are also supported. You can also use the special names "best", - "bestaudio", "worst", and "worstaudio". By - default, youtube-dl will pick the best - quality. + "bestvideo", "bestaudio", "worst", + "worstvideo" and "worstaudio". By default, + youtube-dl will pick the best quality. 
--all-formats download all available video formats --prefer-free-formats prefer free video formats unless a specific one is requested @@ -259,6 +291,7 @@ Post-processing Options: default --embed-subs embed subtitles in the video (only for mp4 videos) + --embed-thumbnail embed thumbnail in the audio as cover art --add-metadata write metadata to the video file --xattrs write metadata to the video file's xattrs (using dublin core and xdg standards) @@ -272,7 +305,7 @@ CONFIGURATION You can configure youtube-dl by placing default arguments (such as --extract-audio --no-mtime to always extract the audio and not copy the -mtime) into /etc/youtube-dl.conf and/or ~/.config/youtube-dl.conf. On +mtime) into /etc/youtube-dl.conf and/or ~/.config/youtube-dl/config. On Windows, the configuration file locations are %APPDATA%\youtube-dl\config.txt and C:\Users\\youtube-dl.conf. @@ -330,11 +363,14 @@ Videos can be filtered by their upload date using the options --date, Examples: -$ # Download only the videos uploaded in the last 6 months $ youtube-dl ---dateafter now-6months $ # Download only the videos uploaded on January -1, 1970 $ youtube-dl --date 19700101 $ # will only download the videos -uploaded in the 200x decade $ youtube-dl --dateafter 20000101 ---datebefore 20091231 + # Download only the videos uploaded in the last 6 months + $ youtube-dl --dateafter now-6months + + # Download only the videos uploaded on January 1, 1970 + $ youtube-dl --date 19700101 + + $ # will only download the videos uploaded in the 200x decade + $ youtube-dl --dateafter 20000101 --datebefore 20091231 FAQ === @@ -433,14 +469,76 @@ If you want to create a build of youtube-dl yourself, you'll need Adding support for a new site -If you want to add support for a new site, copy any recently modified -file in youtube_dl/extractor, add an import in -youtube_dl/extractor/__init__.py. 
Have a look at -youtube_dl/common/extractor/common.py for possible helper methods and a -detailed description of what your extractor should return. Don't forget -to run the tests with -python test/test_download.py Test_Download.test_YourExtractor! For a -detailed tutorial, refer to this blog post. +If you want to add support for a new site, you can follow this quick +list (assuming your service is called yourextractor): + +1. Fork this repository +2. Check out the source code with + git clone git@github.com:YOUR_GITHUB_USERNAME/youtube-dl.git +3. Start a new git branch with + cd youtube-dl; git checkout -b yourextractor +4. Start with this simple template and save it to + youtube_dl/extractor/yourextractor.py: + + # coding: utf-8 + from __future__ import unicode_literals + + import re + + from .common import InfoExtractor + + + class YourExtractorIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P[0-9]+)' + _TEST = { + 'url': 'http://yourextractor.com/watch/42', + 'md5': 'TODO: md5 sum of the first 10KiB of the video file', + 'info_dict': { + 'id': '42', + 'ext': 'mp4', + 'title': 'Video title goes here', + # TODO more properties, either as: + # * A value + # * MD5 checksum; start the string with md5: + # * A regular expression; start the string with re: + # * Any Python type (for example int or float) + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + # TODO more code goes here, for example ... + webpage = self._download_webpage(url, video_id) + title = self._html_search_regex(r'

<h1>(.*?)</h1>

', webpage, 'title') + + return { + 'id': video_id, + 'title': title, + # TODO more properties (see youtube_dl/extractor/common.py) + } + +5. Add an import in youtube_dl/extractor/__init__.py. +6. Run python test/test_download.py TestDownload.test_YourExtractor. + This should fail at first, but you can continually re-run it until + you're done. +7. Have a look at youtube_dl/common/extractor/common.py for possible + helper methods and a detailed description of what your extractor + should return. Add tests and code for as many as you want. +8. If you can, check the code with pyflakes (a good idea) and pep8 + (optional, ignore E501). +9. When the tests pass, add the new files and commit them and push the + result, like this: + + $ git add youtube_dl/extractor/__init__.py + $ git add youtube_dl/extractor/yourextractor.py + $ git commit -m '[yourextractor] Add new extractor' + $ git push origin yourextractor + +10. Finally, create a pull request. We'll then review and merge it. + +In any case, thank you very much for your contributions! 
BUGS ==== diff --git a/devscripts/make_readme.py b/devscripts/make_readme.py index cae1fa4..70fa942 100755 --- a/devscripts/make_readme.py +++ b/devscripts/make_readme.py @@ -15,7 +15,7 @@ header = oldreadme[:oldreadme.index('# OPTIONS')] footer = oldreadme[oldreadme.index('# CONFIGURATION'):] options = helptext[helptext.index(' General Options:') + 19:] -options = re.sub(r'^ (\w.+)$', r'## \1', options, flags=re.M) +options = re.sub(r'(?m)^ (\w.+)$', r'## \1', options) options = '# OPTIONS\n' + options + '\n' with io.open(README_FILE, 'w', encoding='utf-8') as f: diff --git a/devscripts/prepare_manpage.py b/devscripts/prepare_manpage.py new file mode 100644 index 0000000..d9c8570 --- /dev/null +++ b/devscripts/prepare_manpage.py @@ -0,0 +1,20 @@ + +import io +import os.path +import sys +import re + +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +README_FILE = os.path.join(ROOT_DIR, 'README.md') + +with io.open(README_FILE, encoding='utf-8') as f: + readme = f.read() + +PREFIX = '%YOUTUBE-DL(1)\n\n# NAME\n' +readme = re.sub(r'(?s)# INSTALLATION.*?(?=# DESCRIPTION)', '', readme) +readme = PREFIX + readme + +if sys.version_info < (3, 0): + print(readme.encode('utf-8')) +else: + print(readme) diff --git a/devscripts/release.sh b/devscripts/release.sh index 323acf8..453087e 100755 --- a/devscripts/release.sh +++ b/devscripts/release.sh @@ -14,14 +14,20 @@ set -e -skip_tests=false -if [ "$1" = '--skip-test' ]; then - skip_tests=true +skip_tests=true +if [ "$1" = '--run-tests' ]; then + skip_tests=false shift fi if [ -z "$1" ]; then echo "ERROR: specify version number like this: $0 1994.09.06"; exit 1; fi version="$1" +major_version=$(echo "$version" | sed -n 's#^\([0-9]*\.[0-9]*\.[0-9]*\).*#\1#p') +if test "$major_version" '!=' "$(date '+%Y.%m.%d')"; then + echo "$version does not start with today's date!" + exit 1 +fi + if [ ! -z "`git tag | grep "$version"`" ]; then echo 'ERROR: version already present'; exit 1; fi if [ ! 
-z "`git status --porcelain | grep -v CHANGELOG`" ]; then echo 'ERROR: the working directory is not clean; commit or stash changes'; exit 1; fi useless_files=$(find youtube_dl -type f -not -name '*.py') @@ -39,9 +45,9 @@ fi /bin/echo -e "\n### Changing version in version.py..." sed -i "s/__version__ = '.*'/__version__ = '$version'/" youtube_dl/version.py -/bin/echo -e "\n### Committing CHANGELOG README.md and youtube_dl/version.py..." +/bin/echo -e "\n### Committing README.md and youtube_dl/version.py..." make README.md -git add CHANGELOG README.md youtube_dl/version.py +git add README.md youtube_dl/version.py git commit -m "release $version" /bin/echo -e "\n### Now tagging, signing and pushing..." @@ -70,7 +76,7 @@ RELEASE_FILES="youtube-dl youtube-dl.exe youtube-dl-$version.tar.gz" git checkout HEAD -- youtube-dl youtube-dl.exe /bin/echo -e "\n### Signing and uploading the new binaries to yt-dl.org ..." -for f in $RELEASE_FILES; do gpg --detach-sig "build/$version/$f"; done +for f in $RELEASE_FILES; do gpg --passphrase-repeat 5 --detach-sig "build/$version/$f"; done scp -r "build/$version" ytdl@yt-dl.org:html/tmp/ ssh ytdl@yt-dl.org "mv html/tmp/$version html/downloads/" ssh ytdl@yt-dl.org "sh html/update_latest.sh $version" @@ -97,7 +103,7 @@ rm -rf build make pypi-files echo "Uploading to PyPi ..." -python setup.py sdist upload +python setup.py sdist bdist_wheel upload make clean /bin/echo -e "\n### DONE!" diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 0000000..69fa449 --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1 @@ +_build/ diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..7122180 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,177 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. 
+SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build + +# User-friendly check for sphinx-build +ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) +$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) +endif + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . + +.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext + +help: + @echo "Please use \`make ' where is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview 
of all changed/added/deprecated items" + @echo " xml to make Docutils-native XML files" + @echo " pseudoxml to make pseudoxml-XML files for display purposes" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + +clean: + rm -rf $(BUILDDIR)/* + +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/youtube-dl.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/youtube-dl.qhc" + +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." 
+ @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/youtube-dl" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/youtube-dl" + @echo "# devhelp" + +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +latexpdfja: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through platex and dvipdfmx..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. 
The message catalogs are in $(BUILDDIR)/locale." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +xml: + $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml + @echo + @echo "Build finished. The XML files are in $(BUILDDIR)/xml." + +pseudoxml: + $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml + @echo + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..4a04ad7 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,71 @@ +# -*- coding: utf-8 -*- +# +# youtube-dl documentation build configuration file, created by +# sphinx-quickstart on Fri Mar 14 21:05:43 2014. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys +import os +# Allows to import youtube_dl +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +# -- General configuration ------------------------------------------------ + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', +] + +# Add any paths that contain templates here, relative to this directory. 
+templates_path = ['_templates'] + +# The suffix of source filenames. +source_suffix = '.rst' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = u'youtube-dl' +copyright = u'2014, Ricardo Garcia Gonzalez' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +import youtube_dl +version = youtube_dl.__version__ +# The full version, including alpha/beta/rc tags. +release = version + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['_build'] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'default' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Output file base name for HTML help builder. +htmlhelp_basename = 'youtube-dldoc' diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..b746ff9 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,23 @@ +Welcome to youtube-dl's documentation! +====================================== + +*youtube-dl* is a command-line program to download videos from YouTube.com and more sites. +It can also be used in Python code. + +Developer guide +--------------- + +This section contains information for using *youtube-dl* from Python programs. + +.. 
toctree:: + :maxdepth: 2 + + module_guide + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` + diff --git a/docs/module_guide.rst b/docs/module_guide.rst new file mode 100644 index 0000000..03d7288 --- /dev/null +++ b/docs/module_guide.rst @@ -0,0 +1,67 @@ +Using the ``youtube_dl`` module +=============================== + +When using the ``youtube_dl`` module, you start by creating an instance of :class:`YoutubeDL` and adding all the available extractors: + +.. code-block:: python + + >>> from youtube_dl import YoutubeDL + >>> ydl = YoutubeDL() + >>> ydl.add_default_info_extractors() + +Extracting video information +---------------------------- + +You use the :meth:`YoutubeDL.extract_info` method for getting the video information, which returns a dictionary: + +.. code-block:: python + + >>> info = ydl.extract_info('http://www.youtube.com/watch?v=BaW_jenozKc', download=False) + [youtube] Setting language + [youtube] BaW_jenozKc: Downloading webpage + [youtube] BaW_jenozKc: Downloading video info webpage + [youtube] BaW_jenozKc: Extracting video information + >>> info['title'] + 'youtube-dl test video "\'/\\ä↭𝕐' + >>> info['height'], info['width'] + (720, 1280) + +If you want to download or play the video you can get its url: + +.. code-block:: python + + >>> info['url'] + 'https://...' + +Extracting playlist information +------------------------------- + +The playlist information is extracted in a similar way, but the dictionary is a bit different: + +.. code-block:: python + + >>> playlist = ydl.extract_info('http://www.ted.com/playlists/13/open_source_open_world', download=False) + [TED] open_source_open_world: Downloading playlist webpage + ... + >>> playlist['title'] + 'Open-source, open world' + + + +You can access the videos in the playlist with the ``entries`` field: + +.. code-block:: python + + >>> for video in playlist['entries']: + ... 
print('Video #%d: %s' % (video['playlist_index'], video['title'])) + + Video #1: How Arduino is open-sourcing imagination + Video #2: The year open data went worldwide + Video #3: Massive-scale online collaboration + Video #4: The art of asking + Video #5: How cognitive surplus will change the world + Video #6: The birth of Wikipedia + Video #7: Coding a better government + Video #8: The era of open innovation + Video #9: The currency of the new economy is trust + diff --git a/test/helper.py b/test/helper.py index b1f421a..230d2bd 100644 --- a/test/helper.py +++ b/test/helper.py @@ -9,7 +9,10 @@ import sys import youtube_dl.extractor from youtube_dl import YoutubeDL -from youtube_dl.utils import preferredencoding +from youtube_dl.utils import ( + compat_str, + preferredencoding, +) def get_params(override=None): @@ -71,15 +74,77 @@ class FakeYDL(YoutubeDL): old_report_warning(message) self.report_warning = types.MethodType(report_warning, self) -def get_testcases(): + +def gettestcases(include_onlymatching=False): for ie in youtube_dl.extractor.gen_extractors(): t = getattr(ie, '_TEST', None) if t: - t['name'] = type(ie).__name__[:-len('IE')] - yield t - for t in getattr(ie, '_TESTS', []): + assert not hasattr(ie, '_TESTS'), \ + '%s has _TEST and _TESTS' % type(ie).__name__ + tests = [t] + else: + tests = getattr(ie, '_TESTS', []) + for t in tests: + if not include_onlymatching and t.get('only_matching', False): + continue t['name'] = type(ie).__name__[:-len('IE')] yield t md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() + + +def expect_info_dict(self, expected_dict, got_dict): + for info_field, expected in expected_dict.items(): + if isinstance(expected, compat_str) and expected.startswith('re:'): + got = got_dict.get(info_field) + match_str = expected[len('re:'):] + match_rex = re.compile(match_str) + + self.assertTrue( + isinstance(got, compat_str) and match_rex.match(got), + u'field %s (value: %r) should match %r' % (info_field, got, match_str)) + 
elif isinstance(expected, type): + got = got_dict.get(info_field) + self.assertTrue(isinstance(got, expected), + u'Expected type %r for field %s, but got value %r of type %r' % (expected, info_field, got, type(got))) + else: + if isinstance(expected, compat_str) and expected.startswith('md5:'): + got = 'md5:' + md5(got_dict.get(info_field)) + else: + got = got_dict.get(info_field) + self.assertEqual(expected, got, + u'invalid value for field %s, expected %r, got %r' % (info_field, expected, got)) + + # Check for the presence of mandatory fields + for key in ('id', 'url', 'title', 'ext'): + self.assertTrue(got_dict.get(key), 'Missing mandatory field %s' % key) + # Check for mandatory fields that are automatically set by YoutubeDL + for key in ['webpage_url', 'extractor', 'extractor_key']: + self.assertTrue(got_dict.get(key), u'Missing field: %s' % key) + + # Are checkable fields missing from the test case definition? + test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value)) + for key, value in got_dict.items() + if value and key in ('title', 'description', 'uploader', 'upload_date', 'timestamp', 'uploader_id', 'location')) + missing_keys = set(test_info_dict.keys()) - set(expected_dict.keys()) + if missing_keys: + sys.stderr.write(u'\n"info_dict": ' + json.dumps(test_info_dict, ensure_ascii=False, indent=4) + u'\n') + self.assertFalse( + missing_keys, + 'Missing keys in test definition: %s' % ( + ', '.join(sorted(missing_keys)))) + + +def assertRegexpMatches(self, text, regexp, msg=None): + if hasattr(self, 'assertRegexpMatches'): + return self.assertRegexpMatches(text, regexp, msg) + else: + m = re.match(regexp, text) + if not m: + note = 'Regexp didn\'t match: %r not found in %r' % (regexp, text) + if msg is None: + msg = note + else: + msg = note + ', ' + msg + self.assertTrue(m, msg) diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py new file mode 100644 index 0000000..13c18ed --- 
/dev/null +++ b/test/test_InfoExtractor.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python + +from __future__ import unicode_literals + +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from test.helper import FakeYDL +from youtube_dl.extractor.common import InfoExtractor +from youtube_dl.extractor import YoutubeIE, get_info_extractor + + +class TestIE(InfoExtractor): + pass + + +class TestInfoExtractor(unittest.TestCase): + def setUp(self): + self.ie = TestIE(FakeYDL()) + + def test_ie_key(self): + self.assertEqual(get_info_extractor(YoutubeIE.ie_key()), YoutubeIE) + + def test_html_search_regex(self): + html = '

Watch this video

' + search = lambda re, *args: self.ie._html_search_regex(re, html, *args) + self.assertEqual(search(r'

(.+?)

', 'foo'), 'Watch this video') + + def test_opengraph(self): + ie = self.ie + html = ''' + + + + ''' + self.assertEqual(ie._og_search_title(html), 'Foo') + self.assertEqual(ie._og_search_description(html), 'Some video\'s description ') + self.assertEqual(ie._og_search_thumbnail(html), 'http://domain.com/pic.jpg?key1=val1&key2=val2') + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 37e7b9b..e794cc9 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -8,7 +8,7 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import FakeYDL +from test.helper import FakeYDL, assertRegexpMatches from youtube_dl import YoutubeDL from youtube_dl.extractor import YoutubeIE @@ -26,16 +26,27 @@ class YDL(FakeYDL): self.msgs.append(msg) +def _make_result(formats, **kwargs): + res = { + 'formats': formats, + 'id': 'testid', + 'title': 'testttitle', + 'extractor': 'testex', + } + res.update(**kwargs) + return res + + class TestFormatSelection(unittest.TestCase): def test_prefer_free_formats(self): # Same resolution => download webm ydl = YDL() ydl.params['prefer_free_formats'] = True formats = [ - {'ext': 'webm', 'height': 460}, - {'ext': 'mp4', 'height': 460}, + {'ext': 'webm', 'height': 460, 'url': 'x'}, + {'ext': 'mp4', 'height': 460, 'url': 'y'}, ] - info_dict = {'formats': formats, 'extractor': 'test'} + info_dict = _make_result(formats) yie = YoutubeIE(ydl) yie._sort_formats(info_dict['formats']) ydl.process_ie_result(info_dict) @@ -46,8 +57,8 @@ class TestFormatSelection(unittest.TestCase): ydl = YDL() ydl.params['prefer_free_formats'] = True formats = [ - {'ext': 'webm', 'height': 720}, - {'ext': 'mp4', 'height': 1080}, + {'ext': 'webm', 'height': 720, 'url': 'a'}, + {'ext': 'mp4', 'height': 1080, 'url': 'b'}, ] info_dict['formats'] = formats yie = YoutubeIE(ydl) @@ -56,13 +67,13 @@ class TestFormatSelection(unittest.TestCase): 
downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['ext'], 'mp4') - # No prefer_free_formats => prefer mp4 and flv for greater compatibilty + # No prefer_free_formats => prefer mp4 and flv for greater compatibility ydl = YDL() ydl.params['prefer_free_formats'] = False formats = [ - {'ext': 'webm', 'height': 720}, - {'ext': 'mp4', 'height': 720}, - {'ext': 'flv', 'height': 720}, + {'ext': 'webm', 'height': 720, 'url': '_'}, + {'ext': 'mp4', 'height': 720, 'url': '_'}, + {'ext': 'flv', 'height': 720, 'url': '_'}, ] info_dict['formats'] = formats yie = YoutubeIE(ydl) @@ -74,8 +85,8 @@ class TestFormatSelection(unittest.TestCase): ydl = YDL() ydl.params['prefer_free_formats'] = False formats = [ - {'ext': 'flv', 'height': 720}, - {'ext': 'webm', 'height': 720}, + {'ext': 'flv', 'height': 720, 'url': '_'}, + {'ext': 'webm', 'height': 720, 'url': '_'}, ] info_dict['formats'] = formats yie = YoutubeIE(ydl) @@ -91,8 +102,7 @@ class TestFormatSelection(unittest.TestCase): {'format_id': 'great', 'url': 'http://example.com/great', 'preference': 3}, {'format_id': 'excellent', 'url': 'http://example.com/exc', 'preference': 4}, ] - info_dict = { - 'formats': formats, 'extractor': 'test', 'id': 'testvid'} + info_dict = _make_result(formats) ydl = YDL() ydl.process_ie_result(info_dict) @@ -120,12 +130,12 @@ class TestFormatSelection(unittest.TestCase): def test_format_selection(self): formats = [ - {'format_id': '35', 'ext': 'mp4', 'preference': 1}, - {'format_id': '45', 'ext': 'webm', 'preference': 2}, - {'format_id': '47', 'ext': 'webm', 'preference': 3}, - {'format_id': '2', 'ext': 'flv', 'preference': 4}, + {'format_id': '35', 'ext': 'mp4', 'preference': 1, 'url': '_'}, + {'format_id': '45', 'ext': 'webm', 'preference': 2, 'url': '_'}, + {'format_id': '47', 'ext': 'webm', 'preference': 3, 'url': '_'}, + {'format_id': '2', 'ext': 'flv', 'preference': 4, 'url': '_'}, ] - info_dict = {'formats': formats, 'extractor': 'test'} + info_dict = _make_result(formats) 
ydl = YDL({'format': '20/47'}) ydl.process_ie_result(info_dict.copy()) @@ -154,12 +164,12 @@ class TestFormatSelection(unittest.TestCase): def test_format_selection_audio(self): formats = [ - {'format_id': 'audio-low', 'ext': 'webm', 'preference': 1, 'vcodec': 'none'}, - {'format_id': 'audio-mid', 'ext': 'webm', 'preference': 2, 'vcodec': 'none'}, - {'format_id': 'audio-high', 'ext': 'flv', 'preference': 3, 'vcodec': 'none'}, - {'format_id': 'vid', 'ext': 'mp4', 'preference': 4}, + {'format_id': 'audio-low', 'ext': 'webm', 'preference': 1, 'vcodec': 'none', 'url': '_'}, + {'format_id': 'audio-mid', 'ext': 'webm', 'preference': 2, 'vcodec': 'none', 'url': '_'}, + {'format_id': 'audio-high', 'ext': 'flv', 'preference': 3, 'vcodec': 'none', 'url': '_'}, + {'format_id': 'vid', 'ext': 'mp4', 'preference': 4, 'url': '_'}, ] - info_dict = {'formats': formats, 'extractor': 'test'} + info_dict = _make_result(formats) ydl = YDL({'format': 'bestaudio'}) ydl.process_ie_result(info_dict.copy()) @@ -172,16 +182,34 @@ class TestFormatSelection(unittest.TestCase): self.assertEqual(downloaded['format_id'], 'audio-low') formats = [ - {'format_id': 'vid-low', 'ext': 'mp4', 'preference': 1}, - {'format_id': 'vid-high', 'ext': 'mp4', 'preference': 2}, + {'format_id': 'vid-low', 'ext': 'mp4', 'preference': 1, 'url': '_'}, + {'format_id': 'vid-high', 'ext': 'mp4', 'preference': 2, 'url': '_'}, ] - info_dict = {'formats': formats, 'extractor': 'test'} + info_dict = _make_result(formats) ydl = YDL({'format': 'bestaudio/worstaudio/best'}) ydl.process_ie_result(info_dict.copy()) downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], 'vid-high') + def test_format_selection_video(self): + formats = [ + {'format_id': 'dash-video-low', 'ext': 'mp4', 'preference': 1, 'acodec': 'none', 'url': '_'}, + {'format_id': 'dash-video-high', 'ext': 'mp4', 'preference': 2, 'acodec': 'none', 'url': '_'}, + {'format_id': 'vid', 'ext': 'mp4', 'preference': 3, 'url': '_'}, + ] + 
info_dict = _make_result(formats) + + ydl = YDL({'format': 'bestvideo'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'dash-video-high') + + ydl = YDL({'format': 'worstvideo'}) + ydl.process_ie_result(info_dict.copy()) + downloaded = ydl.downloaded_info_dicts[0] + self.assertEqual(downloaded['format_id'], 'dash-video-low') + def test_youtube_format_selection(self): order = [ '38', '37', '46', '22', '45', '35', '44', '18', '34', '43', '6', '5', '36', '17', '13', @@ -199,10 +227,12 @@ class TestFormatSelection(unittest.TestCase): for f1id, f2id in zip(order, order[1:]): f1 = YoutubeIE._formats[f1id].copy() f1['format_id'] = f1id + f1['url'] = 'url:' + f1id f2 = YoutubeIE._formats[f2id].copy() f2['format_id'] = f2id + f2['url'] = 'url:' + f2id - info_dict = {'formats': [f1, f2], 'extractor': 'youtube'} + info_dict = _make_result([f1, f2], extractor='youtube') ydl = YDL() yie = YoutubeIE(ydl) yie._sort_formats(info_dict['formats']) @@ -210,7 +240,7 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['format_id'], f1id) - info_dict = {'formats': [f2, f1], 'extractor': 'youtube'} + info_dict = _make_result([f2, f1], extractor='youtube') ydl = YDL() yie = YoutubeIE(ydl) yie._sort_formats(info_dict['formats']) @@ -244,6 +274,12 @@ class TestFormatSelection(unittest.TestCase): # Replace missing fields with 'NA' self.assertEqual(fname('%(uploader_date)s-%(id)s.%(ext)s'), 'NA-1234.mp4') + def test_format_note(self): + ydl = YoutubeDL() + self.assertEqual(ydl._format_note({}), '') + assertRegexpMatches(self, ydl._format_note({ + 'vbr': 10, + }), '^\s*10k$') if __name__ == '__main__': unittest.main() diff --git a/test/test_age_restriction.py b/test/test_age_restriction.py index c9cdb96..71e80b0 100644 --- a/test/test_age_restriction.py +++ b/test/test_age_restriction.py @@ -13,7 +13,7 @@ from youtube_dl import YoutubeDL def 
_download_restricted(url, filename, age): - """ Returns true iff the file has been downloaded """ + """ Returns true if the file has been downloaded """ params = { 'age_limit': age, diff --git a/test/test_all_urls.py b/test/test_all_urls.py index aa8e4e4..4b56137 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -9,7 +9,7 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import get_testcases +from test.helper import gettestcases from youtube_dl.extractor import ( FacebookIE, @@ -49,6 +49,7 @@ class TestAllURLsMatching(unittest.TestCase): self.assertMatch('http://youtu.be/BaW_jenozKc', ['youtube']) self.assertMatch('http://www.youtube.com/v/BaW_jenozKc', ['youtube']) self.assertMatch('https://youtube.googleapis.com/v/BaW_jenozKc', ['youtube']) + self.assertMatch('http://www.cleanvideosearch.com/media/action/yt/watch?videoId=8v_4O44sfjM', ['youtube']) def test_youtube_channel_matching(self): assertChannel = lambda url: self.assertMatch(url, ['youtube:channel']) @@ -68,21 +69,28 @@ class TestAllURLsMatching(unittest.TestCase): def test_youtube_show_matching(self): self.assertMatch('http://www.youtube.com/show/airdisasters', ['youtube:show']) + def test_youtube_truncated(self): + self.assertMatch('http://www.youtube.com/watch?', ['youtube:truncated_url']) + + def test_youtube_search_matching(self): + self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url']) + self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url']) + def test_justin_tv_channelid_matching(self): - self.assertTrue(JustinTVIE.suitable(u"justin.tv/vanillatv")) - self.assertTrue(JustinTVIE.suitable(u"twitch.tv/vanillatv")) - self.assertTrue(JustinTVIE.suitable(u"www.justin.tv/vanillatv")) - self.assertTrue(JustinTVIE.suitable(u"www.twitch.tv/vanillatv")) - 
self.assertTrue(JustinTVIE.suitable(u"http://www.justin.tv/vanillatv")) - self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/vanillatv")) - self.assertTrue(JustinTVIE.suitable(u"http://www.justin.tv/vanillatv/")) - self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/vanillatv/")) + self.assertTrue(JustinTVIE.suitable('justin.tv/vanillatv')) + self.assertTrue(JustinTVIE.suitable('twitch.tv/vanillatv')) + self.assertTrue(JustinTVIE.suitable('www.justin.tv/vanillatv')) + self.assertTrue(JustinTVIE.suitable('www.twitch.tv/vanillatv')) + self.assertTrue(JustinTVIE.suitable('http://www.justin.tv/vanillatv')) + self.assertTrue(JustinTVIE.suitable('http://www.twitch.tv/vanillatv')) + self.assertTrue(JustinTVIE.suitable('http://www.justin.tv/vanillatv/')) + self.assertTrue(JustinTVIE.suitable('http://www.twitch.tv/vanillatv/')) def test_justintv_videoid_matching(self): - self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/vanillatv/b/328087483")) + self.assertTrue(JustinTVIE.suitable('http://www.twitch.tv/vanillatv/b/328087483')) def test_justin_tv_chapterid_matching(self): - self.assertTrue(JustinTVIE.suitable(u"http://www.twitch.tv/tsm_theoddone/c/2349361")) + self.assertTrue(JustinTVIE.suitable('http://www.twitch.tv/tsm_theoddone/c/2349361')) def test_youtube_extract(self): assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id) @@ -98,7 +106,7 @@ class TestAllURLsMatching(unittest.TestCase): def test_no_duplicates(self): ies = gen_extractors() - for tc in get_testcases(): + for tc in gettestcases(include_onlymatching=True): url = tc['url'] for ie in ies: if type(ie).__name__ in ('GenericIE', tc['name'] + 'IE'): @@ -117,6 +125,8 @@ class TestAllURLsMatching(unittest.TestCase): def test_vimeo_matching(self): self.assertMatch('http://vimeo.com/channels/tributes', ['vimeo:channel']) + self.assertMatch('http://vimeo.com/channels/31259', ['vimeo:channel']) + self.assertMatch('http://vimeo.com/channels/31259/53576664', ['vimeo']) 
self.assertMatch('http://vimeo.com/user7108434', ['vimeo:user']) self.assertMatch('http://vimeo.com/user7108434/videos', ['vimeo:user']) self.assertMatch('https://vimeo.com/user21297594/review/75524534/3c257a1b5d', ['vimeo:review']) @@ -132,6 +142,40 @@ class TestAllURLsMatching(unittest.TestCase): def test_pbs(self): # https://github.com/rg3/youtube-dl/issues/2350 self.assertMatch('http://video.pbs.org/viralplayer/2365173446/', ['PBS']) + self.assertMatch('http://video.pbs.org/widget/partnerplayer/980042464/', ['PBS']) + + def test_ComedyCentralShows(self): + self.assertMatch( + 'http://thedailyshow.cc.com/extended-interviews/xm3fnq/andrew-napolitano-extended-interview', + ['ComedyCentralShows']) + self.assertMatch( + 'http://thecolbertreport.cc.com/videos/29w6fx/-realhumanpraise-for-fox-news', + ['ComedyCentralShows']) + self.assertMatch( + 'http://thecolbertreport.cc.com/videos/gh6urb/neil-degrasse-tyson-pt--1?xrs=eml_col_031114', + ['ComedyCentralShows']) + self.assertMatch( + 'http://thedailyshow.cc.com/guests/michael-lewis/3efna8/exclusive---michael-lewis-extended-interview-pt--3', + ['ComedyCentralShows']) + self.assertMatch( + 'http://thedailyshow.cc.com/episodes/sy7yv0/april-8--2014---denis-leary', + ['ComedyCentralShows']) + self.assertMatch( + 'http://thecolbertreport.cc.com/episodes/8ase07/april-8--2014---jane-goodall', + ['ComedyCentralShows']) + self.assertMatch( + 'http://thedailyshow.cc.com/video-playlists/npde3s/the-daily-show-19088-highlights', + ['ComedyCentralShows']) + self.assertMatch( + 'http://thedailyshow.cc.com/special-editions/2l8fdb/special-edition---a-look-back-at-food', + ['ComedyCentralShows']) + + def test_yahoo_https(self): + # https://github.com/rg3/youtube-dl/issues/2701 + self.assertMatch( + 'https://screen.yahoo.com/smartwatches-latest-wearable-gadgets-163745379-cbs.html', + ['Yahoo']) + if __name__ == '__main__': unittest.main() diff --git a/test/test_download.py b/test/test_download.py index 7587a18..f171c10 100644 --- 
a/test/test_download.py +++ b/test/test_download.py @@ -8,10 +8,11 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import ( get_params, - get_testcases, - try_rm, + gettestcases, + expect_info_dict, md5, - report_warning + try_rm, + report_warning, ) @@ -50,7 +51,7 @@ def _file_md5(fn): with open(fn, 'rb') as f: return hashlib.md5(f.read()).hexdigest() -defs = get_testcases() +defs = gettestcases() class TestDownload(unittest.TestCase): @@ -72,9 +73,7 @@ def generator(test_case): if 'playlist' not in test_case: info_dict = test_case.get('info_dict', {}) if not test_case.get('file') and not (info_dict.get('id') and info_dict.get('ext')): - print_skipping('The output file cannot be know, the "file" ' - 'key is missing or the info_dict is incomplete') - return + raise Exception('Test definition incorrect. The output file cannot be known. Are both \'id\' and \'ext\' keys present?') if 'skip' in test_case: print_skipping(test_case['skip']) return @@ -136,27 +135,8 @@ def generator(test_case): self.assertEqual(md5_for_file, tc['md5']) with io.open(info_json_fn, encoding='utf-8') as infof: info_dict = json.load(infof) - for (info_field, expected) in tc.get('info_dict', {}).items(): - if isinstance(expected, compat_str) and expected.startswith('md5:'): - got = 'md5:' + md5(info_dict.get(info_field)) - else: - got = info_dict.get(info_field) - self.assertEqual(expected, got, - u'invalid value for field %s, expected %r, got %r' % (info_field, expected, got)) - - # If checkable fields are missing from the test case, print the info_dict - test_info_dict = dict((key, value if not isinstance(value, compat_str) or len(value) < 250 else 'md5:' + md5(value)) - for key, value in info_dict.items() - if value and key in ('title', 'description', 'uploader', 'upload_date', 'uploader_id', 'location')) - if not all(key in tc.get('info_dict', {}).keys() for key in test_info_dict.keys()): - sys.stderr.write(u'\n"info_dict": ' + 
json.dumps(test_info_dict, ensure_ascii=False, indent=4) + u'\n') - - # Check for the presence of mandatory fields - for key in ('id', 'url', 'title', 'ext'): - self.assertTrue(key in info_dict.keys() and info_dict[key]) - # Check for mandatory fields that are automatically set by YoutubeDL - for key in ['webpage_url', 'extractor', 'extractor_key']: - self.assertTrue(info_dict.get(key), u'Missing field: %s' % key) + + expect_info_dict(self, tc.get('info_dict', {}), info_dict) finally: try_rm_tcs_files() diff --git a/test/test_playlists.py b/test/test_playlists.py index 1de9e8e..465b07b 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -9,8 +9,11 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import FakeYDL - +from test.helper import ( + assertRegexpMatches, + expect_info_dict, + FakeYDL, +) from youtube_dl.extractor import ( AcademicEarthCourseIE, @@ -20,9 +23,12 @@ from youtube_dl.extractor import ( VimeoUserIE, VimeoAlbumIE, VimeoGroupsIE, + VineUserIE, UstreamChannelIE, SoundcloudSetIE, SoundcloudUserIE, + SoundcloudPlaylistIE, + TeacherTubeClassroomIE, LivestreamIE, NHLVideocenterIE, BambuserChannelIE, @@ -36,6 +42,12 @@ from youtube_dl.extractor import ( RutubeChannelIE, GoogleSearchIE, GenericIE, + TEDIE, + ToypicsUserIE, + XTubeUserIE, + InstagramUserIE, + CSpanIE, + AolIE, ) @@ -92,13 +104,20 @@ class TestPlaylists(unittest.TestCase): self.assertEqual(result['title'], 'Rolex Awards for Enterprise') self.assertTrue(len(result['entries']) > 72) + def test_vine_user(self): + dl = FakeYDL() + ie = VineUserIE(dl) + result = ie.extract('https://vine.co/Visa') + self.assertIsPlaylist(result) + self.assertTrue(len(result['entries']) >= 50) + def test_ustream_channel(self): dl = FakeYDL() ie = UstreamChannelIE(dl) result = ie.extract('http://www.ustream.tv/channel/young-americans-for-liberty') self.assertIsPlaylist(result) self.assertEqual(result['id'], '5124905') - 
self.assertTrue(len(result['entries']) >= 11) + self.assertTrue(len(result['entries']) >= 6) def test_soundcloud_set(self): dl = FakeYDL() @@ -116,6 +135,17 @@ class TestPlaylists(unittest.TestCase): self.assertEqual(result['id'], '9615865') self.assertTrue(len(result['entries']) >= 12) + def test_soundcloud_playlist(self): + dl = FakeYDL() + ie = SoundcloudPlaylistIE(dl) + result = ie.extract('http://api.soundcloud.com/playlists/4110309') + self.assertIsPlaylist(result) + self.assertEqual(result['id'], '4110309') + self.assertEqual(result['title'], 'TILT Brass - Bowery Poetry Club, August \'03 [Non-Site SCR 02]') + assertRegexpMatches( + self, result['description'], r'TILT Brass - Bowery Poetry Club') + self.assertEqual(len(result['entries']), 6) + def test_livestream_event(self): dl = FakeYDL() ie = LivestreamIE(dl) @@ -170,30 +200,30 @@ class TestPlaylists(unittest.TestCase): def test_AcademicEarthCourse(self): dl = FakeYDL() ie = AcademicEarthCourseIE(dl) - result = ie.extract('http://academicearth.org/courses/building-dynamic-websites/') + result = ie.extract('http://academicearth.org/playlists/laws-of-nature/') self.assertIsPlaylist(result) - self.assertEqual(result['id'], 'building-dynamic-websites') - self.assertEqual(result['title'], 'Building Dynamic Websites') - self.assertEqual(result['description'], u"Today's websites are increasingly dynamic. Pages are no longer static HTML files but instead generated by scripts and database calls. User interfaces are more seamless, with technologies like Ajax replacing traditional page reloads. This course teaches students how to build dynamic websites with Ajax and with Linux, Apache, MySQL, and PHP (LAMP), one of today's most popular frameworks. 
Students learn how to set up domain names with DNS, how to structure pages with XHTML and CSS, how to program in JavaScript and PHP, how to configure Apache and MySQL, how to design and query databases with SQL, how to use Ajax with both XML and JSON, and how to build mashups. The course explores issues of security, scalability, and cross-browser support and also discusses enterprise-level deployments of websites, including third-party hosting, virtualization, colocation in data centers, firewalling, and load-balancing.") - self.assertEqual(len(result['entries']), 10) + self.assertEqual(result['id'], 'laws-of-nature') + self.assertEqual(result['title'], 'Laws of Nature') + self.assertEqual(result['description'],u'Introduce yourself to the laws of nature with these free online college lectures from Yale, Harvard, and MIT.')# u"Today's websites are increasingly dynamic. Pages are no longer static HTML files but instead generated by scripts and database calls. User interfaces are more seamless, with technologies like Ajax replacing traditional page reloads. This course teaches students how to build dynamic websites with Ajax and with Linux, Apache, MySQL, and PHP (LAMP), one of today's most popular frameworks. Students learn how to set up domain names with DNS, how to structure pages with XHTML and CSS, how to program in JavaScript and PHP, how to configure Apache and MySQL, how to design and query databases with SQL, how to use Ajax with both XML and JSON, and how to build mashups. 
The course explores issues of security, scalability, and cross-browser support and also discusses enterprise-level deployments of websites, including third-party hosting, virtualization, colocation in data centers, firewalling, and load-balancing.") + self.assertEqual(len(result['entries']), 4) def test_ivi_compilation(self): dl = FakeYDL() ie = IviCompilationIE(dl) - result = ie.extract('http://www.ivi.ru/watch/dezhurnyi_angel') + result = ie.extract('http://www.ivi.ru/watch/dvoe_iz_lartsa') self.assertIsPlaylist(result) - self.assertEqual(result['id'], 'dezhurnyi_angel') - self.assertEqual(result['title'], 'Дежурный ангел (2010 - 2012)') - self.assertTrue(len(result['entries']) >= 36) - + self.assertEqual(result['id'], 'dvoe_iz_lartsa') + self.assertEqual(result['title'], 'Двое из ларца (2006 - 2008)') + self.assertTrue(len(result['entries']) >= 24) + def test_ivi_compilation_season(self): dl = FakeYDL() ie = IviCompilationIE(dl) - result = ie.extract('http://www.ivi.ru/watch/dezhurnyi_angel/season2') + result = ie.extract('http://www.ivi.ru/watch/dvoe_iz_lartsa/season1') self.assertIsPlaylist(result) - self.assertEqual(result['id'], 'dezhurnyi_angel/season2') - self.assertEqual(result['title'], 'Дежурный ангел (2010 - 2012) 2 сезон') - self.assertTrue(len(result['entries']) >= 20) + self.assertEqual(result['id'], 'dvoe_iz_lartsa/season1') + self.assertEqual(result['title'], 'Двое из ларца (2006 - 2008) 1 сезон') + self.assertTrue(len(result['entries']) >= 12) def test_imdb_list(self): dl = FakeYDL() @@ -248,7 +278,96 @@ class TestPlaylists(unittest.TestCase): self.assertIsPlaylist(result) self.assertEqual(result['id'], 'python language') self.assertEqual(result['title'], 'python language') - self.assertTrue(len(result['entries']) == 15) + self.assertEqual(len(result['entries']), 15) + + def test_generic_rss_feed(self): + dl = FakeYDL() + ie = GenericIE(dl) + result = ie.extract('http://phihag.de/2014/youtube-dl/rss.xml') + self.assertIsPlaylist(result) + 
self.assertEqual(result['id'], 'http://phihag.de/2014/youtube-dl/rss.xml') + self.assertEqual(result['title'], 'Zero Punctuation') + self.assertTrue(len(result['entries']) > 10) + + def test_ted_playlist(self): + dl = FakeYDL() + ie = TEDIE(dl) + result = ie.extract('http://www.ted.com/playlists/who_are_the_hackers') + self.assertIsPlaylist(result) + self.assertEqual(result['id'], '10') + self.assertEqual(result['title'], 'Who are the hackers?') + self.assertTrue(len(result['entries']) >= 6) + + def test_toypics_user(self): + dl = FakeYDL() + ie = ToypicsUserIE(dl) + result = ie.extract('http://videos.toypics.net/Mikey') + self.assertIsPlaylist(result) + self.assertEqual(result['id'], 'Mikey') + self.assertTrue(len(result['entries']) >= 17) + + def test_xtube_user(self): + dl = FakeYDL() + ie = XTubeUserIE(dl) + result = ie.extract('http://www.xtube.com/community/profile.php?user=greenshowers') + self.assertIsPlaylist(result) + self.assertEqual(result['id'], 'greenshowers') + self.assertTrue(len(result['entries']) >= 155) + + def test_InstagramUser(self): + dl = FakeYDL() + ie = InstagramUserIE(dl) + result = ie.extract('http://instagram.com/porsche') + self.assertIsPlaylist(result) + self.assertEqual(result['id'], 'porsche') + self.assertTrue(len(result['entries']) >= 2) + test_video = next( + e for e in result['entries'] + if e['id'] == '614605558512799803_462752227') + dl.add_default_extra_info(test_video, ie, '(irrelevant URL)') + dl.process_video_result(test_video, download=False) + EXPECTED = { + 'id': '614605558512799803_462752227', + 'ext': 'mp4', + 'title': '#Porsche Intelligent Performance.', + 'thumbnail': 're:^https?://.*\.jpg', + 'uploader': 'Porsche', + 'uploader_id': 'porsche', + 'timestamp': 1387486713, + 'upload_date': '20131219', + } + expect_info_dict(self, EXPECTED, test_video) + + def test_CSpan_playlist(self): + dl = FakeYDL() + ie = CSpanIE(dl) + result = ie.extract( + 'http://www.c-span.org/video/?318608-1/gm-ignition-switch-recall') + 
self.assertIsPlaylist(result) + self.assertEqual(result['id'], '342759') + self.assertEqual( + result['title'], 'General Motors Ignition Switch Recall') + whole_duration = sum(e['duration'] for e in result['entries']) + self.assertEqual(whole_duration, 14855) + + def test_aol_playlist(self): + dl = FakeYDL() + ie = AolIE(dl) + result = ie.extract( + 'http://on.aol.com/playlist/brace-yourself---todays-weirdest-news-152147?icid=OnHomepageC4_Omg_Img#_videoid=518184316') + self.assertIsPlaylist(result) + self.assertEqual(result['id'], '152147') + self.assertEqual( + result['title'], 'Brace Yourself - Today\'s Weirdest News') + self.assertTrue(len(result['entries']) >= 10) + + def test_TeacherTubeClassroom(self): + dl = FakeYDL() + ie = TeacherTubeClassroomIE(dl) + result = ie.extract('http://www.teachertube.com/view_classroom.php?user=rbhagwati2') + self.assertIsPlaylist(result) + self.assertEqual(result['id'], 'rbhagwati2') + self.assertTrue(len(result['entries']) >= 20) if __name__ == '__main__': unittest.main() diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 79991e6..5736fe5 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -181,7 +181,7 @@ class TestTedSubtitles(BaseTestSubtitles): self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() - self.assertEqual(len(subtitles.keys()), 28) + self.assertTrue(len(subtitles.keys()) >= 28) def test_list_subtitles(self): self.DL.expect_warning(u'Automatic Captions not supported by this server') diff --git a/test/test_utils.py b/test/test_utils.py index 84553b9..51eb0b6 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -9,6 +9,8 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) # Various small unit tests +import io +import json import xml.etree.ElementTree #from youtube_dl.utils import htmlentity_transform @@ -21,6 +23,7 @@ from youtube_dl.utils import ( orderedSet, PagedList, parse_duration, + 
read_batch_urls, sanitize_filename, shell_quote, smuggle_url, @@ -31,7 +34,11 @@ from youtube_dl.utils import ( unified_strdate, unsmuggle_url, url_basename, + urlencode_postdata, xpath_with_ns, + parse_iso8601, + strip_jsonp, + uppercase_escape, ) if sys.version_info < (3, 0): @@ -250,5 +257,32 @@ class TestUtil(unittest.TestCase): def test_struct_unpack(self): self.assertEqual(struct_unpack(u'!B', b'\x00'), (0,)) + def test_read_batch_urls(self): + f = io.StringIO(u'''\xef\xbb\xbf foo + bar\r + baz + # More after this line\r + ; or after this + bam''') + self.assertEqual(read_batch_urls(f), [u'foo', u'bar', u'baz', u'bam']) + + def test_urlencode_postdata(self): + data = urlencode_postdata({'username': 'foo@bar.com', 'password': '1234'}) + self.assertTrue(isinstance(data, bytes)) + + def test_parse_iso8601(self): + self.assertEqual(parse_iso8601('2014-03-23T23:04:26+0100'), 1395612266) + self.assertEqual(parse_iso8601('2014-03-23T22:04:26+0000'), 1395612266) + self.assertEqual(parse_iso8601('2014-03-23T22:04:26Z'), 1395612266) + + def test_strip_jsonp(self): + stripped = strip_jsonp('cb ([ {"id":"532cb",\n\n\n"x":\n3}\n]\n);') + d = json.loads(stripped) + self.assertEqual(d, [{"id": "532cb", "x": 3}]) + + def test_uppercase_escpae(self): + self.assertEqual(uppercase_escape(u'aä'), u'aä') + self.assertEqual(uppercase_escape(u'\\U0001d550'), u'𝕐') + if __name__ == '__main__': unittest.main() diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index 38ac989..3aadedd 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -16,6 +16,7 @@ from youtube_dl.extractor import ( YoutubeChannelIE, YoutubeShowIE, YoutubeTopListIE, + YoutubeSearchURLIE, ) @@ -111,13 +112,15 @@ class TestYoutubeLists(unittest.TestCase): def test_youtube_mix(self): dl = FakeYDL() ie = YoutubePlaylistIE(dl) - result = ie.extract('http://www.youtube.com/watch?v=lLJf9qJHR3E&list=RDrjFaenf1T-Y') + result = 
ie.extract('https://www.youtube.com/watch?v=W01L70IGBgE&index=2&list=RDOQpdSVF_k_w') entries = result['entries'] self.assertTrue(len(entries) >= 20) original_video = entries[0] - self.assertEqual(original_video['id'], 'rjFaenf1T-Y') + self.assertEqual(original_video['id'], 'OQpdSVF_k_w') def test_youtube_toptracks(self): + print('Skipping: The playlist page gives error 500') + return dl = FakeYDL() ie = YoutubePlaylistIE(dl) result = ie.extract('https://www.youtube.com/playlist?list=MCUS') @@ -131,5 +134,14 @@ class TestYoutubeLists(unittest.TestCase): entries = result['entries'] self.assertTrue(len(entries) >= 5) + def test_youtube_search_url(self): + dl = FakeYDL() + ie = YoutubeSearchURLIE(dl) + result = ie.extract('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video') + entries = result['entries'] + self.assertIsPlaylist(result) + self.assertEqual(result['title'], 'youtube-dl test video') + self.assertTrue(len(entries) >= 5) + if __name__ == '__main__': unittest.main() diff --git a/youtube-dl b/youtube-dl index 063e40d..b98d36a 100755 Binary files a/youtube-dl and b/youtube-dl differ diff --git a/youtube-dl.1 b/youtube-dl.1 index 7abbe59..f17addd 100644 --- a/youtube-dl.1 +++ b/youtube-dl.1 @@ -24,7 +24,7 @@ redistribute it or use it however you like. 
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ sure\ that\ you\ have\ sufficient\ permissions \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ (run\ with\ sudo\ if\ needed) \-i,\ \-\-ignore\-errors\ \ \ \ \ \ \ \ \ \ \ \ \ \ continue\ on\ download\ errors,\ for\ example\ to -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ to\ skip\ unavailable\ videos\ in\ a\ playlist +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ skip\ unavailable\ videos\ in\ a\ playlist \-\-abort\-on\-error\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ Abort\ downloading\ of\ further\ videos\ (in\ the \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ playlist\ or\ the\ command\ line)\ if\ an\ error \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ occurs @@ -32,6 +32,9 @@ redistribute it or use it however you like. \-\-user\-agent\ UA\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ specify\ a\ custom\ user\ agent \-\-referer\ REF\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ specify\ a\ custom\ referer,\ use\ if\ the\ video \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ access\ is\ restricted\ to\ one\ domain +\-\-add\-header\ FIELD:VALUE\ \ \ \ \ \ \ \ \ specify\ a\ custom\ HTTP\ header\ and\ its\ value, +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ separated\ by\ a\ colon\ \[aq]:\[aq].\ You\ can\ use\ this +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ option\ multiple\ times \-\-list\-extractors\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ List\ all\ supported\ extractors\ and\ the\ URLs \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ they\ would\ handle \-\-extractor\-descriptions\ \ \ \ \ \ \ \ \ Output\ descriptions\ of\ all\ supported @@ -40,6 +43,9 @@ redistribute it or use it however you like. 
\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ an\ empty\ string\ (\-\-proxy\ "")\ for\ direct \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ connection \-\-no\-check\-certificate\ \ \ \ \ \ \ \ \ \ \ Suppress\ HTTPS\ certificate\ validation. +\-\-prefer\-insecure\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ Use\ an\ unencrypted\ connection\ to\ retrieve +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ information\ about\ the\ video.\ (Currently +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ supported\ only\ for\ YouTube) \-\-cache\-dir\ DIR\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ Location\ in\ the\ filesystem\ where\ youtube\-dl \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ can\ store\ some\ downloaded\ information \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ permanently.\ By\ default\ $XDG_CACHE_HOME @@ -63,6 +69,7 @@ redistribute it or use it however you like. \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ configuration\ in\ ~/.config/youtube\-dl.conf \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ (%APPDATA%/youtube\-dl/config.txt\ on \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ Windows) +\-\-encoding\ ENCODING\ \ \ \ \ \ \ \ \ \ \ \ \ \ Force\ the\ specified\ encoding\ (experimental) \f[] .fi .SS Video Selection: @@ -140,8 +147,12 @@ redistribute it or use it however you like. \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ video\ id,\ %(playlist)s\ for\ the\ playlist\ the \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ video\ is\ in,\ %(playlist_index)s\ for\ the \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ position\ in\ the\ playlist\ and\ %%\ for\ a -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ literal\ percent.\ Use\ \-\ to\ output\ to\ stdout. 
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ Can\ also\ be\ used\ to\ download\ to\ a\ different +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ literal\ percent.\ %(height)s\ and\ %(width)s +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ for\ the\ width\ and\ height\ of\ the\ video +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ format.\ %(resolution)s\ for\ a\ textual +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ description\ of\ the\ resolution\ of\ the\ video +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ format.\ Use\ \-\ to\ output\ to\ stdout.\ Can\ also +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ be\ used\ to\ download\ to\ a\ different \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ directory,\ for\ example\ with\ \-o\ \[aq]/my/downloa \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ ds/%(uploader)s/%(title)s\-%(id)s.%(ext)s\[aq]\ . \-\-autonumber\-size\ NUMBER\ \ \ \ \ \ \ \ \ Specifies\ the\ number\ of\ digits\ in @@ -179,6 +190,7 @@ redistribute it or use it however you like. .nf \f[C] \-q,\ \-\-quiet\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ activates\ quiet\ mode +\-\-no\-warnings\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ Ignore\ warnings \-s,\ \-\-simulate\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ do\ not\ download\ the\ video\ and\ do\ not\ write \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ anything\ to\ disk \-\-skip\-download\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ do\ not\ download\ the\ video @@ -190,7 +202,9 @@ redistribute it or use it however you like. 
\-\-get\-duration\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ simulate,\ quiet\ but\ print\ video\ length \-\-get\-filename\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ simulate,\ quiet\ but\ print\ output\ filename \-\-get\-format\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ simulate,\ quiet\ but\ print\ output\ format -\-j,\ \-\-dump\-json\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ simulate,\ quiet\ but\ print\ JSON\ information +\-j,\ \-\-dump\-json\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ simulate,\ quiet\ but\ print\ JSON\ information. +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ See\ \-\-output\ for\ a\ description\ of\ available +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ keys. \-\-newline\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ output\ progress\ bar\ as\ new\ lines \-\-no\-progress\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ do\ not\ print\ progress\ bar \-\-console\-title\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ display\ progress\ in\ console\ titlebar @@ -211,9 +225,9 @@ redistribute it or use it however you like. \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ preference\ using\ slashes:\ "\-f\ 22/17/18". \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ "\-f\ mp4"\ and\ "\-f\ flv"\ are\ also\ supported. \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ You\ can\ also\ use\ the\ special\ names\ "best", -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ "bestaudio",\ "worst",\ and\ "worstaudio".\ By -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ default,\ youtube\-dl\ will\ pick\ the\ best -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ quality. 
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ "bestvideo",\ "bestaudio",\ "worst", +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ "worstvideo"\ and\ "worstaudio".\ By\ default, +\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ youtube\-dl\ will\ pick\ the\ best\ quality. \-\-all\-formats\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ download\ all\ available\ video\ formats \-\-prefer\-free\-formats\ \ \ \ \ \ \ \ \ \ \ \ prefer\ free\ video\ formats\ unless\ a\ specific \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ one\ is\ requested @@ -272,6 +286,7 @@ redistribute it or use it however you like. \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ default \-\-embed\-subs\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ embed\ subtitles\ in\ the\ video\ (only\ for\ mp4 \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ videos) +\-\-embed\-thumbnail\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ embed\ thumbnail\ in\ the\ audio\ as\ cover\ art \-\-add\-metadata\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ write\ metadata\ to\ the\ video\ file \-\-xattrs\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ write\ metadata\ to\ the\ video\ file\[aq]s\ xattrs \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ (using\ dublin\ core\ and\ xdg\ standards) @@ -286,7 +301,7 @@ redistribute it or use it however you like. You can configure youtube\-dl by placing default arguments (such as \f[C]\-\-extract\-audio\ \-\-no\-mtime\f[] to always extract the audio and not copy the mtime) into \f[C]/etc/youtube\-dl.conf\f[] and/or -\f[C]~/.config/youtube\-dl.conf\f[]. +\f[C]~/.config/youtube\-dl/config\f[]. On Windows, the configuration file locations are \f[C]%APPDATA%\\youtube\-dl\\config.txt\f[] and \f[C]C:\\Users\\\\youtube\-dl.conf\f[]. 
@@ -359,12 +374,19 @@ Relative dates: Dates in the format \f[C](now|today)[+\-][0\-9](day|week|month|year)(s)?\f[] .PP Examples: -.PP -$ # Download only the videos uploaded in the last 6 months $ youtube\-dl -\-\-dateafter now\-6months $ # Download only the videos uploaded on -January 1, 1970 $ youtube\-dl \-\-date 19700101 $ # will only download -the videos uploaded in the 200x decade $ youtube\-dl \-\-dateafter -20000101 \-\-datebefore 20091231 +.IP +.nf +\f[C] +#\ Download\ only\ the\ videos\ uploaded\ in\ the\ last\ 6\ months +$\ youtube\-dl\ \-\-dateafter\ now\-6months + +#\ Download\ only\ the\ videos\ uploaded\ on\ January\ 1,\ 1970 +$\ youtube\-dl\ \-\-date\ 19700101 + +$\ #\ will\ only\ download\ the\ videos\ uploaded\ in\ the\ 200x\ decade +$\ youtube\-dl\ \-\-dateafter\ 20000101\ \-\-datebefore\ 20091231 +\f[] +.fi .SH FAQ .SS Can you please put the \-b option back? .PP @@ -473,19 +495,108 @@ zip nosetests .SS Adding support for a new site .PP -If you want to add support for a new site, copy \f[I]any\f[] recently -modified (https://github.com/rg3/youtube-dl/commits/master/youtube_dl/extractor) -file in \f[C]youtube_dl/extractor\f[], add an import in +If you want to add support for a new site, you can follow this quick +list (assuming your service is called \f[C]yourextractor\f[]): +.IP " 1." 4 +Fork this repository (https://github.com/rg3/youtube-dl/fork) +.IP " 2." 4 +Check out the source code with +\f[C]git\ clone\ git\@github.com:YOUR_GITHUB_USERNAME/youtube\-dl.git\f[] +.IP " 3." 4 +Start a new git branch with +\f[C]cd\ youtube\-dl;\ git\ checkout\ \-b\ yourextractor\f[] +.IP " 4." 
4 +Start with this simple template and save it to +\f[C]youtube_dl/extractor/yourextractor.py\f[]: +.RS 4 +.IP +.nf +\f[C] +#\ coding:\ utf\-8 +from\ __future__\ import\ unicode_literals + +import\ re + +from\ .common\ import\ InfoExtractor + + +class\ YourExtractorIE(InfoExtractor): +\ \ \ \ _VALID_URL\ =\ r\[aq]https?://(?:www\\.)?yourextractor\\.com/watch/(?P[0\-9]+)\[aq] +\ \ \ \ _TEST\ =\ { +\ \ \ \ \ \ \ \ \[aq]url\[aq]:\ \[aq]http://yourextractor.com/watch/42\[aq], +\ \ \ \ \ \ \ \ \[aq]md5\[aq]:\ \[aq]TODO:\ md5\ sum\ of\ the\ first\ 10KiB\ of\ the\ video\ file\[aq], +\ \ \ \ \ \ \ \ \[aq]info_dict\[aq]:\ { +\ \ \ \ \ \ \ \ \ \ \ \ \[aq]id\[aq]:\ \[aq]42\[aq], +\ \ \ \ \ \ \ \ \ \ \ \ \[aq]ext\[aq]:\ \[aq]mp4\[aq], +\ \ \ \ \ \ \ \ \ \ \ \ \[aq]title\[aq]:\ \[aq]Video\ title\ goes\ here\[aq], +\ \ \ \ \ \ \ \ \ \ \ \ #\ TODO\ more\ properties,\ either\ as: +\ \ \ \ \ \ \ \ \ \ \ \ #\ *\ A\ value +\ \ \ \ \ \ \ \ \ \ \ \ #\ *\ MD5\ checksum;\ start\ the\ string\ with\ md5: +\ \ \ \ \ \ \ \ \ \ \ \ #\ *\ A\ regular\ expression;\ start\ the\ string\ with\ re: +\ \ \ \ \ \ \ \ \ \ \ \ #\ *\ Any\ Python\ type\ (for\ example\ int\ or\ float) +\ \ \ \ \ \ \ \ } +\ \ \ \ } + +\ \ \ \ def\ _real_extract(self,\ url): +\ \ \ \ \ \ \ \ mobj\ =\ re.match(self._VALID_URL,\ url) +\ \ \ \ \ \ \ \ video_id\ =\ mobj.group(\[aq]id\[aq]) + +\ \ \ \ \ \ \ \ #\ TODO\ more\ code\ goes\ here,\ for\ example\ ... +\ \ \ \ \ \ \ \ webpage\ =\ self._download_webpage(url,\ video_id) +\ \ \ \ \ \ \ \ title\ =\ self._html_search_regex(r\[aq]

(.*?)

\[aq],\ webpage,\ \[aq]title\[aq]) + +\ \ \ \ \ \ \ \ return\ { +\ \ \ \ \ \ \ \ \ \ \ \ \[aq]id\[aq]:\ video_id, +\ \ \ \ \ \ \ \ \ \ \ \ \[aq]title\[aq]:\ title, +\ \ \ \ \ \ \ \ \ \ \ \ #\ TODO\ more\ properties\ (see\ youtube_dl/extractor/common.py) +\ \ \ \ \ \ \ \ } +\f[] +.fi +.RE +.IP " 5." 4 +Add an import in \f[C]youtube_dl/extractor/__init__.py\f[] (https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/__init__.py). +.IP " 6." 4 +Run +\f[C]python\ test/test_download.py\ TestDownload.test_YourExtractor\f[]. +This \f[I]should fail\f[] at first, but you can continually re\-run it +until you\[aq]re done. +.IP " 7." 4 Have a look at \f[C]youtube_dl/common/extractor/common.py\f[] (https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a detailed description of what your extractor should return (https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L38). -Don\[aq]t forget to run the tests with -\f[C]python\ test/test_download.py\ Test_Download.test_YourExtractor\f[]! -For a detailed tutorial, refer to this blog -post (http://filippo.io/add-support-for-a-new-video-site-to-youtube-dl/). +Add tests and code for as many as you want. +.IP " 8." 4 +If you can, check the code with +pyflakes (https://pypi.python.org/pypi/pyflakes) (a good idea) and +pep8 (https://pypi.python.org/pypi/pep8) (optional, ignore E501). +.IP " 9." 4 +When the tests pass, +add (https://www.kernel.org/pub/software/scm/git/docs/git-add.html) the +new files and +commit (https://www.kernel.org/pub/software/scm/git/docs/git-commit.html) +them and +push (https://www.kernel.org/pub/software/scm/git/docs/git-push.html) +the result, like this: +.RS 4 +.IP +.nf +\f[C] +$\ git\ add\ youtube_dl/extractor/__init__.py +$\ git\ add\ youtube_dl/extractor/yourextractor.py +$\ git\ commit\ \-m\ \[aq][yourextractor]\ Add\ new\ extractor\[aq] +$\ git\ push\ origin\ yourextractor +\f[] +.fi +.RE +.IP "10." 
4 +Finally, create a pull +request (https://help.github.com/articles/creating-a-pull-request). +We\[aq]ll then review and merge it. +.PP +In any case, thank you very much for your contributions! .SH BUGS .PP Bugs and suggestions should be reported at: @@ -537,7 +648,7 @@ For bug reports, this means that your report should contain the The error message you get for (most) bugs even says so, but you would not believe how many of our bug reports do not contain this information. .PP -Site support requests must contain an example URL. +Site support requests \f[B]must contain an example URL\f[]. An example URL is a URL you might want to download, like http://www.youtube.com/watch?v=BaW_jenozKc . There should be an obvious video present. diff --git a/youtube-dl.bash-completion b/youtube-dl.bash-completion index a5398bb..498e841 100644 --- a/youtube-dl.bash-completion +++ b/youtube-dl.bash-completion @@ -4,7 +4,7 @@ __youtube_dl() COMPREPLY=() cur="${COMP_WORDS[COMP_CWORD]}" prev="${COMP_WORDS[COMP_CWORD-1]}" - opts="--help --version --update --ignore-errors --abort-on-error --dump-user-agent --user-agent --referer --list-extractors --extractor-descriptions --proxy --no-check-certificate --cache-dir --no-cache-dir --socket-timeout --bidi-workaround --default-search --ignore-config --playlist-start --playlist-end --match-title --reject-title --max-downloads --min-filesize --max-filesize --date --datebefore --dateafter --min-views --max-views --no-playlist --age-limit --download-archive --include-ads --youtube-include-dash-manifest --rate-limit --retries --buffer-size --no-resize-buffer --test --title --id --literal --auto-number --output --autonumber-size --restrict-filenames --batch-file --load-info --no-overwrites --continue --no-continue --cookies --no-part --no-mtime --write-description --write-info-json --write-annotations --write-thumbnail --quiet --simulate --skip-download --get-url --get-title --get-id --get-thumbnail --get-description --get-duration --get-filename 
--get-format --dump-json --newline --no-progress --console-title --verbose --dump-intermediate-pages --write-pages --youtube-print-sig-code --print-traffic --format --all-formats --prefer-free-formats --max-quality --list-formats --write-sub --write-auto-sub --all-subs --list-subs --sub-format --sub-lang --username --password --netrc --video-password --extract-audio --audio-format --audio-quality --recode-video --keep-video --no-post-overwrites --embed-subs --add-metadata --xattrs --prefer-avconv --prefer-ffmpeg" + opts="--help --version --update --ignore-errors --abort-on-error --dump-user-agent --user-agent --referer --add-header --list-extractors --extractor-descriptions --proxy --no-check-certificate --prefer-insecure --cache-dir --no-cache-dir --socket-timeout --bidi-workaround --default-search --ignore-config --encoding --playlist-start --playlist-end --match-title --reject-title --max-downloads --min-filesize --max-filesize --date --datebefore --dateafter --min-views --max-views --no-playlist --age-limit --download-archive --include-ads --youtube-include-dash-manifest --rate-limit --retries --buffer-size --no-resize-buffer --test --title --id --literal --auto-number --output --autonumber-size --restrict-filenames --batch-file --load-info --no-overwrites --continue --no-continue --cookies --no-part --no-mtime --write-description --write-info-json --write-annotations --write-thumbnail --quiet --no-warnings --simulate --skip-download --get-url --get-title --get-id --get-thumbnail --get-description --get-duration --get-filename --get-format --dump-json --newline --no-progress --console-title --verbose --dump-intermediate-pages --write-pages --youtube-print-sig-code --print-traffic --format --all-formats --prefer-free-formats --max-quality --list-formats --write-sub --write-auto-sub --all-subs --list-subs --sub-format --sub-lang --username --password --netrc --video-password --extract-audio --audio-format --audio-quality --recode-video --keep-video 
--no-post-overwrites --embed-subs --embed-thumbnail --add-metadata --xattrs --prefer-avconv --prefer-ffmpeg" keywords=":ytfavorites :ytrecommended :ytsubscriptions :ytwatchlater :ythistory" fileopts="-a|--batch-file|--download-archive|--cookies|--load-info" diropts="--cache-dir" diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py deleted file mode 100755 index 672ef9e..0000000 --- a/youtube_dl/InfoExtractors.py +++ /dev/null @@ -1,4 +0,0 @@ -# Legacy file for backwards compatibility, use youtube_dl.extractor instead! - -from .extractor.common import InfoExtractor, SearchInfoExtractor -from .extractor import gen_extractors, get_info_extractor diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py old mode 100644 new mode 100755 index 42cbcf6..dc0ba98 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -4,9 +4,11 @@ from __future__ import absolute_import, unicode_literals import collections +import datetime import errno import io import json +import locale import os import platform import re @@ -29,6 +31,7 @@ from .utils import ( ContentTooShortError, date_from_str, DateRange, + DEFAULT_OUTTMPL, determine_ext, DownloadError, encodeFilename, @@ -93,6 +96,7 @@ class YoutubeDL(object): usenetrc: Use netrc for authentication instead. verbose: Print additional info to stdout. quiet: Do not print messages to stdout. + no_warnings: Do not print out anything for warnings. forceurl: Force printing final URL. forcetitle: Force printing title. forceid: Force printing ID. @@ -147,6 +151,8 @@ class YoutubeDL(object): again. cookiefile: File name where cookies should be read from and dumped to. nocheckcertificate:Do not verify SSL certificates + prefer_insecure: Use HTTP instead of HTTPS to retrieve information. + At the moment, this is only supported by YouTube. 
proxy: URL of the proxy server to use socket_timeout: Time to wait for unresponsive hosts, in seconds bidi_workaround: Work around buggy terminals without bidirectional text @@ -155,6 +161,7 @@ class YoutubeDL(object): include_ads: Download ads as well default_search: Prepend this string if an input url is not valid. 'auto' for elaborate guessing + encoding: Use this encoding instead of the system-specified. The following parameters are not used by YoutubeDL itself, they are used by the FileDownloader: @@ -280,6 +287,9 @@ class YoutubeDL(object): """Print message to stdout if not in quiet mode.""" return self.to_stdout(message, skip_eol, check_quiet=True) + def _write_string(self, s, out=None): + write_string(s, out=out, encoding=self.params.get('encoding')) + def to_stdout(self, message, skip_eol=False, check_quiet=False): """Print message to stdout if not in quiet mode.""" if self.params.get('logger'): @@ -289,7 +299,7 @@ class YoutubeDL(object): terminator = ['\n', ''][skip_eol] output = message + terminator - write_string(output, self._screen_file) + self._write_string(output, self._screen_file) def to_stderr(self, message): """Print message to stderr.""" @@ -299,7 +309,7 @@ class YoutubeDL(object): else: message = self._bidi_workaround(message) output = message + '\n' - write_string(output, self._err_file) + self._write_string(output, self._err_file) def to_console_title(self, message): if not self.params.get('consoletitle', False): @@ -309,21 +319,21 @@ class YoutubeDL(object): # already of type unicode() ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message)) elif 'TERM' in os.environ: - write_string('\033]0;%s\007' % message, self._screen_file) + self._write_string('\033]0;%s\007' % message, self._screen_file) def save_console_title(self): if not self.params.get('consoletitle', False): return if 'TERM' in os.environ: # Save the title on stack - write_string('\033[22;0t', self._screen_file) + self._write_string('\033[22;0t', self._screen_file) def 
restore_console_title(self): if not self.params.get('consoletitle', False): return if 'TERM' in os.environ: # Restore the title from stack - write_string('\033[23;0t', self._screen_file) + self._write_string('\033[23;0t', self._screen_file) def __enter__(self): self.save_console_title() @@ -370,12 +380,17 @@ class YoutubeDL(object): Print the message to stderr, it will be prefixed with 'WARNING:' If stderr is a tty file the 'WARNING:' will be colored ''' - if self._err_file.isatty() and os.name != 'nt': - _msg_header = '\033[0;33mWARNING:\033[0m' + if self.params.get('logger') is not None: + self.params['logger'].warning(message) else: - _msg_header = 'WARNING:' - warning_message = '%s %s' % (_msg_header, message) - self.to_stderr(warning_message) + if self.params.get('no_warnings'): + return + if self._err_file.isatty() and os.name != 'nt': + _msg_header = '\033[0;33mWARNING:\033[0m' + else: + _msg_header = 'WARNING:' + warning_message = '%s %s' % (_msg_header, message) + self.to_stderr(warning_message) def report_error(self, message, tb=None): ''' @@ -409,6 +424,13 @@ class YoutubeDL(object): template_dict['autonumber'] = autonumber_templ % self._num_downloads if template_dict.get('playlist_index') is not None: template_dict['playlist_index'] = '%05d' % template_dict['playlist_index'] + if template_dict.get('resolution') is None: + if template_dict.get('width') and template_dict.get('height'): + template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height']) + elif template_dict.get('height'): + template_dict['resolution'] = '%sp' % template_dict['height'] + elif template_dict.get('width'): + template_dict['resolution'] = '?x%d' % template_dict['width'] sanitize = lambda k, v: sanitize_filename( compat_str(v), @@ -419,7 +441,8 @@ class YoutubeDL(object): if v is not None) template_dict = collections.defaultdict(lambda: 'NA', template_dict) - tmpl = os.path.expanduser(self.params['outtmpl']) + outtmpl = self.params.get('outtmpl', 
DEFAULT_OUTTMPL) + tmpl = os.path.expanduser(outtmpl) filename = tmpl % template_dict return filename except ValueError as err: @@ -499,13 +522,7 @@ class YoutubeDL(object): '_type': 'compat_list', 'entries': ie_result, } - self.add_extra_info(ie_result, - { - 'extractor': ie.IE_NAME, - 'webpage_url': url, - 'webpage_url_basename': url_basename(url), - 'extractor_key': ie.ie_key(), - }) + self.add_default_extra_info(ie_result, ie, url) if process: return self.process_ie_result(ie_result, download, extra_info) else: @@ -522,7 +539,15 @@ class YoutubeDL(object): else: raise else: - self.report_error('no suitable InfoExtractor: %s' % url) + self.report_error('no suitable InfoExtractor for URL %s' % url) + + def add_default_extra_info(self, ie_result, ie, url): + self.add_extra_info(ie_result, { + 'extractor': ie.IE_NAME, + 'webpage_url': url, + 'webpage_url_basename': url_basename(url), + 'extractor_key': ie.ie_key(), + }) def process_ie_result(self, ie_result, download=True, extra_info={}): """ @@ -656,6 +681,18 @@ class YoutubeDL(object): if f.get('vcodec') == 'none'] if audio_formats: return audio_formats[0] + elif format_spec == 'bestvideo': + video_formats = [ + f for f in available_formats + if f.get('acodec') == 'none'] + if video_formats: + return video_formats[-1] + elif format_spec == 'worstvideo': + video_formats = [ + f for f in available_formats + if f.get('acodec') == 'none'] + if video_formats: + return video_formats[0] else: extensions = ['mp4', 'flv', 'webm', '3gp'] if format_spec in extensions: @@ -670,11 +707,35 @@ class YoutubeDL(object): def process_video_result(self, info_dict, download=True): assert info_dict.get('_type', 'video') == 'video' + if 'id' not in info_dict: + raise ExtractorError('Missing "id" field in extractor result') + if 'title' not in info_dict: + raise ExtractorError('Missing "title" field in extractor result') + if 'playlist' not in info_dict: # It isn't part of a playlist info_dict['playlist'] = None 
info_dict['playlist_index'] = None + thumbnails = info_dict.get('thumbnails') + if thumbnails: + thumbnails.sort(key=lambda t: ( + t.get('width'), t.get('height'), t.get('url'))) + for t in thumbnails: + if 'width' in t and 'height' in t: + t['resolution'] = '%dx%d' % (t['width'], t['height']) + + if thumbnails and 'thumbnail' not in info_dict: + info_dict['thumbnail'] = thumbnails[-1]['url'] + + if 'display_id' not in info_dict and 'id' in info_dict: + info_dict['display_id'] = info_dict['id'] + + if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None: + upload_date = datetime.datetime.utcfromtimestamp( + info_dict['timestamp']) + info_dict['upload_date'] = upload_date.strftime('%Y%m%d') + # This extractors handle format selection themselves if info_dict['extractor'] in ['Youku']: if download: @@ -688,8 +749,14 @@ class YoutubeDL(object): else: formats = info_dict['formats'] + if not formats: + raise ExtractorError('No video formats found!') + # We check that all the formats have the format and format_id fields - for (i, format) in enumerate(formats): + for i, format in enumerate(formats): + if 'url' not in format: + raise ExtractorError('Missing "url" key in result (index %d)' % i) + if format.get('format_id') is None: format['format_id'] = compat_str(i) if format.get('format') is None: @@ -700,7 +767,7 @@ class YoutubeDL(object): ) # Automatically determine file extension if missing if 'ext' not in format: - format['ext'] = determine_ext(format['url']) + format['ext'] = determine_ext(format['url']).lower() format_limit = self.params.get('format_limit', None) if format_limit: @@ -825,7 +892,7 @@ class YoutubeDL(object): try: dn = os.path.dirname(encodeFilename(filename)) - if dn != '' and not os.path.exists(dn): + if dn and not os.path.exists(dn): os.makedirs(dn) except (OSError, IOError) as err: self.report_error('unable to create directory ' + compat_str(err)) @@ -882,7 +949,7 @@ class YoutubeDL(object): with 
io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile: subfile.write(sub) except (OSError, IOError): - self.report_error('Cannot write subtitles file ' + descfn) + self.report_error('Cannot write subtitles file ' + sub_filename) return if self.params.get('writeinfojson', False): @@ -908,7 +975,7 @@ class YoutubeDL(object): self.to_screen('[%s] %s: Downloading thumbnail ...' % (info_dict['extractor'], info_dict['id'])) try: - uf = compat_urllib_request.urlopen(info_dict['thumbnail']) + uf = self.urlopen(info_dict['thumbnail']) with open(thumb_filename, 'wb') as thumbf: shutil.copyfileobj(uf, thumbf) self.to_screen('[%s] %s: Writing thumbnail to: %s' % @@ -971,10 +1038,11 @@ class YoutubeDL(object): def download(self, url_list): """Download a given list of URLs.""" + outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) if (len(url_list) > 1 and - '%' not in self.params['outtmpl'] + '%' not in outtmpl and self.params.get('max_downloads') != 1): - raise SameFileError(self.params['outtmpl']) + raise SameFileError(outtmpl) for url in url_list: try: @@ -1085,57 +1153,57 @@ class YoutubeDL(object): res = default return res - def list_formats(self, info_dict): - def format_note(fdict): - res = '' - if fdict.get('ext') in ['f4f', 'f4m']: - res += '(unsupported) ' - if fdict.get('format_note') is not None: - res += fdict['format_note'] + ' ' - if fdict.get('tbr') is not None: - res += '%4dk ' % fdict['tbr'] - if fdict.get('container') is not None: - if res: - res += ', ' - res += '%s container' % fdict['container'] - if (fdict.get('vcodec') is not None and - fdict.get('vcodec') != 'none'): - if res: - res += ', ' - res += fdict['vcodec'] - if fdict.get('vbr') is not None: - res += '@' - elif fdict.get('vbr') is not None and fdict.get('abr') is not None: - res += 'video@' + def _format_note(self, fdict): + res = '' + if fdict.get('ext') in ['f4f', 'f4m']: + res += '(unsupported) ' + if fdict.get('format_note') is not None: + res += fdict['format_note'] + ' ' 
+ if fdict.get('tbr') is not None: + res += '%4dk ' % fdict['tbr'] + if fdict.get('container') is not None: + if res: + res += ', ' + res += '%s container' % fdict['container'] + if (fdict.get('vcodec') is not None and + fdict.get('vcodec') != 'none'): + if res: + res += ', ' + res += fdict['vcodec'] if fdict.get('vbr') is not None: - res += '%4dk' % fdict['vbr'] - if fdict.get('acodec') is not None: - if res: - res += ', ' - if fdict['acodec'] == 'none': - res += 'video only' - else: - res += '%-5s' % fdict['acodec'] - elif fdict.get('abr') is not None: - if res: - res += ', ' - res += 'audio' - if fdict.get('abr') is not None: - res += '@%3dk' % fdict['abr'] - if fdict.get('asr') is not None: - res += ' (%5dHz)' % fdict['asr'] - if fdict.get('filesize') is not None: - if res: - res += ', ' - res += format_bytes(fdict['filesize']) - return res + res += '@' + elif fdict.get('vbr') is not None and fdict.get('abr') is not None: + res += 'video@' + if fdict.get('vbr') is not None: + res += '%4dk' % fdict['vbr'] + if fdict.get('acodec') is not None: + if res: + res += ', ' + if fdict['acodec'] == 'none': + res += 'video only' + else: + res += '%-5s' % fdict['acodec'] + elif fdict.get('abr') is not None: + if res: + res += ', ' + res += 'audio' + if fdict.get('abr') is not None: + res += '@%3dk' % fdict['abr'] + if fdict.get('asr') is not None: + res += ' (%5dHz)' % fdict['asr'] + if fdict.get('filesize') is not None: + if res: + res += ', ' + res += format_bytes(fdict['filesize']) + return res + def list_formats(self, info_dict): def line(format, idlen=20): return (('%-' + compat_str(idlen + 1) + 's%-10s%-12s%s') % ( format['format_id'], format['ext'], self.format_resolution(format), - format_note(format), + self._format_note(format), )) formats = info_dict.get('formats', [info_dict]) @@ -1143,8 +1211,8 @@ class YoutubeDL(object): max(len(f['format_id']) for f in formats)) formats_s = [line(f, idlen) for f in formats] if len(formats) > 1: - formats_s[0] += (' ' if 
format_note(formats[0]) else '') + '(worst)' - formats_s[-1] += (' ' if format_note(formats[-1]) else '') + '(best)' + formats_s[0] += (' ' if self._format_note(formats[0]) else '') + '(worst)' + formats_s[-1] += (' ' if self._format_note(formats[-1]) else '') + '(best)' header_line = line({ 'format_id': 'format code', 'ext': 'extension', @@ -1154,12 +1222,22 @@ class YoutubeDL(object): def urlopen(self, req): """ Start an HTTP download """ - return self._opener.open(req) + return self._opener.open(req, timeout=self._socket_timeout) def print_debug_header(self): if not self.params.get('verbose'): return - write_string('[debug] youtube-dl version ' + __version__ + '\n') + + write_string( + '[debug] Encodings: locale %s, fs %s, out %s, pref %s\n' % ( + locale.getpreferredencoding(), + sys.getfilesystemencoding(), + sys.stdout.encoding, + self.get_encoding()), + encoding=None + ) + + self._write_string('[debug] youtube-dl version ' + __version__ + '\n') try: sp = subprocess.Popen( ['git', 'rev-parse', '--short', 'HEAD'], @@ -1168,24 +1246,24 @@ class YoutubeDL(object): out, err = sp.communicate() out = out.decode().strip() if re.match('[0-9a-f]+', out): - write_string('[debug] Git HEAD: ' + out + '\n') + self._write_string('[debug] Git HEAD: ' + out + '\n') except: try: sys.exc_clear() except: pass - write_string('[debug] Python version %s - %s' % + self._write_string('[debug] Python version %s - %s' % (platform.python_version(), platform_name()) + '\n') proxy_map = {} for handler in self._opener.handlers: if hasattr(handler, 'proxies'): proxy_map.update(handler.proxies) - write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n') + self._write_string('[debug] Proxy map: ' + compat_str(proxy_map) + '\n') def _setup_opener(self): timeout_val = self.params.get('socket_timeout') - timeout = 600 if timeout_val is None else float(timeout_val) + self._socket_timeout = 600 if timeout_val is None else float(timeout_val) opts_cookiefile = self.params.get('cookiefile') 
opts_proxy = self.params.get('proxy') @@ -1224,6 +1302,18 @@ class YoutubeDL(object): opener.addheaders = [] self._opener = opener - # TODO remove this global modification - compat_urllib_request.install_opener(opener) - socket.setdefaulttimeout(timeout) + def encode(self, s): + if isinstance(s, bytes): + return s # Already encoded + + try: + return s.encode(self.get_encoding()) + except UnicodeEncodeError as err: + err.reason = err.reason + '. Check your system encoding configuration or use the --encoding option.' + raise + + def get_encoding(self): + encoding = self.params.get('encoding') + if encoding is None: + encoding = preferredencoding() + return encoding diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index f843036..1e01432 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -46,12 +46,25 @@ __authors__ = ( 'Andreas Schmitz', 'Michael Kaiser', 'Niklas Laxström', + 'David Triendl', + 'Anthony Weems', + 'David Wagner', + 'Juan C. Olivares', + 'Mattias Harrysson', + 'phaer', + 'Sainyam Kapoor', + 'Nicolas Évrard', + 'Jason Normore', + 'Hoje Lee', + 'Adam Thalhammer', + 'Georg Jähnig', + 'Ralf Haring', ) __license__ = 'Public Domain' import codecs -import getpass +import io import locale import optparse import os @@ -62,14 +75,17 @@ import sys from .utils import ( + compat_getpass, compat_print, DateRange, + DEFAULT_OUTTMPL, decodeOption, get_term_width, DownloadError, get_cachedir, MaxDownloadsReached, preferredencoding, + read_batch_urls, SameFileError, setproctitle, std_headers, @@ -83,6 +99,8 @@ from .extractor import gen_extractors from .version import __version__ from .YoutubeDL import YoutubeDL from .postprocessor import ( + AtomicParsleyPP, + FFmpegAudioFixPP, FFmpegMetadataPP, FFmpegVideoConvertor, FFmpegExtractAudioPP, @@ -208,7 +226,7 @@ def parseOpts(overrideArguments=None): general.add_option('-U', '--update', action='store_true', dest='update_self', help='update this program to latest version. 
Make sure that you have sufficient permissions (run with sudo if needed)') general.add_option('-i', '--ignore-errors', - action='store_true', dest='ignoreerrors', help='continue on download errors, for example to to skip unavailable videos in a playlist', default=False) + action='store_true', dest='ignoreerrors', help='continue on download errors, for example to skip unavailable videos in a playlist', default=False) general.add_option('--abort-on-error', action='store_false', dest='ignoreerrors', help='Abort downloading of further videos (in the playlist or the command line) if an error occurs') @@ -220,6 +238,9 @@ def parseOpts(overrideArguments=None): general.add_option('--referer', dest='referer', help='specify a custom referer, use if the video access is restricted to one domain', metavar='REF', default=None) + general.add_option('--add-header', + dest='headers', help='specify a custom HTTP header and its value, separated by a colon \':\'. You can use this option multiple times', action="append", + metavar='FIELD:VALUE') general.add_option('--list-extractors', action='store_true', dest='list_extractors', help='List all supported extractors and the URLs they would handle', default=False) @@ -230,6 +251,9 @@ def parseOpts(overrideArguments=None): '--proxy', dest='proxy', default=None, metavar='URL', help='Use the specified HTTP/HTTPS proxy. Pass in an empty string (--proxy "") for direct connection') general.add_option('--no-check-certificate', action='store_true', dest='no_check_certificate', default=False, help='Suppress HTTPS certificate validation.') + general.add_option( + '--prefer-insecure', '--prefer-unsecure', action='store_true', dest='prefer_insecure', + help='Use an unencrypted connection to retrieve information about the video. 
(Currently supported only for YouTube)') general.add_option( '--cache-dir', dest='cachedir', default=get_cachedir(), metavar='DIR', help='Location in the filesystem where youtube-dl can store some downloaded information permanently. By default $XDG_CACHE_HOME/youtube-dl or ~/.cache/youtube-dl . At the moment, only YouTube player files (for videos with obfuscated signatures) are cached, but that may change.') @@ -242,14 +266,17 @@ def parseOpts(overrideArguments=None): general.add_option( '--bidi-workaround', dest='bidi_workaround', action='store_true', help=u'Work around terminals that lack bidirectional text support. Requires bidiv or fribidi executable in PATH') - general.add_option('--default-search', - dest='default_search', metavar='PREFIX', - help='Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dl "large apple". By default (with value "auto") youtube-dl guesses.') + general.add_option( + '--default-search', + dest='default_search', metavar='PREFIX', + help='Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for youtube-dl "large apple". By default (with value "auto") youtube-dl guesses.') general.add_option( '--ignore-config', action='store_true', help='Do not read configuration files. When given in the global configuration file /etc/youtube-dl.conf: do not read the user configuration in ~/.config/youtube-dl.conf (%APPDATA%/youtube-dl/config.txt on Windows)') - + general.add_option( + '--encoding', dest='encoding', metavar='ENCODING', + help='Force the specified encoding (experimental)') selection.add_option( '--playlist-start', @@ -309,7 +336,7 @@ def parseOpts(overrideArguments=None): video_format.add_option('-f', '--format', action='store', dest='format', metavar='FORMAT', default=None, - help='video format code, specify the order of preference using slashes: "-f 22/17/18". "-f mp4" and "-f flv" are also supported. 
You can also use the special names "best", "bestaudio", "worst", and "worstaudio". By default, youtube-dl will pick the best quality.') + help='video format code, specify the order of preference using slashes: "-f 22/17/18". "-f mp4" and "-f flv" are also supported. You can also use the special names "best", "bestvideo", "bestaudio", "worst", "worstvideo" and "worstaudio". By default, youtube-dl will pick the best quality.') video_format.add_option('--all-formats', action='store_const', dest='format', help='download all available video formats', const='all') video_format.add_option('--prefer-free-formats', @@ -352,6 +379,10 @@ def parseOpts(overrideArguments=None): verbosity.add_option('-q', '--quiet', action='store_true', dest='quiet', help='activates quiet mode', default=False) + verbosity.add_option( + '--no-warnings', + dest='no_warnings', action='store_true', default=False, + help='Ignore warnings') verbosity.add_option('-s', '--simulate', action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False) verbosity.add_option('--skip-download', @@ -379,7 +410,7 @@ def parseOpts(overrideArguments=None): help='simulate, quiet but print output format', default=False) verbosity.add_option('-j', '--dump-json', action='store_true', dest='dumpjson', - help='simulate, quiet but print JSON information', default=False) + help='simulate, quiet but print JSON information. See --output for a description of available keys.', default=False) verbosity.add_option('--newline', action='store_true', dest='progress_with_newline', help='output progress bar as new lines', default=False) verbosity.add_option('--no-progress', @@ -424,6 +455,8 @@ def parseOpts(overrideArguments=None): '%(extractor)s for the provider (youtube, metacafe, etc), ' '%(id)s for the video id, %(playlist)s for the playlist the video is in, ' '%(playlist_index)s for the position in the playlist and %% for a literal percent. 
' + '%(height)s and %(width)s for the width and height of the video format. ' + '%(resolution)s for a textual description of the resolution of the video format. ' 'Use - to output to stdout. Can also be used to download to a different directory, ' 'for example with -o \'/my/downloads/%(uploader)s/%(title)s-%(id)s.%(ext)s\' .')) filesystem.add_option('--autonumber-size', @@ -479,6 +512,8 @@ def parseOpts(overrideArguments=None): help='do not overwrite post-processed files; the post-processed files are overwritten by default') postproc.add_option('--embed-subs', action='store_true', dest='embedsubtitles', default=False, help='embed subtitles in the video (only for mp4 videos)') + postproc.add_option('--embed-thumbnail', action='store_true', dest='embedthumbnail', default=False, + help='embed thumbnail in the audio as cover art') postproc.add_option('--add-metadata', action='store_true', dest='addmetadata', default=False, help='write metadata to the video file') postproc.add_option('--xattrs', action='store_true', dest='xattrs', default=False, @@ -521,8 +556,6 @@ def parseOpts(overrideArguments=None): write_string(u'[debug] System config: ' + repr(_hide_login_info(systemConf)) + '\n') write_string(u'[debug] User config: ' + repr(_hide_login_info(userConf)) + '\n') write_string(u'[debug] Command-line args: ' + repr(_hide_login_info(commandLineConf)) + '\n') - write_string(u'[debug] Encodings: locale %r, fs %r, out %r, pref: %r\n' % - (locale.getpreferredencoding(), sys.getfilesystemencoding(), sys.stdout.encoding, preferredencoding())) return parser, opts, args @@ -545,27 +578,35 @@ def _real_main(argv=None): if opts.referer is not None: std_headers['Referer'] = opts.referer + # Custom HTTP headers + if opts.headers is not None: + for h in opts.headers: + if h.find(':', 1) < 0: + parser.error(u'wrong header formatting, it should be key:value, not "%s"'%h) + key, value = h.split(':', 2) + if opts.verbose: + write_string(u'[debug] Adding header from command line option 
%s:%s\n'%(key, value)) + std_headers[key] = value + # Dump user agent if opts.dump_user_agent: compat_print(std_headers['User-Agent']) sys.exit(0) # Batch file verification - batchurls = [] + batch_urls = [] if opts.batchfile is not None: try: if opts.batchfile == '-': batchfd = sys.stdin else: - batchfd = open(opts.batchfile, 'r') - batchurls = batchfd.readlines() - batchurls = [x.strip() for x in batchurls] - batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)] + batchfd = io.open(opts.batchfile, 'r', encoding='utf-8', errors='ignore') + batch_urls = read_batch_urls(batchfd) if opts.verbose: - write_string(u'[debug] Batch file urls: ' + repr(batchurls) + u'\n') + write_string(u'[debug] Batch file urls: ' + repr(batch_urls) + u'\n') except IOError: sys.exit(u'ERROR: batch file could not be read') - all_urls = batchurls + args + all_urls = batch_urls + args all_urls = [url.strip() for url in all_urls] _enc = preferredencoding() all_urls = [url.decode(_enc, 'ignore') if isinstance(url, bytes) else url for url in all_urls] @@ -604,7 +645,7 @@ def _real_main(argv=None): if opts.usetitle and opts.useid: parser.error(u'using title conflicts with using video ID') if opts.username is not None and opts.password is None: - opts.password = getpass.getpass(u'Type account password and press return:') + opts.password = compat_getpass(u'Type account password and press [Return]: ') if opts.ratelimit is not None: numeric_limit = FileDownloader.parse_bytes(opts.ratelimit) if numeric_limit is None: @@ -642,13 +683,13 @@ def _real_main(argv=None): if not opts.audioquality.isdigit(): parser.error(u'invalid audio quality specified') if opts.recodevideo is not None: - if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg']: + if opts.recodevideo not in ['mp4', 'flv', 'webm', 'ogg', 'mkv']: parser.error(u'invalid video recode format specified') if opts.date is not None: date = DateRange.day(opts.date) else: date = DateRange(opts.dateafter, opts.datebefore) - 
if opts.default_search not in ('auto', None) and ':' not in opts.default_search: + if opts.default_search not in ('auto', 'auto_warning', None) and ':' not in opts.default_search: parser.error(u'--default-search invalid; did you forget a colon (:) at the end?') # Do not download videos when there are audio-only formats @@ -671,7 +712,7 @@ def _real_main(argv=None): or (opts.usetitle and u'%(title)s-%(id)s.%(ext)s') or (opts.useid and u'%(id)s.%(ext)s') or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s') - or u'%(title)s-%(id)s.%(ext)s') + or DEFAULT_OUTTMPL) if not os.path.splitext(outtmpl)[1] and opts.extractaudio: parser.error(u'Cannot download a video and extract audio into the same' u' file! Use "{0}.%(ext)s" instead of "{0}" as the output' @@ -686,6 +727,7 @@ def _real_main(argv=None): 'password': opts.password, 'videopassword': opts.videopassword, 'quiet': (opts.quiet or any_printing), + 'no_warnings': opts.no_warnings, 'forceurl': opts.geturl, 'forcetitle': opts.gettitle, 'forceid': opts.getid, @@ -749,6 +791,7 @@ def _real_main(argv=None): 'download_archive': download_archive_fn, 'cookiefile': opts.cookiefile, 'nocheckcertificate': opts.no_check_certificate, + 'prefer_insecure': opts.prefer_insecure, 'proxy': opts.proxy, 'socket_timeout': opts.socket_timeout, 'bidi_workaround': opts.bidi_workaround, @@ -757,6 +800,7 @@ def _real_main(argv=None): 'include_ads': opts.include_ads, 'default_search': opts.default_search, 'youtube_include_dash_manifest': opts.youtube_include_dash_manifest, + 'encoding': opts.encoding, } with YoutubeDL(ydl_opts) as ydl: @@ -775,6 +819,10 @@ def _real_main(argv=None): ydl.add_post_processor(FFmpegEmbedSubtitlePP(subtitlesformat=opts.subtitlesformat)) if opts.xattrs: ydl.add_post_processor(XAttrMetadataPP()) + if opts.embedthumbnail: + if not opts.addmetadata: + ydl.add_post_processor(FFmpegAudioFixPP()) + ydl.add_post_processor(AtomicParsleyPP()) # Update version if opts.update_self: diff --git 
a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 5a068aa..917f345 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -4,9 +4,10 @@ import sys import time from ..utils import ( + compat_str, encodeFilename, - timeconvert, format_bytes, + timeconvert, ) @@ -173,7 +174,7 @@ class FileDownloader(object): return os.rename(encodeFilename(old_filename), encodeFilename(new_filename)) except (IOError, OSError) as err: - self.report_error(u'unable to rename file: %s' % str(err)) + self.report_error(u'unable to rename file: %s' % compat_str(err)) def try_utime(self, filename, last_modified_hdr): """Try to set the last-modified time of the given file.""" diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index 2a870a7..e6be6ae 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -12,7 +12,6 @@ from .http import HttpFD from ..utils import ( struct_pack, struct_unpack, - compat_urllib_request, compat_urlparse, format_bytes, encodeFilename, @@ -117,8 +116,8 @@ class FlvReader(io.BytesIO): self.read_unsigned_char() # flags self.read(3) - # BootstrapinfoVersion - bootstrap_info_version = self.read_unsigned_int() + + self.read_unsigned_int() # BootstrapinfoVersion # Profile,Live,Update,Reserved self.read(1) # time scale @@ -127,15 +126,15 @@ class FlvReader(io.BytesIO): self.read_unsigned_long_long() # SmpteTimeCodeOffset self.read_unsigned_long_long() - # MovieIdentifier - movie_identifier = self.read_string() + + self.read_string() # MovieIdentifier server_count = self.read_unsigned_char() # ServerEntryTable for i in range(server_count): self.read_string() quality_count = self.read_unsigned_char() # QualityEntryTable - for i in range(server_count): + for i in range(quality_count): self.read_string() # DrmData self.read_string() @@ -298,6 +297,7 @@ class F4mFD(FileDownloader): break frags_filenames.append(frag_filename) + dest_stream.close() 
self.report_finish(format_bytes(state['downloaded_bytes']), time.time() - start) self.try_rename(tmpfilename, filename) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index fa98346..9d407fe 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -13,8 +13,10 @@ class HlsFD(FileDownloader): self.report_destination(filename) tmpfilename = self.temp_name(filename) - args = ['-y', '-i', url, '-f', 'mp4', '-c', 'copy', - '-bsf:a', 'aac_adtstoasc', tmpfilename] + args = [ + '-y', '-i', url, '-f', 'mp4', '-c', 'copy', + '-bsf:a', 'aac_adtstoasc', + encodeFilename(tmpfilename, for_subprocess=True)] for program in ['avconv', 'ffmpeg']: try: diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index 748f9f3..f79e6a9 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -14,6 +14,8 @@ from ..utils import ( class HttpFD(FileDownloader): + _TEST_FILE_SIZE = 10241 + def real_download(self, filename, info_dict): url = info_dict['url'] tmpfilename = self.temp_name(filename) @@ -23,11 +25,15 @@ class HttpFD(FileDownloader): headers = {'Youtubedl-no-compression': 'True'} if 'user_agent' in info_dict: headers['Youtubedl-user-agent'] = info_dict['user_agent'] + if 'http_referer' in info_dict: + headers['Referer'] = info_dict['http_referer'] basic_request = compat_urllib_request.Request(url, None, headers) request = compat_urllib_request.Request(url, None, headers) - if self.params.get('test', False): - request.add_header('Range', 'bytes=0-10240') + is_test = self.params.get('test', False) + + if is_test: + request.add_header('Range', 'bytes=0-%s' % str(self._TEST_FILE_SIZE - 1)) # Establish possible resume length if os.path.isfile(encodeFilename(tmpfilename)): @@ -49,7 +55,7 @@ class HttpFD(FileDownloader): while count <= retries: # Establish connection try: - data = compat_urllib_request.urlopen(request) + data = self.ydl.urlopen(request) break except (compat_urllib_error.HTTPError, ) 
as err: if (err.code < 500 or err.code >= 600) and err.code != 416: @@ -59,7 +65,7 @@ class HttpFD(FileDownloader): # Unable to resume (requested range not satisfiable) try: # Open the connection again without the range header - data = compat_urllib_request.urlopen(basic_request) + data = self.ydl.urlopen(basic_request) content_length = data.info()['Content-Length'] except (compat_urllib_error.HTTPError, ) as err: if err.code < 500 or err.code >= 600: @@ -85,6 +91,7 @@ class HttpFD(FileDownloader): else: # The length does not match, we start the download over self.report_unable_to_resume() + resume_len = 0 open_mode = 'wb' break # Retry @@ -97,6 +104,15 @@ class HttpFD(FileDownloader): return False data_len = data.info().get('Content-length', None) + + # Range HTTP header may be ignored/unsupported by a webserver + # (e.g. extractor/scivee.py, extractor/bambuser.py). + # However, for a test we still would like to download just a piece of a file. + # To achieve this we limit data_len to _TEST_FILE_SIZE and manually control + # block size when downloading a file. 
+ if is_test and (data_len is None or int(data_len) > self._TEST_FILE_SIZE): + data_len = self._TEST_FILE_SIZE + if data_len is not None: data_len = int(data_len) + resume_len min_data_len = self.params.get("min_filesize", None) @@ -115,7 +131,7 @@ class HttpFD(FileDownloader): while True: # Download and write before = time.time() - data_block = data.read(block_size) + data_block = data.read(block_size if not is_test else min(block_size, data_len - byte_counter)) after = time.time() if len(data_block) == 0: break @@ -159,6 +175,9 @@ class HttpFD(FileDownloader): 'speed': speed, }) + if is_test and byte_counter == data_len: + break + # Apply rate limit self.slow_down(start, byte_counter - resume_len) diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py index e93c28d..cc6a841 100644 --- a/youtube_dl/downloader/rtmp.py +++ b/youtube_dl/downloader/rtmp.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import os import re import subprocess @@ -8,6 +10,7 @@ from .common import FileDownloader from ..utils import ( encodeFilename, format_bytes, + compat_str, ) @@ -22,7 +25,7 @@ class RtmpFD(FileDownloader): proc_stderr_closed = False while not proc_stderr_closed: # read line from stderr - line = u'' + line = '' while True: char = proc.stderr.read(1) if not char: @@ -46,7 +49,7 @@ class RtmpFD(FileDownloader): data_len = None if percent > 0: data_len = int(downloaded_data_len * 100 / percent) - data_len_str = u'~' + format_bytes(data_len) + data_len_str = '~' + format_bytes(data_len) self.report_progress(percent, data_len_str, speed, eta) cursor_in_new_line = False self._hook_progress({ @@ -76,12 +79,12 @@ class RtmpFD(FileDownloader): }) elif self.params.get('verbose', False): if not cursor_in_new_line: - self.to_screen(u'') + self.to_screen('') cursor_in_new_line = True - self.to_screen(u'[rtmpdump] '+line) + self.to_screen('[rtmpdump] '+line) proc.wait() if not cursor_in_new_line: - self.to_screen(u'') + self.to_screen('') return 
proc.returncode url = info_dict['url'] @@ -93,6 +96,7 @@ class RtmpFD(FileDownloader): flash_version = info_dict.get('flash_version', None) live = info_dict.get('rtmp_live', False) conn = info_dict.get('rtmp_conn', None) + protocol = info_dict.get('rtmp_protocol', None) self.report_destination(filename) tmpfilename = self.temp_name(filename) @@ -102,7 +106,7 @@ class RtmpFD(FileDownloader): try: subprocess.call(['rtmpdump', '-h'], stdout=(open(os.path.devnull, 'w')), stderr=subprocess.STDOUT) except (OSError, IOError): - self.report_error(u'RTMP download detected but "rtmpdump" could not be run') + self.report_error('RTMP download detected but "rtmpdump" could not be run') return False # Download using rtmpdump. rtmpdump returns exit code 2 when @@ -125,9 +129,14 @@ class RtmpFD(FileDownloader): basic_args += ['--flashVer', flash_version] if live: basic_args += ['--live'] - if conn: + if isinstance(conn, list): + for entry in conn: + basic_args += ['--conn', entry] + elif isinstance(conn, compat_str): basic_args += ['--conn', conn] - args = basic_args + [[], ['--resume', '--skip', '1']][self.params.get('continuedl', False)] + if protocol is not None: + basic_args += ['--protocol', protocol] + args = basic_args + [[], ['--resume', '--skip', '1']][not live and self.params.get('continuedl', False)] if sys.platform == 'win32' and sys.version_info < (3, 0): # Windows subprocess module does not actually support Unicode @@ -150,26 +159,35 @@ class RtmpFD(FileDownloader): shell_quote = lambda args: ' '.join(map(pipes.quote, str_args)) except ImportError: shell_quote = repr - self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(str_args)) + self.to_screen('[debug] rtmpdump command line: ' + shell_quote(str_args)) + + RD_SUCCESS = 0 + RD_FAILED = 1 + RD_INCOMPLETE = 2 + RD_NO_CONNECT = 3 retval = run_rtmpdump(args) - while (retval == 2 or retval == 1) and not test: + if retval == RD_NO_CONNECT: + self.report_error('[rtmpdump] Could not connect to RTMP server.') + 
return False + + while (retval == RD_INCOMPLETE or retval == RD_FAILED) and not test and not live: prevsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen(u'[rtmpdump] %s bytes' % prevsize) + self.to_screen('[rtmpdump] %s bytes' % prevsize) time.sleep(5.0) # This seems to be needed - retval = run_rtmpdump(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1]) + retval = run_rtmpdump(basic_args + ['-e'] + [[], ['-k', '1']][retval == RD_FAILED]) cursize = os.path.getsize(encodeFilename(tmpfilename)) - if prevsize == cursize and retval == 1: + if prevsize == cursize and retval == RD_FAILED: break # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those - if prevsize == cursize and retval == 2 and cursize > 1024: - self.to_screen(u'[rtmpdump] Could not download the whole video. This can happen for some advertisements.') - retval = 0 + if prevsize == cursize and retval == RD_INCOMPLETE and cursize > 1024: + self.to_screen('[rtmpdump] Could not download the whole video. 
This can happen for some advertisements.') + retval = RD_SUCCESS break - if retval == 0 or (test and retval == 2): + if retval == RD_SUCCESS or (test and retval == RD_INCOMPLETE): fsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen(u'[rtmpdump] %s bytes' % fsize) + self.to_screen('[rtmpdump] %s bytes' % fsize) self.try_rename(tmpfilename, filename) self._hook_progress({ 'downloaded_bytes': fsize, @@ -179,6 +197,6 @@ class RtmpFD(FileDownloader): }) return True else: - self.to_stderr(u"\n") - self.report_error(u'rtmpdump exited with code %d' % retval) + self.to_stderr('\n') + self.report_error('rtmpdump exited with code %d' % retval) return False diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 7253718..15a42ce 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -1,6 +1,8 @@ from .academicearth import AcademicEarthCourseIE from .addanime import AddAnimeIE +from .aftonbladet import AftonbladetIE from .anitube import AnitubeIE +from .aol import AolIE from .aparat import AparatIE from .appletrailers import AppleTrailersIE from .archiveorg import ArchiveOrgIE @@ -9,29 +11,39 @@ from .arte import ( ArteTvIE, ArteTVPlus7IE, ArteTVCreativeIE, + ArteTVConcertIE, ArteTVFutureIE, ArteTVDDCIE, + ArteTVEmbedIE, ) from .auengine import AUEngineIE from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE from .bbccouk import BBCCoUkIE +from .bilibili import BiliBiliIE from .blinkx import BlinkxIE from .bliptv import BlipTVIE, BlipTVUserIE from .bloomberg import BloombergIE +from .br import BRIE from .breakcom import BreakIE from .brightcove import BrightcoveIE +from .byutv import BYUtvIE from .c56 import C56IE +from .canal13cl import Canal13clIE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE from .cbs import CBSIE +from .cbsnews import CBSNewsIE +from .ceskatelevize import CeskaTelevizeIE from .channel9 import Channel9IE from 
.chilloutzone import ChilloutzoneIE from .cinemassacre import CinemassacreIE from .clipfish import ClipfishIE from .cliphunter import CliphunterIE from .clipsyndicate import ClipsyndicateIE +from .clubic import ClubicIE from .cmt import CMTIE +from .cnet import CNETIE from .cnn import ( CNNIE, CNNBlogsIE, @@ -49,31 +61,36 @@ from .dailymotion import ( DailymotionUserIE, ) from .daum import DaumIE -from .depositfiles import DepositFilesIE from .dotsub import DotsubIE from .dreisat import DreiSatIE from .defense import DefenseGouvFrIE from .discovery import DiscoveryIE +from .divxstage import DivxStageIE from .dropbox import DropboxIE from .ebaumsworld import EbaumsWorldIE from .ehow import EHowIE from .eighttracks import EightTracksIE from .eitb import EitbIE from .elpais import ElPaisIE +from .empflix import EmpflixIE +from .engadget import EngadgetIE from .escapist import EscapistIE from .everyonesmixtape import EveryonesMixtapeIE from .exfm import ExfmIE from .extremetube import ExtremeTubeIE from .facebook import FacebookIE from .faz import FazIE +from .fc2 import FC2IE from .firstpost import FirstpostIE from .firsttv import FirstTVIE +from .fivemin import FiveMinIE from .fktv import ( FKTVIE, FKTVPosteckeIE, ) from .flickr import FlickrIE from .fourtube import FourTubeIE +from .franceculture import FranceCultureIE from .franceinter import FranceInterIE from .francetv import ( PluzzIE, @@ -88,15 +105,18 @@ from .funnyordie import FunnyOrDieIE from .gamekings import GamekingsIE from .gamespot import GameSpotIE from .gametrailers import GametrailersIE +from .gdcvault import GDCVaultIE from .generic import GenericIE from .googleplus import GooglePlusIE from .googlesearch import GoogleSearchIE from .hark import HarkIE from .helsinki import HelsinkiIE +from .hentaistigma import HentaiStigmaIE from .hotnewhiphop import HotNewHipHopIE from .howcast import HowcastIE from .huffpost import HuffPostIE from .hypem import HypemIE +from .iconosquare import IconosquareIE from 
.ign import IGNIE, OneUPIE from .imdb import ( ImdbIE, @@ -104,7 +124,7 @@ from .imdb import ( ) from .ina import InaIE from .infoq import InfoQIE -from .instagram import InstagramIE +from .instagram import InstagramIE, InstagramUserIE from .internetvideoarchive import InternetVideoArchiveIE from .iprima import IPrimaIE from .ivi import ( @@ -122,6 +142,7 @@ from .khanacademy import KhanAcademyIE from .kickstarter import KickStarterIE from .keek import KeekIE from .kontrtube import KontrTubeIE +from .ku6 import Ku6IE from .la7 import LA7IE from .lifenews import LifeNewsIE from .liveleak import LiveLeakIE @@ -132,45 +153,67 @@ from .lynda import ( ) from .m6 import M6IE from .macgamestore import MacGameStoreIE +from .mailru import MailRuIE from .malemotion import MalemotionIE from .mdr import MDRIE from .metacafe import MetacafeIE from .metacritic import MetacriticIE -from .mit import TechTVMITIE, MITIE +from .mit import TechTVMITIE, MITIE, OCWMITIE from .mixcloud import MixcloudIE from .mpora import MporaIE from .mofosex import MofosexIE from .mooshare import MooshareIE +from .morningstar import MorningstarIE +from .motorsport import MotorsportIE +from .moviezine import MoviezineIE +from .movshare import MovShareIE from .mtv import ( MTVIE, MTVIggyIE, ) +from .musicplayon import MusicPlayOnIE from .muzu import MuzuTVIE from .myspace import MySpaceIE from .myspass import MySpassIE from .myvideo import MyVideoIE from .naver import NaverIE from .nba import NBAIE -from .nbc import NBCNewsIE +from .nbc import ( + NBCIE, + NBCNewsIE, +) from .ndr import NDRIE from .ndtv import NDTVIE from .newgrounds import NewgroundsIE +from .newstube import NewstubeIE from .nfb import NFBIE from .nhl import NHLIE, NHLVideocenterIE from .niconico import NiconicoIE from .ninegag import NineGagIE +from .noco import NocoIE from .normalboots import NormalbootsIE -from .novamov import NovamovIE +from .novamov import NovaMovIE from .nowness import NownessIE from .nowvideo import NowVideoIE 
+from .nrk import ( + NRKIE, + NRKTVIE, +) +from .ntv import NTVIE +from .nytimes import NYTimesIE +from .nuvid import NuvidIE +from .oe1 import OE1IE from .ooyala import OoyalaIE from .orf import ORFIE +from .parliamentliveuk import ParliamentLiveUKIE from .pbs import PBSIE from .photobucket import PhotobucketIE +from .playvid import PlayvidIE from .podomatic import PodomaticIE from .pornhd import PornHdIE from .pornhub import PornHubIE from .pornotube import PornotubeIE +from .prosiebensat1 import ProSiebenSat1IE from .pyvideo import PyvideoIE from .radiofrance import RadioFranceIE from .rbmaradio import RBMARadioIE @@ -179,17 +222,23 @@ from .ringtv import RingTVIE from .ro220 import Ro220IE from .rottentomatoes import RottenTomatoesIE from .roxwel import RoxwelIE +from .rtbf import RTBFIE from .rtlnow import RTLnowIE +from .rts import RTSIE +from .rtve import RTVEALaCartaIE from .rutube import ( RutubeIE, RutubeChannelIE, RutubeMovieIE, RutubePersonIE, ) +from .rutv import RUTVIE +from .savefrom import SaveFromIE +from .scivee import SciVeeIE from .servingsys import ServingSysIE from .sina import SinaIE -from .slashdot import SlashdotIE from .slideshare import SlideshareIE +from .slutload import SlutloadIE from .smotri import ( SmotriIE, SmotriCommunityIE, @@ -197,7 +246,12 @@ from .smotri import ( SmotriBroadcastIE, ) from .sohu import SohuIE -from .soundcloud import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE +from .soundcloud import ( + SoundcloudIE, + SoundcloudSetIE, + SoundcloudUserIE, + SoundcloudPlaylistIE +) from .southparkstudios import ( SouthParkStudiosIE, SouthparkDeIE, @@ -205,41 +259,62 @@ from .southparkstudios import ( from .space import SpaceIE from .spankwire import SpankwireIE from .spiegel import SpiegelIE +from .spiegeltv import SpiegeltvIE from .spike import SpikeIE from .stanfordoc import StanfordOpenClassroomIE -from .statigram import StatigramIE from .steam import SteamIE from .streamcloud import StreamcloudIE from .streamcz import 
StreamCZIE +from .swrmediathek import SWRMediathekIE from .syfy import SyfyIE from .sztvhu import SztvHuIE +from .tagesschau import TagesschauIE +from .teachertube import ( + TeacherTubeIE, + TeacherTubeClassroomIE, +) +from .teachingchannel import TeachingChannelIE from .teamcoco import TeamcocoIE from .techtalks import TechTalksIE from .ted import TEDIE +from .testurl import TestURLIE from .tf1 import TF1IE from .theplatform import ThePlatformIE from .thisav import ThisAVIE from .tinypic import TinyPicIE +from .tlc import TlcIE, TlcDeIE from .toutv import TouTvIE +from .toypics import ToypicsUserIE, ToypicsIE from .traileraddict import TrailerAddictIE from .trilulilu import TriluliluIE +from .trutube import TruTubeIE from .tube8 import Tube8IE from .tudou import TudouIE from .tumblr import TumblrIE from .tutv import TutvIE +from .tvigle import TvigleIE from .tvp import TvpIE +from .udemy import ( + UdemyIE, + UdemyCourseIE +) from .unistra import UnistraIE +from .urort import UrortIE from .ustream import UstreamIE, UstreamChannelIE from .vbox7 import Vbox7IE from .veehd import VeeHDIE from .veoh import VeohIE from .vesti import VestiIE from .vevo import VevoIE -from .vice import ViceIE +from .vh1 import VH1IE from .viddler import ViddlerIE +from .videobam import VideoBamIE from .videodetective import VideoDetectiveIE +from .videolecturesnet import VideoLecturesNetIE from .videofyme import VideofyMeIE from .videopremium import VideoPremiumIE +from .videott import VideoTtIE +from .videoweed import VideoWeedIE from .vimeo import ( VimeoIE, VimeoChannelIE, @@ -247,20 +322,32 @@ from .vimeo import ( VimeoAlbumIE, VimeoGroupsIE, VimeoReviewIE, + VimeoWatchLaterIE, +) +from .vine import ( + VineIE, + VineUserIE, ) -from .vine import VineIE from .viki import VikiIE from .vk import VKIE from .vube import VubeIE +from .vuclip import VuClipIE +from .washingtonpost import WashingtonPostIE from .wat import WatIE +from .wdr import ( + WDRIE, + WDRMobileIE, + WDRMausIE, +) from 
.weibo import WeiboIE from .wimp import WimpIE from .wistia import WistiaIE from .worldstarhiphop import WorldStarHipHopIE +from .xbef import XBefIE from .xhamster import XHamsterIE from .xnxx import XNXXIE from .xvideos import XVideosIE -from .xtube import XTubeIE +from .xtube import XTubeUserIE, XTubeIE from .yahoo import ( YahooIE, YahooNewsIE, @@ -271,19 +358,20 @@ from .youku import YoukuIE from .youporn import YouPornIE from .youtube import ( YoutubeIE, + YoutubeChannelIE, + YoutubeFavouritesIE, + YoutubeHistoryIE, YoutubePlaylistIE, - YoutubeSearchIE, + YoutubeRecommendedIE, YoutubeSearchDateIE, - YoutubeUserIE, - YoutubeChannelIE, + YoutubeSearchIE, + YoutubeSearchURLIE, YoutubeShowIE, YoutubeSubscriptionsIE, - YoutubeRecommendedIE, + YoutubeTopListIE, YoutubeTruncatedURLIE, + YoutubeUserIE, YoutubeWatchLaterIE, - YoutubeFavouritesIE, - YoutubeHistoryIE, - YoutubeTopListIE, ) from .zdf import ZDFIE diff --git a/youtube_dl/extractor/academicearth.py b/youtube_dl/extractor/academicearth.py index 72f81d0..59d3bbb 100644 --- a/youtube_dl/extractor/academicearth.py +++ b/youtube_dl/extractor/academicearth.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class AcademicEarthCourseIE(InfoExtractor): - _VALID_URL = r'^https?://(?:www\.)?academicearth\.org/(?:courses|playlists)/(?P[^?#/]+)' + _VALID_URL = r'^https?://(?:www\.)?academicearth\.org/playlists/(?P[^?#/]+)' IE_NAME = 'AcademicEarth:Course' def _real_extract(self, url): @@ -14,12 +14,12 @@ class AcademicEarthCourseIE(InfoExtractor): webpage = self._download_webpage(url, playlist_id) title = self._html_search_regex( - r'

(.*?)

', webpage, u'title') + r'

]*?>(.*?)

', webpage, u'title') description = self._html_search_regex( - r'

(.*?)

', + r'

]*?>(.*?)

', webpage, u'description', fatal=False) urls = re.findall( - r'

', + r'
  • \s*?', webpage) entries = [self.url_result(u) for u in urls] diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py index a3a1b99..fcf2960 100644 --- a/youtube_dl/extractor/addanime.py +++ b/youtube_dl/extractor/addanime.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -14,14 +16,14 @@ from ..utils import ( class AddAnimeIE(InfoExtractor): _VALID_URL = r'^http://(?:\w+\.)?add-anime\.net/watch_video\.php\?(?:.*?)v=(?P[\w_]+)(?:.*)' - IE_NAME = u'AddAnime' _TEST = { - u'url': u'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9', - u'file': u'24MR3YO5SAS9.mp4', - u'md5': u'72954ea10bc979ab5e2eb288b21425a0', - u'info_dict': { - u"description": u"One Piece 606", - u"title": u"One Piece 606" + 'url': 'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9', + 'md5': '72954ea10bc979ab5e2eb288b21425a0', + 'info_dict': { + 'id': '24MR3YO5SAS9', + 'ext': 'mp4', + 'description': 'One Piece 606', + 'title': 'One Piece 606', } } @@ -38,10 +40,10 @@ class AddAnimeIE(InfoExtractor): redir_webpage = ee.cause.read().decode('utf-8') action = self._search_regex( r'
    ', - redir_webpage, u'redirect vc value') + redir_webpage, 'redirect vc value') av = re.search( r'a\.value = ([0-9]+)[+]([0-9]+)[*]([0-9]+);', redir_webpage) @@ -52,19 +54,19 @@ class AddAnimeIE(InfoExtractor): parsed_url = compat_urllib_parse_urlparse(url) av_val = av_res + len(parsed_url.netloc) confirm_url = ( - parsed_url.scheme + u'://' + parsed_url.netloc + + parsed_url.scheme + '://' + parsed_url.netloc + action + '?' + compat_urllib_parse.urlencode({ 'jschl_vc': vc, 'jschl_answer': compat_str(av_val)})) self._download_webpage( confirm_url, video_id, - note=u'Confirming after redirect') + note='Confirming after redirect') webpage = self._download_webpage(url, video_id) formats = [] for format_id in ('normal', 'hq'): rex = r"var %s_video_file = '(.*?)';" % re.escape(format_id) - video_url = self._search_regex(rex, webpage, u'video file URLx', + video_url = self._search_regex(rex, webpage, 'video file URLx', fatal=False) if not video_url: continue @@ -72,14 +74,13 @@ class AddAnimeIE(InfoExtractor): 'format_id': format_id, 'url': video_url, }) - if not formats: - raise ExtractorError(u'Cannot find any video format!') + self._sort_formats(formats) video_title = self._og_search_title(webpage) video_description = self._og_search_description(webpage) return { '_type': 'video', - 'id': video_id, + 'id': video_id, 'formats': formats, 'title': video_title, 'description': video_description diff --git a/youtube_dl/extractor/aftonbladet.py b/youtube_dl/extractor/aftonbladet.py new file mode 100644 index 0000000..cfc7370 --- /dev/null +++ b/youtube_dl/extractor/aftonbladet.py @@ -0,0 +1,66 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class AftonbladetIE(InfoExtractor): + _VALID_URL = r'^http://tv\.aftonbladet\.se/webbtv.+?(?Particle[0-9]+)\.ab(?:$|[?#])' + _TEST = { + 'url': 'http://tv.aftonbladet.se/webbtv/nyheter/vetenskap/rymden/article36015.ab', + 'info_dict': { + 'id': 'article36015', + 
'ext': 'mp4', + 'title': 'Vulkanutbrott i rymden - nu släpper NASA bilderna', + 'description': 'Jupiters måne mest aktiv av alla himlakroppar', + 'timestamp': 1394142732, + 'upload_date': '20140306', + }, + } + + def _real_extract(self, url): + mobj = re.search(self._VALID_URL, url) + + video_id = mobj.group('video_id') + webpage = self._download_webpage(url, video_id) + + # find internal video meta data + meta_url = 'http://aftonbladet-play.drlib.aptoma.no/video/%s.json' + internal_meta_id = self._html_search_regex( + r'data-aptomaId="([\w\d]+)"', webpage, 'internal_meta_id') + internal_meta_url = meta_url % internal_meta_id + internal_meta_json = self._download_json( + internal_meta_url, video_id, 'Downloading video meta data') + + # find internal video formats + format_url = 'http://aftonbladet-play.videodata.drvideo.aptoma.no/actions/video/?id=%s' + internal_video_id = internal_meta_json['videoId'] + internal_formats_url = format_url % internal_video_id + internal_formats_json = self._download_json( + internal_formats_url, video_id, 'Downloading video formats') + + formats = [] + for fmt in internal_formats_json['formats']['http']['pseudostreaming']['mp4']: + p = fmt['paths'][0] + formats.append({ + 'url': 'http://%s:%d/%s/%s' % (p['address'], p['port'], p['path'], p['filename']), + 'ext': 'mp4', + 'width': fmt['width'], + 'height': fmt['height'], + 'tbr': fmt['bitrate'], + 'protocol': 'http', + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': internal_meta_json['title'], + 'formats': formats, + 'thumbnail': internal_meta_json['imageUrl'], + 'description': internal_meta_json['shortPreamble'], + 'timestamp': internal_meta_json['timePublished'], + 'duration': internal_meta_json['duration'], + 'view_count': internal_meta_json['views'], + } diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py new file mode 100644 index 0000000..a7bfe5a --- /dev/null +++ b/youtube_dl/extractor/aol.py @@ -0,0 +1,65 @@ +from __future__ 
import unicode_literals + +import re + +from .common import InfoExtractor +from .fivemin import FiveMinIE + + +class AolIE(InfoExtractor): + IE_NAME = 'on.aol.com' + _VALID_URL = r'''(?x) + (?: + aol-video:| + http://on\.aol\.com/ + (?: + video/.*-| + playlist/(?P[^/?#]+?)-(?P[0-9]+)[?#].*_videoid= + ) + ) + (?P[0-9]+) + (?:$|\?) + ''' + + _TEST = { + 'url': 'http://on.aol.com/video/u-s--official-warns-of-largest-ever-irs-phone-scam-518167793?icid=OnHomepageC2Wide_MustSee_Img', + 'md5': '18ef68f48740e86ae94b98da815eec42', + 'info_dict': { + 'id': '518167793', + 'ext': 'mp4', + 'title': 'U.S. Official Warns Of \'Largest Ever\' IRS Phone Scam', + }, + 'add_ie': ['FiveMin'], + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + playlist_id = mobj.group('playlist_id') + if playlist_id and not self._downloader.params.get('noplaylist'): + self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) + + webpage = self._download_webpage(url, playlist_id) + title = self._html_search_regex( + r'

    (.+?)

    ', webpage, 'title') + playlist_html = self._search_regex( + r"(?s)(.*?)", webpage, + 'playlist HTML') + entries = [{ + '_type': 'url', + 'url': 'aol-video:%s' % m.group('id'), + 'ie_key': 'Aol', + } for m in re.finditer( + r"[0-9]+)'\s+class='video-thumb'>", + playlist_html)] + + return { + '_type': 'playlist', + 'id': playlist_id, + 'display_id': mobj.group('playlist_display_id'), + 'title': title, + 'entries': entries, + } + + return FiveMinIE._build_result(video_id) diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index 922cede..dc8657b 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -6,7 +6,6 @@ import json from .common import InfoExtractor from ..utils import ( compat_urlparse, - determine_ext, ) @@ -16,9 +15,10 @@ class AppleTrailersIE(InfoExtractor): "url": "http://trailers.apple.com/trailers/wb/manofsteel/", "playlist": [ { - "file": "manofsteel-trailer4.mov", "md5": "d97a8e575432dbcb81b7c3acb741f8a8", "info_dict": { + "id": "manofsteel-trailer4", + "ext": "mov", "duration": 111, "title": "Trailer 4", "upload_date": "20130523", @@ -26,9 +26,10 @@ class AppleTrailersIE(InfoExtractor): }, }, { - "file": "manofsteel-trailer3.mov", "md5": "b8017b7131b721fb4e8d6f49e1df908c", "info_dict": { + "id": "manofsteel-trailer3", + "ext": "mov", "duration": 182, "title": "Trailer 3", "upload_date": "20130417", @@ -36,9 +37,10 @@ class AppleTrailersIE(InfoExtractor): }, }, { - "file": "manofsteel-trailer.mov", "md5": "d0f1e1150989b9924679b441f3404d48", "info_dict": { + "id": "manofsteel-trailer", + "ext": "mov", "duration": 148, "title": "Trailer", "upload_date": "20121212", @@ -46,15 +48,16 @@ class AppleTrailersIE(InfoExtractor): }, }, { - "file": "manofsteel-teaser.mov", "md5": "5fe08795b943eb2e757fa95cb6def1cb", "info_dict": { + "id": "manofsteel-teaser", + "ext": "mov", "duration": 93, "title": "Teaser", "upload_date": "20120721", "uploader_id": "wb", }, - } + }, ] } @@ 
-65,16 +68,16 @@ class AppleTrailersIE(InfoExtractor): movie = mobj.group('movie') uploader_id = mobj.group('company') - playlist_url = compat_urlparse.urljoin(url, u'includes/playlists/itunes.inc') + playlist_url = compat_urlparse.urljoin(url, 'includes/playlists/itunes.inc') def fix_html(s): - s = re.sub(r'(?s).*?', u'', s) + s = re.sub(r'(?s).*?', '', s) s = re.sub(r'', r'', s) # The ' in the onClick attributes are not escaped, it couldn't be parsed # like: http://trailers.apple.com/trailers/wb/gravity/ def _clean_json(m): - return u'iTunes.playURL(%s);' % m.group(1).replace('\'', ''') + return 'iTunes.playURL(%s);' % m.group(1).replace('\'', ''') s = re.sub(self._JSON_RE, _clean_json, s) - s = u'' + s + u'' + s = '' + s + u'' return s doc = self._download_xml(playlist_url, movie, transform_source=fix_html) @@ -82,7 +85,7 @@ class AppleTrailersIE(InfoExtractor): for li in doc.findall('./div/ul/li'): on_click = li.find('.//a').attrib['onClick'] trailer_info_json = self._search_regex(self._JSON_RE, - on_click, u'trailer info') + on_click, 'trailer info') trailer_info = json.loads(trailer_info_json) title = trailer_info['title'] video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower() @@ -98,8 +101,7 @@ class AppleTrailersIE(InfoExtractor): first_url = trailer_info['url'] trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower() settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id) - settings_json = self._download_webpage(settings_json_url, trailer_id, u'Downloading settings json') - settings = json.loads(settings_json) + settings = self._download_json(settings_json_url, trailer_id, 'Downloading settings json') formats = [] for format in settings['metadata']['sizes']: @@ -107,7 +109,6 @@ class AppleTrailersIE(InfoExtractor): format_url = re.sub(r'_(\d*p.mov)', r'_h\1', format['src']) formats.append({ 'url': format_url, - 'ext': determine_ext(format_url), 'format': format['type'], 'width': format['width'], 
'height': int(format['height']), diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index b88f71b..c6d22c0 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -38,15 +38,19 @@ class ARDIE(InfoExtractor): webpage = self._download_webpage(url, video_id) title = self._html_search_regex( - r'(.*?)
  • ', webpage, 'title') + [r'(.*?)', + r'', + r'

    (.*?)

    '], + webpage, 'title') description = self._html_search_meta( 'dcterms.abstract', webpage, 'description') thumbnail = self._og_search_thumbnail(webpage) - streams = [ - mo.groupdict() - for mo in re.finditer( - r'mediaCollection\.addMediaStream\((?P\d+), (?P\d+), "(?P[^"]*)", "(?P[^"]*)", "[^"]*"\)', webpage)] + + media_info = self._download_json( + 'http://www.ardmediathek.de/play/media/%s' % video_id, video_id) + # The second element of the _mediaArray contains the standard http urls + streams = media_info['_mediaArray'][1]['_mediaStreamArray'] if not streams: if '"fsk"' in webpage: raise ExtractorError('This video is only available after 20:00') @@ -54,21 +58,12 @@ class ARDIE(InfoExtractor): formats = [] for s in streams: format = { - 'quality': int(s['quality']), + 'quality': s['_quality'], + 'url': s['_stream'], } - if s.get('rtmp_url'): - format['protocol'] = 'rtmp' - format['url'] = s['rtmp_url'] - format['playpath'] = s['video_url'] - else: - format['url'] = s['video_url'] - - quality_name = self._search_regex( - r'[,.]([a-zA-Z0-9_-]+),?\.mp4', format['url'], - 'quality name', default='NA') - format['format_id'] = '%s-%s-%s-%s' % ( - determine_ext(format['url']), quality_name, s['media_type'], - s['quality']) + + format['format_id'] = '%s-%s' % ( + determine_ext(format['url']), format['quality']) formats.append(format) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 7cf3785..b528a9e 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals import re -import json from .common import InfoExtractor from ..utils import ( @@ -19,115 +18,46 @@ from ..utils import ( # is different for each one. The videos usually expire in 7 days, so we can't # add tests. 
-class ArteTvIE(InfoExtractor): - _VIDEOS_URL = r'(?:http://)?videos\.arte\.tv/(?Pfr|de)/.*-(?P.*?)\.html' - _LIVEWEB_URL = r'(?:http://)?liveweb\.arte\.tv/(?Pfr|de)/(?P.+?)/(?P.+)' - _LIVE_URL = r'index-[0-9]+\.html$' +class ArteTvIE(InfoExtractor): + _VALID_URL = r'http://videos\.arte\.tv/(?Pfr|de)/.*-(?P.*?)\.html' IE_NAME = 'arte.tv' - @classmethod - def suitable(cls, url): - return any(re.match(regex, url) for regex in (cls._VIDEOS_URL, cls._LIVEWEB_URL)) - - # TODO implement Live Stream - # from ..utils import compat_urllib_parse - # def extractLiveStream(self, url): - # video_lang = url.split('/')[-4] - # info = self.grep_webpage( - # url, - # r'src="(.*?/videothek_js.*?\.js)', - # 0, - # [ - # (1, 'url', 'Invalid URL: %s' % url) - # ] - # ) - # http_host = url.split('/')[2] - # next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url'))) - # info = self.grep_webpage( - # next_url, - # r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' + - # '(http://.*?\.swf).*?' 
+ - # '(rtmp://.*?)\'', - # re.DOTALL, - # [ - # (1, 'path', 'could not extract video path: %s' % url), - # (2, 'player', 'could not extract video player: %s' % url), - # (3, 'url', 'could not extract video url: %s' % url) - # ] - # ) - # video_url = '%s/%s' % (info.get('url'), info.get('path')) - def _real_extract(self, url): - mobj = re.match(self._VIDEOS_URL, url) - if mobj is not None: - id = mobj.group('id') - lang = mobj.group('lang') - return self._extract_video(url, id, lang) - - mobj = re.match(self._LIVEWEB_URL, url) - if mobj is not None: - name = mobj.group('name') - lang = mobj.group('lang') - return self._extract_liveweb(url, name, lang) - - if re.search(self._LIVE_URL, url) is not None: - raise ExtractorError(u'Arte live streams are not yet supported, sorry') - # self.extractLiveStream(url) - # return - - def _extract_video(self, url, video_id, lang): - """Extract from videos.arte.tv""" + mobj = re.match(self._VALID_URL, url) + lang = mobj.group('lang') + video_id = mobj.group('id') + ref_xml_url = url.replace('/videos/', '/do_delegate/videos/') ref_xml_url = ref_xml_url.replace('.html', ',view,asPlayerXml.xml') - ref_xml_doc = self._download_xml(ref_xml_url, video_id, note=u'Downloading metadata') + ref_xml_doc = self._download_xml( + ref_xml_url, video_id, note='Downloading metadata') config_node = find_xpath_attr(ref_xml_doc, './/video', 'lang', lang) config_xml_url = config_node.attrib['ref'] - config_xml = self._download_webpage(config_xml_url, video_id, note=u'Downloading configuration') - - video_urls = list(re.finditer(r'(?P.*?)', config_xml)) - def _key(m): - quality = m.group('quality') - if quality == 'hd': - return 2 - else: - return 1 - # We pick the best quality - video_urls = sorted(video_urls, key=_key) - video_url = list(video_urls)[-1].group('url') - - title = self._html_search_regex(r'(.*?)', config_xml, 'title') - thumbnail = self._html_search_regex(r'(.*?)', - config_xml, 'thumbnail') - return {'id': video_id, - 'title': title, - 
'thumbnail': thumbnail, - 'url': video_url, - 'ext': 'flv', - } - - def _extract_liveweb(self, url, name, lang): - """Extract form http://liveweb.arte.tv/""" - webpage = self._download_webpage(url, name) - video_id = self._search_regex(r'eventId=(\d+?)("|&)', webpage, 'event id') - config_doc = self._download_xml('http://download.liveweb.arte.tv/o21/liveweb/events/event-%s.xml' % video_id, - video_id, 'Downloading information') - event_doc = config_doc.find('event') - url_node = event_doc.find('video').find('urlHd') - if url_node is None: - url_node = event_doc.find('urlSd') - - return {'id': video_id, - 'title': event_doc.find('name%s' % lang.capitalize()).text, - 'url': url_node.text.replace('MP4', 'mp4'), - 'ext': 'flv', - 'thumbnail': self._og_search_thumbnail(webpage), - } + config = self._download_xml( + config_xml_url, video_id, note='Downloading configuration') + + formats = [{ + 'forma_id': q.attrib['quality'], + 'url': q.text, + 'ext': 'flv', + 'quality': 2 if q.attrib['quality'] == 'hd' else 1, + } for q in config.findall('./urls/url')] + self._sort_formats(formats) + + title = config.find('.//name').text + thumbnail = config.find('.//firstThumbnailUrl').text + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'formats': formats, + } class ArteTVPlus7IE(InfoExtractor): IE_NAME = 'arte.tv:+7' - _VALID_URL = r'https?://www\.arte.tv/guide/(?Pfr|de)/(?:(?:sendungen|emissions)/)?(?P.*?)/(?P.*?)(\?.*)?' + _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?Pfr|de)/(?:(?:sendungen|emissions)/)?(?P.*?)/(?P.*?)(\?.*)?' 
@classmethod def _extract_url_info(cls, url): @@ -144,13 +74,12 @@ class ArteTVPlus7IE(InfoExtractor): return self._extract_from_webpage(webpage, video_id, lang) def _extract_from_webpage(self, webpage, video_id, lang): - json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url') + json_url = self._html_search_regex( + r'arte_vp_url="(.*?)"', webpage, 'json vp url') return self._extract_from_json_url(json_url, video_id, lang) def _extract_from_json_url(self, json_url, video_id, lang): - json_info = self._download_webpage(json_url, video_id, 'Downloading info json') - self.report_extraction(video_id) - info = json.loads(json_info) + info = self._download_json(json_url, video_id) player_info = info['videoJsonPlayer'] info_dict = { @@ -172,6 +101,8 @@ class ArteTVPlus7IE(InfoExtractor): l = 'F' elif lang == 'de': l = 'A' + else: + l = lang regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l] return any(re.match(r, f['versionCode']) for r in regexes) # Some formats may not be in the same language as the url @@ -190,14 +121,19 @@ class ArteTVPlus7IE(InfoExtractor): return ['HQ', 'MQ', 'EQ', 'SQ'].index(f['quality']) else: def sort_key(f): + versionCode = f.get('versionCode') + if versionCode is None: + versionCode = '' return ( # Sort first by quality - int(f.get('height',-1)), - int(f.get('bitrate',-1)), + int(f.get('height', -1)), + int(f.get('bitrate', -1)), # The original version with subtitles has lower relevance - re.match(r'VO-ST(F|A)', f.get('versionCode', '')) is None, + re.match(r'VO-ST(F|A)', versionCode) is None, # The version with sourds/mal subtitles has also lower relevance - re.match(r'VO?(F|A)-STM\1', f.get('versionCode', '')) is None, + re.match(r'VO?(F|A)-STM\1', versionCode) is None, + # Prefer http downloads over m3u8 + 0 if f['url'].endswith('m3u8') else 1, ) formats = sorted(formats, key=sort_key) def _format(format_info): @@ -238,8 +174,9 @@ class ArteTVCreativeIE(ArteTVPlus7IE): _TEST = { 'url': 
'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design', - 'file': '050489-002.mp4', 'info_dict': { + 'id': '050489-002', + 'ext': 'mp4', 'title': 'Agentur Amateur / Agence Amateur #2 : Corporate Design', }, } @@ -251,8 +188,9 @@ class ArteTVFutureIE(ArteTVPlus7IE): _TEST = { 'url': 'http://future.arte.tv/fr/sujet/info-sciences#article-anchor-7081', - 'file': '050940-003.mp4', 'info_dict': { + 'id': '050940-003', + 'ext': 'mp4', 'title': 'Les champignons au secours de la planète', }, } @@ -266,7 +204,7 @@ class ArteTVFutureIE(ArteTVPlus7IE): class ArteTVDDCIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:ddc' - _VALID_URL = r'http?://ddc\.arte\.tv/(?Pemission|folge)/(?P.+)' + _VALID_URL = r'https?://ddc\.arte\.tv/(?Pemission|folge)/(?P.+)' def _real_extract(self, url): video_id, lang = self._extract_url_info(url) @@ -280,3 +218,39 @@ class ArteTVDDCIE(ArteTVPlus7IE): javascriptPlayerGenerator = self._download_webpage(script_url, video_id, 'Download javascript player generator') json_url = self._search_regex(r"json_url=(.*)&rendering_place.*", javascriptPlayerGenerator, 'json url') return self._extract_from_json_url(json_url, video_id, lang) + + +class ArteTVConcertIE(ArteTVPlus7IE): + IE_NAME = 'arte.tv:concert' + _VALID_URL = r'https?://concert\.arte\.tv/(?Pde|fr)/(?P.+)' + + _TEST = { + 'url': 'http://concert.arte.tv/de/notwist-im-pariser-konzertclub-divan-du-monde', + 'md5': '9ea035b7bd69696b67aa2ccaaa218161', + 'info_dict': { + 'id': '186', + 'ext': 'mp4', + 'title': 'The Notwist im Pariser Konzertclub "Divan du Monde"', + 'upload_date': '20140128', + 'description': 'md5:486eb08f991552ade77439fe6d82c305', + }, + } + + +class ArteTVEmbedIE(ArteTVPlus7IE): + IE_NAME = 'arte.tv:embed' + _VALID_URL = r'''(?x) + http://www\.arte\.tv + /playerv2/embed\.php\?json_url= + (?P + http://arte\.tv/papi/tvguide/videos/stream/player/ + (?P[^/]+)/(?P[^/]+)[^&]* + ) + ''' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') 
+ lang = mobj.group('lang') + json_url = mobj.group('json_url') + return self._extract_from_json_url(json_url, video_id, lang) diff --git a/youtube_dl/extractor/auengine.py b/youtube_dl/extractor/auengine.py index c6f30e6..20bf125 100644 --- a/youtube_dl/extractor/auengine.py +++ b/youtube_dl/extractor/auengine.py @@ -11,22 +11,24 @@ from ..utils import ( class AUEngineIE(InfoExtractor): + _VALID_URL = r'http://(?:www\.)?auengine\.com/embed\.php\?.*?file=(?P[^&]+).*?' + _TEST = { 'url': 'http://auengine.com/embed.php?file=lfvlytY6&w=650&h=370', - 'file': 'lfvlytY6.mp4', 'md5': '48972bdbcf1a3a2f5533e62425b41d4f', 'info_dict': { + 'id': 'lfvlytY6', + 'ext': 'mp4', 'title': '[Commie]The Legend of the Legendary Heroes - 03 - Replication Eye (Alpha Stigma)[F9410F5A]' } } - _VALID_URL = r'(?:http://)?(?:www\.)?auengine\.com/embed\.php\?.*?file=([^&]+).*?' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group(1) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'(?P<title>.+?)', - webpage, 'title') + title = self._html_search_regex(r'(?P<title>.+?)', webpage, 'title') title = title.strip() links = re.findall(r'\s(?:file|url):\s*["\']([^\'"]+)["\']', webpage) links = map(compat_urllib_parse.unquote, links) @@ -39,14 +41,15 @@ class AUEngineIE(InfoExtractor): elif '/videos/' in link: video_url = link if not video_url: - raise ExtractorError(u'Could not find video URL') + raise ExtractorError('Could not find video URL') ext = '.' 
+ determine_ext(video_url) if ext == title[-len(ext):]: title = title[:-len(ext)] return { - 'id': video_id, - 'url': video_url, - 'title': title, + 'id': video_id, + 'url': video_url, + 'title': title, 'thumbnail': thumbnail, + 'http_referer': 'http://www.auengine.com/flowplayer/flowplayer.commercial-3.2.14.swf', } diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 886b0df..dcbbdef 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -12,14 +12,14 @@ from ..utils import ( class BandcampIE(InfoExtractor): - _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P.*)' + _VALID_URL = r'https?://.*?\.bandcamp\.com/track/(?P<title>.*)' _TESTS = [{ 'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song', 'file': '1812978515.mp3', 'md5': 'c557841d5e50261777a6585648adf439', 'info_dict': { "title": "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", - "duration": 10, + "duration": 9.8485, }, '_skip': 'There is a limit of 200 free downloads / month for the test song' }] @@ -28,36 +28,32 @@ class BandcampIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) title = mobj.group('title') webpage = self._download_webpage(url, title) - # We get the link to the free download page m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage) - if m_download is None: + if not m_download: m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage) if m_trackinfo: json_code = m_trackinfo.group(1) - data = json.loads(json_code) - d = data[0] + data = json.loads(json_code)[0] - duration = int(round(d['duration'])) formats = [] - for format_id, format_url in d['file'].items(): - ext, _, abr_str = format_id.partition('-') - + for format_id, format_url in data['file'].items(): + ext, abr_str = format_id.split('-', 1) formats.append({ 'format_id': format_id, 'url': format_url, - 'ext': format_id.partition('-')[0], + 'ext': ext, 'vcodec': 'none', - 'acodec': format_id.partition('-')[0], - 
'abr': int(format_id.partition('-')[2]), + 'acodec': ext, + 'abr': int(abr_str), }) self._sort_formats(formats) return { - 'id': compat_str(d['id']), - 'title': d['title'], + 'id': compat_str(data['id']), + 'title': data['title'], 'formats': formats, - 'duration': duration, + 'duration': float(data['duration']), } else: raise ExtractorError('No free songs found') @@ -67,11 +63,9 @@ class BandcampIE(InfoExtractor): r'var TralbumData = {(.*?)id: (?P<id>\d*?)$', webpage, re.MULTILINE | re.DOTALL).group('id') - download_webpage = self._download_webpage(download_link, video_id, - 'Downloading free downloads page') - # We get the dictionary of the track from some javascrip code - info = re.search(r'items: (.*?),$', - download_webpage, re.MULTILINE).group(1) + download_webpage = self._download_webpage(download_link, video_id, 'Downloading free downloads page') + # We get the dictionary of the track from some javascript code + info = re.search(r'items: (.*?),$', download_webpage, re.MULTILINE).group(1) info = json.loads(info)[0] # We pick mp3-320 for now, until format selection can be easily implemented. mp3_info = info['downloads']['mp3-320'] @@ -100,7 +94,7 @@ class BandcampIE(InfoExtractor): class BandcampAlbumIE(InfoExtractor): IE_NAME = 'Bandcamp:album' - _VALID_URL = r'http://.*?\.bandcamp\.com/album/(?P<title>.*)' + _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<title>[^?#]+))' _TEST = { 'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1', @@ -123,13 +117,15 @@ class BandcampAlbumIE(InfoExtractor): 'params': { 'playlistend': 2 }, - 'skip': 'Bancamp imposes download limits. See test_playlists:test_bandcamp_album for the playlist test' + 'skip': 'Bandcamp imposes download limits. 
See test_playlists:test_bandcamp_album for the playlist test' } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) + playlist_id = mobj.group('subdomain') title = mobj.group('title') - webpage = self._download_webpage(url, title) + display_id = title or playlist_id + webpage = self._download_webpage(url, display_id) tracks_paths = re.findall(r'<a href="(.*?)" itemprop="url">', webpage) if not tracks_paths: raise ExtractorError('The page doesn\'t contain any tracks') @@ -139,6 +135,8 @@ class BandcampAlbumIE(InfoExtractor): title = self._search_regex(r'album_title : "(.*?)"', webpage, 'title') return { '_type': 'playlist', + 'id': playlist_id, + 'display_id': display_id, 'title': title, 'entries': entries, } diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index 6d785c0..75e608f 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -13,13 +13,13 @@ class BBCCoUkIE(SubtitlesInfoExtractor): _TESTS = [ { - 'url': 'http://www.bbc.co.uk/programmes/p01q7wz1', + 'url': 'http://www.bbc.co.uk/programmes/b039g8p7', 'info_dict': { - 'id': 'p01q7wz4', + 'id': 'b039d07m', 'ext': 'flv', - 'title': 'Friction: Blu Mar Ten guest mix: Blu Mar Ten - Guest Mix', - 'description': 'Blu Mar Ten deliver a Guest Mix for Friction.', - 'duration': 1936, + 'title': 'Kaleidoscope: Leonard Cohen', + 'description': 'md5:db4755d7a665ae72343779f7dacb402c', + 'duration': 1740, }, 'params': { # rtmp download @@ -38,7 +38,8 @@ class BBCCoUkIE(SubtitlesInfoExtractor): 'params': { # rtmp download 'skip_download': True, - } + }, + 'skip': 'Episode is no longer available on BBC iPlayer Radio', }, { 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/', @@ -161,6 +162,11 @@ class BBCCoUkIE(SubtitlesInfoExtractor): mobj = re.match(self._VALID_URL, url) group_id = mobj.group('id') + webpage = self._download_webpage(url, group_id, 'Downloading video page') + if re.search(r'id="emp-error" 
class="notinuk">', webpage): + raise ExtractorError('Currently BBC iPlayer TV programmes are available to play in the UK only', + expected=True) + playlist = self._download_xml('http://www.bbc.co.uk/iplayer/playlist/%s' % group_id, group_id, 'Downloading playlist XML') diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py new file mode 100644 index 0000000..45067b9 --- /dev/null +++ b/youtube_dl/extractor/bilibili.py @@ -0,0 +1,106 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + compat_parse_qs, + ExtractorError, + int_or_none, + unified_strdate, +) + + +class BiliBiliIE(InfoExtractor): + _VALID_URL = r'http://www\.bilibili\.tv/video/av(?P<id>[0-9]+)/' + + _TEST = { + 'url': 'http://www.bilibili.tv/video/av1074402/', + 'md5': '2c301e4dab317596e837c3e7633e7d86', + 'info_dict': { + 'id': '1074402', + 'ext': 'flv', + 'title': '【金坷垃】金泡沫', + 'duration': 308, + 'upload_date': '20140420', + 'thumbnail': 're:^https?://.+\.jpg', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + video_code = self._search_regex( + r'(?s)<div itemprop="video".*?>(.*?)</div>', webpage, 'video code') + + title = self._html_search_meta( + 'media:title', video_code, 'title', fatal=True) + duration_str = self._html_search_meta( + 'duration', video_code, 'duration') + if duration_str is None: + duration = None + else: + duration_mobj = re.match( + r'^T(?:(?P<hours>[0-9]+)H)?(?P<minutes>[0-9]+)M(?P<seconds>[0-9]+)S$', + duration_str) + duration = ( + int_or_none(duration_mobj.group('hours'), default=0) * 3600 + + int(duration_mobj.group('minutes')) * 60 + + int(duration_mobj.group('seconds'))) + upload_date = unified_strdate(self._html_search_meta( + 'uploadDate', video_code, fatal=False)) + thumbnail = self._html_search_meta( + 'thumbnailUrl', video_code, 'thumbnail', 
fatal=False) + + player_params = compat_parse_qs(self._html_search_regex( + r'<iframe .*?class="player" src="https://secure.bilibili.tv/secure,([^"]+)"', + webpage, 'player params')) + + if 'cid' in player_params: + cid = player_params['cid'][0] + + lq_doc = self._download_xml( + 'http://interface.bilibili.cn/v_cdn_play?cid=%s' % cid, + video_id, + note='Downloading LQ video info' + ) + lq_durl = lq_doc.find('.//durl') + formats = [{ + 'format_id': 'lq', + 'quality': 1, + 'url': lq_durl.find('./url').text, + 'filesize': int_or_none( + lq_durl.find('./size'), get_attr='text'), + }] + + hq_doc = self._download_xml( + 'http://interface.bilibili.cn/playurl?cid=%s' % cid, + video_id, + note='Downloading HQ video info', + fatal=False, + ) + if hq_doc is not False: + hq_durl = hq_doc.find('.//durl') + formats.append({ + 'format_id': 'hq', + 'quality': 2, + 'ext': 'flv', + 'url': hq_durl.find('./url').text, + 'filesize': int_or_none( + hq_durl.find('./size'), get_attr='text'), + }) + else: + raise ExtractorError('Unsupported player parameters: %r' % (player_params,)) + + self._sort_formats(formats) + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'duration': duration, + 'upload_date': upload_date, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/blinkx.py b/youtube_dl/extractor/blinkx.py index 96408e4..38ccd95 100644 --- a/youtube_dl/extractor/blinkx.py +++ b/youtube_dl/extractor/blinkx.py @@ -1,6 +1,5 @@ from __future__ import unicode_literals -import datetime import json import re @@ -19,15 +18,16 @@ class BlinkxIE(InfoExtractor): 'file': '8aQUy7GV.mp4', 'md5': '2e9a07364af40163a908edbf10bb2492', 'info_dict': { - "title": "Police Car Rolls Away", - "uploader": "stupidvideos.com", - "upload_date": "20131215", - "description": "A police car gently rolls away from a fight. 
Maybe it felt weird being around a confrontation and just had to get out of there!", - "duration": 14.886, - "thumbnails": [{ - "width": 100, - "height": 76, - "url": "http://cdn.blinkx.com/stream/b/41/StupidVideos/20131215/1873969261/1873969261_tn_0.jpg", + 'title': 'Police Car Rolls Away', + 'uploader': 'stupidvideos.com', + 'upload_date': '20131215', + 'timestamp': 1387068000, + 'description': 'A police car gently rolls away from a fight. Maybe it felt weird being around a confrontation and just had to get out of there!', + 'duration': 14.886, + 'thumbnails': [{ + 'width': 100, + 'height': 76, + 'url': 'http://cdn.blinkx.com/stream/b/41/StupidVideos/20131215/1873969261/1873969261_tn_0.jpg', }], }, } @@ -41,9 +41,6 @@ class BlinkxIE(InfoExtractor): 'video=%s' % video_id) data_json = self._download_webpage(api_url, display_id) data = json.loads(data_json)['api']['results'][0] - dt = datetime.datetime.fromtimestamp(data['pubdate_epoch']) - pload_date = dt.strftime('%Y%m%d') - duration = None thumbnails = [] formats = [] @@ -64,10 +61,7 @@ class BlinkxIE(InfoExtractor): vcodec = remove_start(m['vcodec'], 'ff') acodec = remove_start(m['acodec'], 'ff') tbr = (int(m['vbr']) + int(m['abr'])) // 1000 - format_id = (u'%s-%sk-%s' % - (vcodec, - tbr, - m['w'])) + format_id = u'%s-%sk-%s' % (vcodec, tbr, m['w']) formats.append({ 'format_id': format_id, 'url': m['link'], @@ -88,7 +82,7 @@ class BlinkxIE(InfoExtractor): 'title': data['title'], 'formats': formats, 'uploader': data['channel_name'], - 'upload_date': pload_date, + 'timestamp': data['pubdate_epoch'], 'description': data.get('description'), 'thumbnails': thumbnails, 'duration': duration, diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py index a26001b..d4da089 100644 --- a/youtube_dl/extractor/bliptv.py +++ b/youtube_dl/extractor/bliptv.py @@ -1,102 +1,124 @@ from __future__ import unicode_literals -import datetime import re from .common import InfoExtractor from .subtitles import 
SubtitlesInfoExtractor from ..utils import ( - compat_str, compat_urllib_request, - unescapeHTML, + parse_iso8601, + compat_urlparse, + clean_html, + compat_str, ) class BlipTVIE(SubtitlesInfoExtractor): - """Information extractor for blip.tv""" - - _VALID_URL = r'https?://(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(?P<presumptive_id>.+)$' - - _TESTS = [{ - 'url': 'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352', - 'md5': 'c6934ad0b6acf2bd920720ec888eb812', - 'info_dict': { - 'id': '5779306', - 'ext': 'mov', - 'upload_date': '20111205', - 'description': 'md5:9bc31f227219cde65e47eeec8d2dc596', - 'uploader': 'Comic Book Resources - CBR TV', - 'title': 'CBR EXCLUSIVE: "Gotham City Imposters" Bats VS Jokerz Short 3', - } - }, { - # https://github.com/rg3/youtube-dl/pull/2274 - 'note': 'Video with subtitles', - 'url': 'http://blip.tv/play/h6Uag5OEVgI.html', - 'md5': '309f9d25b820b086ca163ffac8031806', - 'info_dict': { - 'id': '6586561', - 'ext': 'mp4', - 'uploader': 'Red vs. Blue', - 'description': 'One-Zero-One', - 'upload_date': '20130614', - 'title': 'Red vs. 
Blue Season 11 Episode 1', + _VALID_URL = r'https?://(?:\w+\.)?blip\.tv/(?:(?:.+-|rss/flash/)(?P<id>\d+)|((?:play/|api\.swf#)(?P<lookup_id>[\da-zA-Z]+)))' + + _TESTS = [ + { + 'url': 'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352', + 'md5': 'c6934ad0b6acf2bd920720ec888eb812', + 'info_dict': { + 'id': '5779306', + 'ext': 'mov', + 'title': 'CBR EXCLUSIVE: "Gotham City Imposters" Bats VS Jokerz Short 3', + 'description': 'md5:9bc31f227219cde65e47eeec8d2dc596', + 'timestamp': 1323138843, + 'upload_date': '20111206', + 'uploader': 'cbr', + 'uploader_id': '679425', + 'duration': 81, + } + }, + { + # https://github.com/rg3/youtube-dl/pull/2274 + 'note': 'Video with subtitles', + 'url': 'http://blip.tv/play/h6Uag5OEVgI.html', + 'md5': '309f9d25b820b086ca163ffac8031806', + 'info_dict': { + 'id': '6586561', + 'ext': 'mp4', + 'title': 'Red vs. Blue Season 11 Episode 1', + 'description': 'One-Zero-One', + 'timestamp': 1371261608, + 'upload_date': '20130615', + 'uploader': 'redvsblue', + 'uploader_id': '792887', + 'duration': 279, + } } - }] + ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - presumptive_id = mobj.group('presumptive_id') + lookup_id = mobj.group('lookup_id') # See https://github.com/rg3/youtube-dl/issues/857 - embed_mobj = re.match(r'https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)([a-zA-Z0-9]+)', url) - if embed_mobj: - info_url = 'http://blip.tv/play/%s.x?p=1' % embed_mobj.group(1) - info_page = self._download_webpage(info_url, embed_mobj.group(1)) - video_id = self._search_regex( - r'data-episode-id="([0-9]+)', info_page, 'video_id') - return self.url_result('http://blip.tv/a/a-' + video_id, 'BlipTV') - - cchar = '&' if '?' in url else '?' 
- json_url = url + cchar + 'skin=json&version=2&no_wrap=1' - request = compat_urllib_request.Request(json_url) - request.add_header('User-Agent', 'iTunes/10.6.1') - - json_data = self._download_json(request, video_id=presumptive_id) - - if 'Post' in json_data: - data = json_data['Post'] + if lookup_id: + info_page = self._download_webpage( + 'http://blip.tv/play/%s.x?p=1' % lookup_id, lookup_id, 'Resolving lookup id') + video_id = self._search_regex(r'data-episode-id="([0-9]+)', info_page, 'video_id') else: - data = json_data + video_id = mobj.group('id') + + rss = self._download_xml('http://blip.tv/rss/flash/%s' % video_id, video_id, 'Downloading video RSS') + + def blip(s): + return '{http://blip.tv/dtd/blip/1.0}%s' % s + + def media(s): + return '{http://search.yahoo.com/mrss/}%s' % s + + def itunes(s): + return '{http://www.itunes.com/dtds/podcast-1.0.dtd}%s' % s + + item = rss.find('channel/item') + + video_id = item.find(blip('item_id')).text + title = item.find('./title').text + description = clean_html(compat_str(item.find(blip('puredescription')).text)) + timestamp = parse_iso8601(item.find(blip('datestamp')).text) + uploader = item.find(blip('user')).text + uploader_id = item.find(blip('userid')).text + duration = int(item.find(blip('runtime')).text) + media_thumbnail = item.find(media('thumbnail')) + thumbnail = media_thumbnail.get('url') if media_thumbnail is not None else item.find(itunes('image')).text + categories = [category.text for category in item.findall('category')] - video_id = compat_str(data['item_id']) - upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d') - subtitles = {} formats = [] - if 'additionalMedia' in data: - for f in data['additionalMedia']: - if f.get('file_type_srt') == 1: - LANGS = { - 'english': 'en', - } - lang = f['role'].rpartition('-')[-1].strip().lower() - langcode = LANGS.get(lang, lang) - subtitles[langcode] = f['url'] - continue - if not int(f['media_width']): # filter 
m3u8 - continue + subtitles = {} + + media_group = item.find(media('group')) + for media_content in media_group.findall(media('content')): + url = media_content.get('url') + role = media_content.get(blip('role')) + msg = self._download_webpage( + url + '?showplayer=20140425131715&referrer=http://blip.tv&mask=7&skin=flashvars&view=url', + video_id, 'Resolving URL for %s' % role) + real_url = compat_urlparse.parse_qs(msg)['message'][0] + + media_type = media_content.get('type') + if media_type == 'text/srt' or url.endswith('.srt'): + LANGS = { + 'english': 'en', + } + lang = role.rpartition('-')[-1].strip().lower() + langcode = LANGS.get(lang, lang) + subtitles[langcode] = url + elif media_type.startswith('video/'): formats.append({ - 'url': f['url'], - 'format_id': f['role'], - 'width': int(f['media_width']), - 'height': int(f['media_height']), + 'url': real_url, + 'format_id': role, + 'format_note': media_type, + 'vcodec': media_content.get(blip('vcodec')), + 'acodec': media_content.get(blip('acodec')), + 'filesize': media_content.get('filesize'), + 'width': int(media_content.get('width')), + 'height': int(media_content.get('height')), }) - else: - formats.append({ - 'url': data['media']['url'], - 'width': int(data['media']['width']), - 'height': int(data['media']['height']), - }) self._sort_formats(formats) # subtitles @@ -107,12 +129,14 @@ class BlipTVIE(SubtitlesInfoExtractor): return { 'id': video_id, - 'uploader': data['display_name'], - 'upload_date': upload_date, - 'title': data['title'], - 'thumbnail': data['thumbnailUrl'], - 'description': data['description'], - 'user_agent': 'iTunes/10.6.1', + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'duration': duration, + 'thumbnail': thumbnail, + 'categories': categories, 'formats': formats, 'subtitles': video_subtitles, } diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py index 2415ce4..25fb79e 100644 
--- a/youtube_dl/extractor/bloomberg.py +++ b/youtube_dl/extractor/bloomberg.py @@ -1,22 +1,21 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor -from .ooyala import OoyalaIE class BloombergIE(InfoExtractor): _VALID_URL = r'https?://www\.bloomberg\.com/video/(?P<name>.+?)\.html' _TEST = { - u'url': u'http://www.bloomberg.com/video/shah-s-presentation-on-foreign-exchange-strategies-qurhIVlJSB6hzkVi229d8g.html', - u'file': u'12bzhqZTqQHmmlA8I-i0NpzJgcG5NNYX.mp4', - u'info_dict': { - u'title': u'Shah\'s Presentation on Foreign-Exchange Strategies', - u'description': u'md5:abc86e5236f9f0e4866c59ad36736686', - }, - u'params': { - # Requires ffmpeg (m3u8 manifest) - u'skip_download': True, + 'url': 'http://www.bloomberg.com/video/shah-s-presentation-on-foreign-exchange-strategies-qurhIVlJSB6hzkVi229d8g.html', + 'md5': '7bf08858ff7c203c870e8a6190e221e5', + 'info_dict': { + 'id': 'qurhIVlJSB6hzkVi229d8g', + 'ext': 'flv', + 'title': 'Shah\'s Presentation on Foreign-Exchange Strategies', + 'description': 'md5:0681e0d30dcdfc6abf34594961d8ea88', }, } @@ -24,7 +23,16 @@ class BloombergIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) name = mobj.group('name') webpage = self._download_webpage(url, name) - embed_code = self._search_regex( - r'<source src="https?://[^/]+/[^/]+/[^/]+/([^/]+)', webpage, - 'embed code') - return OoyalaIE._build_url_result(embed_code) + f4m_url = self._search_regex( + r'<source src="(https?://[^"]+\.f4m.*?)"', webpage, + 'f4m url') + title = re.sub(': Video$', '', self._og_search_title(webpage)) + + return { + 'id': name.split('-')[-1], + 'title': title, + 'url': f4m_url, + 'ext': 'flv', + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + } diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py new file mode 100644 index 0000000..b5b56ff --- /dev/null +++ b/youtube_dl/extractor/br.py @@ -0,0 +1,139 @@ +# coding: utf-8 +from __future__ 
import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, +) + + +class BRIE(InfoExtractor): + IE_DESC = 'Bayerischer Rundfunk Mediathek' + _VALID_URL = r'https?://(?:www\.)?br\.de/(?:[a-z0-9\-]+/)+(?P<id>[a-z0-9\-]+)\.html' + _BASE_URL = 'http://www.br.de' + + _TESTS = [ + { + 'url': 'http://www.br.de/mediathek/video/anselm-gruen-114.html', + 'md5': 'c4f83cf0f023ba5875aba0bf46860df2', + 'info_dict': { + 'id': '2c8d81c5-6fb7-4a74-88d4-e768e5856532', + 'ext': 'mp4', + 'title': 'Feiern und Verzichten', + 'description': 'Anselm Grün: Feiern und Verzichten', + 'uploader': 'BR/Birgit Baier', + 'upload_date': '20140301', + } + }, + { + 'url': 'http://www.br.de/mediathek/video/sendungen/unter-unserem-himmel/unter-unserem-himmel-alpen-ueber-den-pass-100.html', + 'md5': 'ab451b09d861dbed7d7cc9ab0be19ebe', + 'info_dict': { + 'id': '2c060e69-3a27-4e13-b0f0-668fac17d812', + 'ext': 'mp4', + 'title': 'Über den Pass', + 'description': 'Die Eroberung der Alpen: Über den Pass', + } + }, + { + 'url': 'http://www.br.de/nachrichten/schaeuble-haushaltsentwurf-bundestag-100.html', + 'md5': '3db0df1a9a9cd9fa0c70e6ea8aa8e820', + 'info_dict': { + 'id': 'c6aae3de-2cf9-43f2-957f-f17fef9afaab', + 'ext': 'aac', + 'title': '"Keine neuen Schulden im nächsten Jahr"', + 'description': 'Haushaltsentwurf: "Keine neuen Schulden im nächsten Jahr"', + } + }, + { + 'url': 'http://www.br.de/radio/bayern1/service/team/videos/team-video-erdelt100.html', + 'md5': 'dbab0aef2e047060ea7a21fc1ce1078a', + 'info_dict': { + 'id': '6ba73750-d405-45d3-861d-1ce8c524e059', + 'ext': 'mp4', + 'title': 'Umweltbewusster Häuslebauer', + 'description': 'Uwe Erdelt: Umweltbewusster Häuslebauer', + } + }, + { + 'url': 'http://www.br.de/fernsehen/br-alpha/sendungen/kant-fuer-anfaenger/kritik-der-reinen-vernunft/kant-kritik-01-metaphysik100.html', + 'md5': '23bca295f1650d698f94fc570977dae3', + 'info_dict': { + 'id': 'd982c9ce-8648-4753-b358-98abb8aec43d', + 
'ext': 'mp4', + 'title': 'Folge 1 - Metaphysik', + 'description': 'Kant für Anfänger: Folge 1 - Metaphysik', + 'uploader': 'Eva Maria Steimle', + 'upload_date': '20140117', + } + }, + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('id') + page = self._download_webpage(url, display_id) + xml_url = self._search_regex( + r"return BRavFramework\.register\(BRavFramework\('avPlayer_(?:[a-f0-9-]{36})'\)\.setup\({dataURL:'(/(?:[a-z0-9\-]+/)+[a-z0-9/~_.-]+)'}\)\);", page, 'XMLURL') + xml = self._download_xml(self._BASE_URL + xml_url, None) + + medias = [] + + for xml_media in xml.findall('video') + xml.findall('audio'): + media = { + 'id': xml_media.get('externalId'), + 'title': xml_media.find('title').text, + 'formats': self._extract_formats(xml_media.find('assets')), + 'thumbnails': self._extract_thumbnails(xml_media.find('teaserImage/variants')), + 'description': ' '.join(xml_media.find('shareTitle').text.splitlines()), + 'webpage_url': xml_media.find('permalink').text + } + if xml_media.find('author').text: + media['uploader'] = xml_media.find('author').text + if xml_media.find('broadcastDate').text: + media['upload_date'] = ''.join(reversed(xml_media.find('broadcastDate').text.split('.'))) + medias.append(media) + + if len(medias) > 1: + self._downloader.report_warning( + 'found multiple medias; please ' + 'report this with the video URL to http://yt-dl.org/bug') + if not medias: + raise ExtractorError('No media entries found') + return medias[0] + + def _extract_formats(self, assets): + + def text_or_none(asset, tag): + elem = asset.find(tag) + return None if elem is None else elem.text + + formats = [{ + 'url': text_or_none(asset, 'downloadUrl'), + 'ext': text_or_none(asset, 'mediaType'), + 'format_id': asset.get('type'), + 'width': int_or_none(text_or_none(asset, 'frameWidth')), + 'height': int_or_none(text_or_none(asset, 'frameHeight')), + 'tbr': int_or_none(text_or_none(asset, 'bitrateVideo')), + 'abr': 
int_or_none(text_or_none(asset, 'bitrateAudio')), + 'vcodec': text_or_none(asset, 'codecVideo'), + 'acodec': text_or_none(asset, 'codecAudio'), + 'container': text_or_none(asset, 'mediaType'), + 'filesize': int_or_none(text_or_none(asset, 'size')), + } for asset in assets.findall('asset') + if asset.find('downloadUrl') is not None] + + self._sort_formats(formats) + return formats + + def _extract_thumbnails(self, variants): + thumbnails = [{ + 'url': self._BASE_URL + variant.find('url').text, + 'width': int_or_none(variant.find('width').text), + 'height': int_or_none(variant.find('height').text), + } for variant in variants.findall('variant')] + thumbnails.sort(key=lambda x: x['width'] * x['height'], reverse=True) + return thumbnails diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py index 8ec6dda..1bfc9f3 100644 --- a/youtube_dl/extractor/breakcom.py +++ b/youtube_dl/extractor/breakcom.py @@ -23,13 +23,14 @@ class BreakIE(InfoExtractor): video_id = mobj.group(1).split("-")[-1] embed_url = 'http://www.break.com/embed/%s' % video_id webpage = self._download_webpage(embed_url, video_id) - info_json = self._search_regex(r'var embedVars = ({.*?});', webpage, - 'info json', flags=re.DOTALL) + info_json = self._search_regex(r'var embedVars = ({.*})\s*?</script>', + webpage, 'info json', flags=re.DOTALL) info = json.loads(info_json) video_url = info['videoUri'] - m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', video_url) - if m_youtube is not None: - return self.url_result(m_youtube.group(1), 'Youtube') + youtube_id = info.get('youtubeId') + if youtube_id: + return self.url_result(youtube_id, 'Youtube') + final_url = video_url + '?' 
+ info['AuthToken'] return { 'id': video_id, diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 83eec84..3c02c29 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -87,7 +87,7 @@ class BrightcoveIE(InfoExtractor): object_str = object_str.replace('<--', '<!--') object_str = fix_xml_ampersands(object_str) - object_doc = xml.etree.ElementTree.fromstring(object_str) + object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8')) fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars') if fv_el is not None: @@ -140,7 +140,11 @@ class BrightcoveIE(InfoExtractor): url_m = re.search(r'<meta\s+property="og:video"\s+content="(http://c.brightcove.com/[^"]+)"', webpage) if url_m: - return [unescapeHTML(url_m.group(1))] + url = unescapeHTML(url_m.group(1)) + # Some sites don't add it, we can't download with this url, for example: + # http://www.ktvu.com/videos/news/raw-video-caltrain-releases-video-of-man-almost/vCTZdY/ + if 'playerKey' in url: + return [url] matches = re.findall( r'''(?sx)<object diff --git a/youtube_dl/extractor/byutv.py b/youtube_dl/extractor/byutv.py new file mode 100644 index 0000000..cf19b7b --- /dev/null +++ b/youtube_dl/extractor/byutv.py @@ -0,0 +1,48 @@ +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class BYUtvIE(InfoExtractor): + _VALID_URL = r'^https?://(?:www\.)?byutv.org/watch/[0-9a-f-]+/(?P<video_id>[^/?#]+)' + _TEST = { + 'url': 'http://www.byutv.org/watch/44e80f7b-e3ba-43ba-8c51-b1fd96c94a79/granite-flats-talking', + 'info_dict': { + 'id': 'granite-flats-talking', + 'ext': 'mp4', + 'description': 'md5:4e9a7ce60f209a33eca0ac65b4918e1c', + 'title': 'Talking', + 'thumbnail': 're:^https?://.*promo.*' + }, + 'params': { + 'skip_download': True, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = 
mobj.group('video_id') + + webpage = self._download_webpage(url, video_id) + episode_code = self._search_regex( + r'(?s)episode:(.*?\}),\s*\n', webpage, 'episode information') + episode_json = re.sub( + r'(\n\s+)([a-zA-Z]+):\s+\'(.*?)\'', r'\1"\2": "\3"', episode_code) + ep = json.loads(episode_json) + + if ep['providerType'] == 'Ooyala': + return { + '_type': 'url_transparent', + 'ie_key': 'Ooyala', + 'url': 'ooyala:%s' % ep['providerId'], + 'id': video_id, + 'title': ep['title'], + 'description': ep.get('description'), + 'thumbnail': ep.get('imageThumbnail'), + } + else: + raise ExtractorError('Unsupported provider %s' % ep['provider']) diff --git a/youtube_dl/extractor/c56.py b/youtube_dl/extractor/c56.py index 690bc7c..cb96c38 100644 --- a/youtube_dl/extractor/c56.py +++ b/youtube_dl/extractor/c56.py @@ -2,39 +2,46 @@ from __future__ import unicode_literals import re -import json from .common import InfoExtractor class C56IE(InfoExtractor): - _VALID_URL = r'https?://((www|player)\.)?56\.com/(.+?/)?(v_|(play_album.+-))(?P<textid>.+?)\.(html|swf)' + _VALID_URL = r'https?://(?:(?:www|player)\.)?56\.com/(?:.+?/)?(?:v_|(?:play_album.+-))(?P<textid>.+?)\.(?:html|swf)' IE_NAME = '56.com' _TEST = { 'url': 'http://www.56.com/u39/v_OTM0NDA3MTY.html', - 'file': '93440716.flv', 'md5': 'e59995ac63d0457783ea05f93f12a866', 'info_dict': { + 'id': '93440716', + 'ext': 'flv', 'title': '网事知多少 第32期:车怒', + 'duration': 283.813, }, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE) text_id = mobj.group('textid') - info_page = self._download_webpage('http://vxml.56.com/json/%s/' % text_id, - text_id, 'Downloading video info') - info = json.loads(info_page)['info'] - formats = [{ - 'format_id': f['type'], - 'filesize': int(f['filesize']), - 'url': f['url'] - } for f in info['rfiles']] + + page = self._download_json( + 'http://vxml.56.com/json/%s/' % text_id, text_id, 'Downloading video info') + + info = page['info'] + + formats = [ + { + 
'format_id': f['type'], + 'filesize': int(f['filesize']), + 'url': f['url'] + } for f in info['rfiles'] + ] self._sort_formats(formats) return { 'id': info['vid'], 'title': info['Subject'], + 'duration': int(info['duration']) / 1000.0, 'formats': formats, 'thumbnail': info.get('bimg') or info.get('img'), } diff --git a/youtube_dl/extractor/canal13cl.py b/youtube_dl/extractor/canal13cl.py new file mode 100644 index 0000000..93241fe --- /dev/null +++ b/youtube_dl/extractor/canal13cl.py @@ -0,0 +1,48 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class Canal13clIE(InfoExtractor): + _VALID_URL = r'^http://(?:www\.)?13\.cl/(?:[^/?#]+/)*(?P<id>[^/?#]+)' + _TEST = { + 'url': 'http://www.13.cl/t13/nacional/el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', + 'md5': '4cb1fa38adcad8fea88487a078831755', + 'info_dict': { + 'id': '1403022125', + 'display_id': 'el-circulo-de-hierro-de-michelle-bachelet-en-su-regreso-a-la-moneda', + 'ext': 'mp4', + 'title': 'El "círculo de hierro" de Michelle Bachelet en su regreso a La Moneda', + 'description': '(Foto: Agencia Uno) En nueve días más, Michelle Bachelet va a asumir por segunda vez como presidenta de la República. 
Entre aquellos que la acompañarán hay caras que se repiten y otras que se consolidan en su entorno de colaboradores más cercanos.', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('id') + + webpage = self._download_webpage(url, display_id) + + title = self._html_search_meta( + 'twitter:title', webpage, 'title', fatal=True) + description = self._html_search_meta( + 'twitter:description', webpage, 'description') + url = self._html_search_regex( + r'articuloVideo = \"(.*?)\"', webpage, 'url') + real_id = self._search_regex( + r'[^0-9]([0-9]{7,})[^0-9]', url, 'id', default=display_id) + thumbnail = self._html_search_regex( + r'articuloImagen = \"(.*?)\"', webpage, 'thumbnail') + + return { + 'id': real_id, + 'display_id': display_id, + 'url': url, + 'title': title, + 'description': description, + 'ext': 'mp4', + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index 3d8d7f9..c4fefef 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -1,4 +1,6 @@ # coding: utf-8 +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -9,11 +11,12 @@ class Canalc2IE(InfoExtractor): _VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?.*?idVideo=(?P<id>\d+)' _TEST = { - u'url': u'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui', - u'file': u'12163.mp4', - u'md5': u'060158428b650f896c542dfbb3d6487f', - u'info_dict': { - u'title': u'Terrasses du Numérique' + 'url': 'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui', + 'md5': '060158428b650f896c542dfbb3d6487f', + 'info_dict': { + 'id': '12163', + 'ext': 'mp4', + 'title': 'Terrasses du Numérique' } } @@ -28,10 +31,11 @@ class Canalc2IE(InfoExtractor): video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file_name title = self._html_search_regex( - r'class="evenement8">(.*?)</a>', webpage, u'title') - - return {'id': video_id, - 'ext': 'mp4', - 
'url': video_url, - 'title': title, - } + r'class="evenement8">(.*?)</a>', webpage, 'title') + + return { + 'id': video_id, + 'ext': 'mp4', + 'url': video_url, + 'title': title, + } diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index 7cdcd83..0202078 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -1,53 +1,72 @@ # encoding: utf-8 +from __future__ import unicode_literals + import re from .common import InfoExtractor -from ..utils import unified_strdate +from ..utils import ( + unified_strdate, + url_basename, +) class CanalplusIE(InfoExtractor): - _VALID_URL = r'https?://(www\.canalplus\.fr/.*?/(?P<path>.*)|player\.canalplus\.fr/#/(?P<id>\d+))' + _VALID_URL = r'https?://(?:www\.canalplus\.fr/.*?/(?P<path>.*)|player\.canalplus\.fr/#/(?P<id>[0-9]+))' _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s' - IE_NAME = u'canalplus.fr' + IE_NAME = 'canalplus.fr' _TEST = { - u'url': u'http://www.canalplus.fr/c-infos-documentaires/pid1830-c-zapping.html?vid=922470', - u'file': u'922470.flv', - u'info_dict': { - u'title': u'Zapping - 26/08/13', - u'description': u'Le meilleur de toutes les chaînes, tous les jours.\nEmission du 26 août 2013', - u'upload_date': u'20130826', - }, - u'params': { - u'skip_download': True, + 'url': 'http://www.canalplus.fr/c-infos-documentaires/pid1830-c-zapping.html?vid=922470', + 'md5': '3db39fb48b9685438ecf33a1078023e4', + 'info_dict': { + 'id': '922470', + 'ext': 'flv', + 'title': 'Zapping - 26/08/13', + 'description': 'Le meilleur de toutes les chaînes, tous les jours.\nEmission du 26 août 2013', + 'upload_date': '20130826', }, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.groupdict().get('id') + + # Beware, some subclasses do not define an id group + display_id = url_basename(mobj.group('path')) + if video_id is None: - webpage = self._download_webpage(url, mobj.group('path')) - video_id = 
self._search_regex(r'videoId = "(\d+)";', webpage, u'video id') + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex(r'<canal:player videoId="(\d+)"', webpage, 'video id') + info_url = self._VIDEO_INFO_TEMPLATE % video_id - doc = self._download_xml(info_url,video_id, - u'Downloading video info') + doc = self._download_xml(info_url, video_id, 'Downloading video XML') - self.report_extraction(video_id) video_info = [video for video in doc if video.find('ID').text == video_id][0] - infos = video_info.find('INFOS') media = video_info.find('MEDIA') - formats = [media.find('VIDEOS/%s' % format) - for format in ['BAS_DEBIT', 'HAUT_DEBIT', 'HD']] - video_url = [format.text for format in formats if format is not None][-1] - - return {'id': video_id, - 'title': u'%s - %s' % (infos.find('TITRAGE/TITRE').text, - infos.find('TITRAGE/SOUS_TITRE').text), - 'url': video_url, - 'ext': 'flv', - 'upload_date': unified_strdate(infos.find('PUBLICATION/DATE').text), - 'thumbnail': media.find('IMAGES/GRAND').text, - 'description': infos.find('DESCRIPTION').text, - 'view_count': int(infos.find('NB_VUES').text), - } + infos = video_info.find('INFOS') + + preferences = ['MOBILE', 'BAS_DEBIT', 'HAUT_DEBIT', 'HD', 'HLS', 'HDS'] + + formats = [ + { + 'url': fmt.text + '?hdcore=2.11.3' if fmt.tag == 'HDS' else fmt.text, + 'format_id': fmt.tag, + 'ext': 'mp4' if fmt.tag == 'HLS' else 'flv', + 'preference': preferences.index(fmt.tag) if fmt.tag in preferences else -1, + } for fmt in media.find('VIDEOS') if fmt.text + ] + self._sort_formats(formats) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': '%s - %s' % (infos.find('TITRAGE/TITRE').text, + infos.find('TITRAGE/SOUS_TITRE').text), + 'upload_date': unified_strdate(infos.find('PUBLICATION/DATE').text), + 'thumbnail': media.find('IMAGES/GRAND').text, + 'description': infos.find('DESCRIPTION').text, + 'view_count': int(infos.find('NB_VUES').text), + 'like_count': int(infos.find('NB_LIKES').text), 
+ 'comment_count': int(infos.find('NB_COMMENTS').text), + 'formats': formats, + } \ No newline at end of file diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py new file mode 100644 index 0000000..0bce793 --- /dev/null +++ b/youtube_dl/extractor/cbsnews.py @@ -0,0 +1,87 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +import re +import json + +from .common import InfoExtractor + + +class CBSNewsIE(InfoExtractor): + IE_DESC = 'CBS News' + _VALID_URL = r'http://(?:www\.)?cbsnews\.com/(?:[^/]+/)+(?P<id>[\da-z_-]+)' + + _TESTS = [ + { + 'url': 'http://www.cbsnews.com/news/tesla-and-spacex-elon-musks-industrial-empire/', + 'info_dict': { + 'id': 'tesla-and-spacex-elon-musks-industrial-empire', + 'ext': 'flv', + 'title': 'Tesla and SpaceX: Elon Musk\'s industrial empire', + 'thumbnail': 'http://beta.img.cbsnews.com/i/2014/03/30/60147937-2f53-4565-ad64-1bdd6eb64679/60-0330-pelley-640x360.jpg', + 'duration': 791, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, + { + 'url': 'http://www.cbsnews.com/videos/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/', + 'info_dict': { + 'id': 'fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack', + 'ext': 'flv', + 'title': 'Fort Hood shooting: Army downplays mental illness as cause of attack', + 'thumbnail': 'http://cbsnews2.cbsistatic.com/hub/i/r/2014/04/04/0c9fbc66-576b-41ca-8069-02d122060dd2/thumbnail/140x90/6dad7a502f88875ceac38202984b6d58/en-0404-werner-replace-640x360.jpg', + 'duration': 205, + }, + 'params': { + # rtmp download + 'skip_download': True, + }, + }, + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + + video_info = json.loads(self._html_search_regex( + r'(?:<ul class="media-list items" id="media-related-items"><li data-video-info|<div id="cbsNewsVideoPlayer" data-video-player-options)=\'({.+?})\'', + 
webpage, 'video JSON info')) + + item = video_info['item'] if 'item' in video_info else video_info + title = item.get('articleTitle') or item.get('hed') + duration = item.get('duration') + thumbnail = item.get('mediaImage') or item.get('thumbnail') + + formats = [] + for format_id in ['RtmpMobileLow', 'RtmpMobileHigh', 'Hls', 'RtmpDesktop']: + uri = item.get('media' + format_id + 'URI') + if not uri: + continue + fmt = { + 'url': uri, + 'format_id': format_id, + } + if uri.startswith('rtmp'): + fmt.update({ + 'app': 'ondemand?auth=cbs', + 'play_path': 'mp4:' + uri.split('<break>')[-1], + 'player_url': 'http://www.cbsnews.com/[[IMPORT]]/vidtech.cbsinteractive.com/player/3_3_0/CBSI_PLAYER_HD.swf', + 'page_url': 'http://www.cbsnews.com', + 'ext': 'flv', + }) + elif uri.endswith('.m3u8'): + fmt['ext'] = 'mp4' + formats.append(fmt) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + } \ No newline at end of file diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py new file mode 100644 index 0000000..90a3ddd --- /dev/null +++ b/youtube_dl/extractor/ceskatelevize.py @@ -0,0 +1,126 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_request, + compat_urllib_parse, + compat_urllib_parse_urlparse, + ExtractorError, +) + + +class CeskaTelevizeIE(InfoExtractor): + _VALID_URL = r'https?://www\.ceskatelevize\.cz/(porady|ivysilani)/(.+/)?(?P<id>[^?#]+)' + + _TESTS = [ + { + 'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/213512120230004-spanelska-chripka', + 'info_dict': { + 'id': '213512120230004', + 'ext': 'flv', + 'title': 'První republika: Španělská chřipka', + 'duration': 3107.4, + }, + 'params': { + 'skip_download': True, # requires rtmpdump + }, + 'skip': 'Works only from Czech Republic.', + }, + { + 'url': 
'http://www.ceskatelevize.cz/ivysilani/1030584952-tsatsiki-maminka-a-policajt', + 'info_dict': { + 'id': '20138143440', + 'ext': 'flv', + 'title': 'Tsatsiki, maminka a policajt', + 'duration': 6754.1, + }, + 'params': { + 'skip_download': True, # requires rtmpdump + }, + 'skip': 'Works only from Czech Republic.', + }, + { + 'url': 'http://www.ceskatelevize.cz/ivysilani/10532695142-prvni-republika/bonus/14716-zpevacka-z-duparny-bobina', + 'info_dict': { + 'id': '14716', + 'ext': 'flv', + 'title': 'První republika: Zpěvačka z Dupárny Bobina', + 'duration': 90, + }, + 'params': { + 'skip_download': True, # requires rtmpdump + }, + }, + ] + + def _real_extract(self, url): + url = url.replace('/porady/', '/ivysilani/').replace('/video/', '') + + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + + NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.' + if '%s</p>' % NOT_AVAILABLE_STRING in webpage: + raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) + + typ = self._html_search_regex(r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],', webpage, 'type') + episode_id = self._html_search_regex(r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],', webpage, 'episode_id') + + data = { + 'playlist[0][type]': typ, + 'playlist[0][id]': episode_id, + 'requestUrl': compat_urllib_parse_urlparse(url).path, + 'requestSource': 'iVysilani', + } + + req = compat_urllib_request.Request('http://www.ceskatelevize.cz/ivysilani/ajax/get-playlist-url', + data=compat_urllib_parse.urlencode(data)) + + req.add_header('Content-type', 'application/x-www-form-urlencoded') + req.add_header('x-addr', '127.0.0.1') + req.add_header('X-Requested-With', 'XMLHttpRequest') + req.add_header('Referer', url) + + playlistpage = self._download_json(req, video_id) + + req = compat_urllib_request.Request(compat_urllib_parse.unquote(playlistpage['url'])) + req.add_header('Referer', url) + + 
playlist = self._download_xml(req, video_id) + + formats = [] + for i in playlist.find('smilRoot/body'): + if 'AD' not in i.attrib['id']: + base_url = i.attrib['base'] + parsedurl = compat_urllib_parse_urlparse(base_url) + duration = i.attrib['duration'] + + for video in i.findall('video'): + if video.attrib['label'] != 'AD': + format_id = video.attrib['label'] + play_path = video.attrib['src'] + vbr = int(video.attrib['system-bitrate']) + + formats.append({ + 'format_id': format_id, + 'url': base_url, + 'vbr': vbr, + 'play_path': play_path, + 'app': parsedurl.path[1:] + '?' + parsedurl.query, + 'rtmp_live': True, + 'ext': 'flv', + }) + + self._sort_formats(formats) + + return { + 'id': episode_id, + 'title': self._html_search_regex(r'<title>(.+?) — iVysílání — Česká televize', webpage, 'title'), + 'duration': float(duration), + 'formats': formats, + } diff --git a/youtube_dl/extractor/cinemassacre.py b/youtube_dl/extractor/cinemassacre.py index f0d08ce..496271b 100644 --- a/youtube_dl/extractor/cinemassacre.py +++ b/youtube_dl/extractor/cinemassacre.py @@ -1,84 +1,94 @@ # encoding: utf-8 +from __future__ import unicode_literals + import re from .common import InfoExtractor from ..utils import ( ExtractorError, + int_or_none, ) class CinemassacreIE(InfoExtractor): - _VALID_URL = r'(?:http://)?(?:www\.)?(?P<url>cinemassacre\.com/(?P<date_Y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/.+?)(?:[/?].*)?' 
- _TESTS = [{ - u'url': u'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/', - u'file': u'19911.flv', - u'info_dict': { - u'upload_date': u'20121110', - u'title': u'“Angry Video Game Nerd: The Movie” – Trailer', - u'description': u'md5:fb87405fcb42a331742a0dce2708560b', - }, - u'params': { - # rtmp download - u'skip_download': True, - }, - }, - { - u'url': u'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940', - u'file': u'521be8ef82b16.flv', - u'info_dict': { - u'upload_date': u'20131002', - u'title': u'The Mummy’s Hand (1940)', - }, - u'params': { - # rtmp download - u'skip_download': True, + _VALID_URL = r'http://(?:www\.)?cinemassacre\.com/(?P<date_Y>[0-9]{4})/(?P<date_m>[0-9]{2})/(?P<date_d>[0-9]{2})/(?P<display_id>[^?#/]+)' + _TESTS = [ + { + 'url': 'http://cinemassacre.com/2012/11/10/avgn-the-movie-trailer/', + 'md5': 'fde81fbafaee331785f58cd6c0d46190', + 'info_dict': { + 'id': '19911', + 'ext': 'mp4', + 'upload_date': '20121110', + 'title': '“Angry Video Game Nerd: The Movie” – Trailer', + 'description': 'md5:fb87405fcb42a331742a0dce2708560b', + }, }, - }] + { + 'url': 'http://cinemassacre.com/2013/10/02/the-mummys-hand-1940', + 'md5': 'd72f10cd39eac4215048f62ab477a511', + 'info_dict': { + 'id': '521be8ef82b16', + 'ext': 'mp4', + 'upload_date': '20131002', + 'title': 'The Mummy’s Hand (1940)', + }, + } + ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('display_id') - webpage_url = u'http://' + mobj.group('url') - webpage = self._download_webpage(webpage_url, None) # Don't know video id yet + webpage = self._download_webpage(url, display_id) video_date = mobj.group('date_Y') + mobj.group('date_m') + mobj.group('date_d') mobj = re.search(r'src="(?P<embed_url>http://player\.screenwavemedia\.com/play/[a-zA-Z]+\.php\?id=(?:Cinemassacre-)?(?P<video_id>.+?))"', webpage) if not mobj: - raise ExtractorError(u'Can\'t extract embed url and video id') - playerdata_url = mobj.group(u'embed_url') - video_id = mobj.group(u'video_id') + raise ExtractorError('Can\'t 
extract embed url and video id') + playerdata_url = mobj.group('embed_url') + video_id = mobj.group('video_id') - video_title = self._html_search_regex(r'(?P<title>.+?)\|', - webpage, u'title') - video_description = self._html_search_regex(r'<div class="entry-content">(?P<description>.+?)</div>', - webpage, u'description', flags=re.DOTALL, fatal=False) - if len(video_description) == 0: - video_description = None + video_title = self._html_search_regex( + r'<title>(?P<title>.+?)\|', webpage, 'title') + video_description = self._html_search_regex( + r'<div class="entry-content">(?P<description>.+?)</div>', + webpage, 'description', flags=re.DOTALL, fatal=False) - playerdata = self._download_webpage(playerdata_url, video_id) - url = self._html_search_regex(r'\'streamer\': \'(?P<url>[^\']+)\'', playerdata, u'url') + playerdata = self._download_webpage(playerdata_url, video_id, 'Downloading player webpage') + video_thumbnail = self._search_regex( + r'image: \'(?P<thumbnail>[^\']+)\'', playerdata, 'thumbnail', fatal=False) + sd_url = self._search_regex(r'file: \'([^\']+)\', label: \'SD\'', playerdata, 'sd_file') + videolist_url = self._search_regex(r'file: \'([^\']+\.smil)\'}', playerdata, 'videolist_url') - sd_file = self._html_search_regex(r'\'file\': \'(?P<sd_file>[^\']+)\'', playerdata, u'sd_file') - hd_file = self._html_search_regex(r'\'?file\'?: "(?P<hd_file>[^"]+)"', playerdata, u'hd_file') - video_thumbnail = self._html_search_regex(r'\'image\': \'(?P<thumbnail>[^\']+)\'', playerdata, u'thumbnail', fatal=False) + videolist = self._download_xml(videolist_url, video_id, 'Downloading videolist XML') - formats = [ - { - 'url': url, - 'play_path': 'mp4:' + sd_file, - 'rtmp_live': True, # workaround - 'ext': 'flv', - 'format': 'sd', - 'format_id': 'sd', - }, - { - 'url': url, - 'play_path': 'mp4:' + hd_file, - 'rtmp_live': True, # workaround - 'ext': 'flv', - 'format': 'hd', - 'format_id': 'hd', - }, - ] + formats = [] + baseurl = sd_url[:sd_url.rfind('/')+1] + for 
video in videolist.findall('.//video'): + src = video.get('src') + if not src: + continue + file_ = src.partition(':')[-1] + width = int_or_none(video.get('width')) + height = int_or_none(video.get('height')) + bitrate = int_or_none(video.get('system-bitrate')) + format = { + 'url': baseurl + file_, + 'format_id': src.rpartition('.')[0].rpartition('_')[-1], + } + if width or height: + format.update({ + 'tbr': bitrate // 1000 if bitrate else None, + 'width': width, + 'height': height, + }) + else: + format.update({ + 'abr': bitrate // 1000 if bitrate else None, + 'vcodec': 'none', + }) + formats.append(format) + self._sort_formats(formats) return { 'id': video_id, diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py index 43efb08..669919a 100644 --- a/youtube_dl/extractor/clipfish.py +++ b/youtube_dl/extractor/clipfish.py @@ -1,22 +1,28 @@ +from __future__ import unicode_literals + import re import time import xml.etree.ElementTree from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + parse_duration, +) class ClipfishIE(InfoExtractor): - IE_NAME = u'clipfish' + IE_NAME = 'clipfish' _VALID_URL = r'^https?://(?:www\.)?clipfish\.de/.*?/video/(?P<id>[0-9]+)/' _TEST = { - u'url': u'http://www.clipfish.de/special/game-trailer/video/3966754/fifa-14-e3-2013-trailer/', - u'file': u'3966754.mp4', - u'md5': u'2521cd644e862936cf2e698206e47385', - u'info_dict': { - u'title': u'FIFA 14 - E3 2013 Trailer', - u'duration': 82, + 'url': 'http://www.clipfish.de/special/game-trailer/video/3966754/fifa-14-e3-2013-trailer/', + 'md5': '2521cd644e862936cf2e698206e47385', + 'info_dict': { + 'id': '3966754', + 'ext': 'mp4', + 'title': 'FIFA 14 - E3 2013 Trailer', + 'duration': 82, }, u'skip': 'Blocked in the US' } @@ -33,21 +39,10 @@ class ClipfishIE(InfoExtractor): video_url = doc.find('filename').text if video_url is None: xml_bytes = xml.etree.ElementTree.tostring(doc) - raise ExtractorError(u'Cannot 
find video URL in document %r' % + raise ExtractorError('Cannot find video URL in document %r' % xml_bytes) thumbnail = doc.find('imageurl').text - duration_str = doc.find('duration').text - m = re.match( - r'^(?P<hours>[0-9]+):(?P<minutes>[0-9]{2}):(?P<seconds>[0-9]{2}):(?P<ms>[0-9]*)$', - duration_str) - if m: - duration = ( - (int(m.group('hours')) * 60 * 60) + - (int(m.group('minutes')) * 60) + - (int(m.group('seconds'))) - ) - else: - duration = None + duration = parse_duration(doc.find('duration').text) return { 'id': video_id, diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py index 9ab6a4a..02a1667 100644 --- a/youtube_dl/extractor/clipsyndicate.py +++ b/youtube_dl/extractor/clipsyndicate.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor @@ -11,13 +13,14 @@ class ClipsyndicateIE(InfoExtractor): _VALID_URL = r'http://www\.clipsyndicate\.com/video/play(list/\d+)?/(?P<id>\d+)' _TEST = { - u'url': u'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe', - u'md5': u'4d7d549451bad625e0ff3d7bd56d776c', - u'info_dict': { - u'id': u'4629301', - u'ext': u'mp4', - u'title': u'Brick Briscoe', - u'duration': 612, + 'url': 'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe', + 'md5': '4d7d549451bad625e0ff3d7bd56d776c', + 'info_dict': { + 'id': '4629301', + 'ext': 'mp4', + 'title': 'Brick Briscoe', + 'duration': 612, + 'thumbnail': 're:^https?://.+\.jpg', }, } @@ -26,13 +29,13 @@ class ClipsyndicateIE(InfoExtractor): video_id = mobj.group('id') js_player = self._download_webpage( 'http://eplayer.clipsyndicate.com/embed/player.js?va_id=%s' % video_id, - video_id, u'Downlaoding player') + video_id, 'Downlaoding player') # it includes a required token - flvars = self._search_regex(r'flvars: "(.*?)"', js_player, u'flvars') + flvars = self._search_regex(r'flvars: "(.*?)"', js_player, 'flvars') pdoc = self._download_xml( 
'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars, - video_id, u'Downloading video info', + video_id, 'Downloading video info', transform_source=fix_xml_ampersands) track_doc = pdoc.find('trackList/track') diff --git a/youtube_dl/extractor/clubic.py b/youtube_dl/extractor/clubic.py new file mode 100644 index 0000000..14f215c --- /dev/null +++ b/youtube_dl/extractor/clubic.py @@ -0,0 +1,58 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + qualities, +) + + +class ClubicIE(InfoExtractor): + _VALID_URL = r'http://(?:www\.)?clubic\.com/video/[^/]+/video.*-(?P<id>[0-9]+)\.html' + + _TEST = { + 'url': 'http://www.clubic.com/video/clubic-week/video-clubic-week-2-0-le-fbi-se-lance-dans-la-photo-d-identite-448474.html', + 'md5': '1592b694ba586036efac1776b0b43cd3', + 'info_dict': { + 'id': '448474', + 'ext': 'mp4', + 'title': 'Clubic Week 2.0 : le FBI se lance dans la photo d\u0092identité', + 'description': 're:Gueule de bois chez Nokia. 
Le constructeur a indiqué cette.*', + 'thumbnail': 're:^http://img\.clubic\.com/.*\.jpg$', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + player_url = 'http://player.m6web.fr/v1/player/clubic/%s.html' % video_id + player_page = self._download_webpage(player_url, video_id) + + config_json = self._search_regex( + r'(?m)M6\.Player\.config\s*=\s*(\{.+?\});$', player_page, + 'configuration') + config = json.loads(config_json) + + video_info = config['videoInfo'] + sources = config['sources'] + quality_order = qualities(['sd', 'hq']) + + formats = [{ + 'format_id': src['streamQuality'], + 'url': src['src'], + 'quality': quality_order(src['streamQuality']), + } for src in sources] + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_info['title'], + 'formats': formats, + 'description': clean_html(video_info.get('description')), + 'thumbnail': config.get('poster'), + } diff --git a/youtube_dl/extractor/cmt.py b/youtube_dl/extractor/cmt.py index 88e0e9a..e96c59f 100644 --- a/youtube_dl/extractor/cmt.py +++ b/youtube_dl/extractor/cmt.py @@ -1,19 +1,19 @@ +from __future__ import unicode_literals from .mtv import MTVIE + class CMTIE(MTVIE): - IE_NAME = u'cmt.com' + IE_NAME = 'cmt.com' _VALID_URL = r'https?://www\.cmt\.com/videos/.+?/(?P<videoid>[^/]+)\.jhtml' _FEED_URL = 'http://www.cmt.com/sitewide/apps/player/embed/rss/' - _TESTS = [ - { - u'url': u'http://www.cmt.com/videos/garth-brooks/989124/the-call-featuring-trisha-yearwood.jhtml#artist=30061', - u'md5': u'e6b7ef3c4c45bbfae88061799bbba6c2', - u'info_dict': { - u'id': u'989124', - u'ext': u'mp4', - u'title': u'Garth Brooks - "The Call (featuring Trisha Yearwood)"', - u'description': u'Blame It All On My Roots', - }, + _TESTS = [{ + 'url': 'http://www.cmt.com/videos/garth-brooks/989124/the-call-featuring-trisha-yearwood.jhtml#artist=30061', + 'md5': 'e6b7ef3c4c45bbfae88061799bbba6c2', + 'info_dict': { + 'id': '989124', + 'ext': 
'mp4', + 'title': 'Garth Brooks - "The Call (featuring Trisha Yearwood)"', + 'description': 'Blame It All On My Roots', }, - ] + }] diff --git a/youtube_dl/extractor/cnet.py b/youtube_dl/extractor/cnet.py new file mode 100644 index 0000000..a94f425 --- /dev/null +++ b/youtube_dl/extractor/cnet.py @@ -0,0 +1,75 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, +) + + +class CNETIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cnet\.com/videos/(?P<id>[^/]+)/' + _TEST = { + 'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/', + 'md5': '041233212a0d06b179c87cbcca1577b8', + 'info_dict': { + 'id': '56f4ea68-bd21-4852-b08c-4de5b8354c60', + 'ext': 'mp4', + 'title': 'Hands-on with Microsoft Windows 8.1 Update', + 'description': 'The new update to the Windows 8 OS brings improved performance for mouse and keyboard users.', + 'thumbnail': 're:^http://.*/flmswindows8.jpg$', + 'uploader_id': 'sarah.mitroff@cbsinteractive.com', + 'uploader': 'Sarah Mitroff', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('id') + + webpage = self._download_webpage(url, display_id) + data_json = self._html_search_regex( + r"<div class=\"cnetVideoPlayer\"\s+.*?data-cnet-video-options='([^']+)'", + webpage, 'data json') + data = json.loads(data_json) + vdata = data['video'] + if not vdata: + vdata = data['videos'][0] + if not vdata: + raise ExtractorError('Cannot find video data') + + video_id = vdata['id'] + title = vdata['headline'] + description = vdata.get('dek') + thumbnail = vdata.get('image', {}).get('path') + author = vdata.get('author') + if author: + uploader = '%s %s' % (author['firstName'], author['lastName']) + uploader_id = author.get('email') + else: + uploader = None + uploader_id = None + + formats = [{ + 'format_id': '%s-%s-%s' % ( + f['type'], f['format'], + 
int_or_none(f.get('bitrate'), 1000, default='')), + 'url': f['uri'], + 'tbr': int_or_none(f.get('bitrate'), 1000), + } for f in vdata['files']['data']] + self._sort_formats(formats) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'formats': formats, + 'description': description, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py index b32cb89..dae40c1 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -79,8 +79,11 @@ class CNNIE(InfoExtractor): self._sort_formats(formats) - thumbnails = sorted([((int(t.attrib['height']),int(t.attrib['width'])), t.text) for t in info.findall('images/image')]) - thumbs_dict = [{'resolution': res, 'url': t_url} for (res, t_url) in thumbnails] + thumbnails = [{ + 'height': int(t.attrib['height']), + 'width': int(t.attrib['width']), + 'url': t.text, + } for t in info.findall('images/image')] metas_el = info.find('metas') upload_date = ( @@ -93,8 +96,7 @@ class CNNIE(InfoExtractor): 'id': info.attrib['id'], 'title': info.find('headline').text, 'formats': formats, - 'thumbnail': thumbnails[-1][1], - 'thumbnails': thumbs_dict, + 'thumbnails': thumbnails, 'description': info.find('description').text, 'duration': duration, 'upload_date': upload_date, diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py index 10c925d..6f866e7 100644 --- a/youtube_dl/extractor/collegehumor.py +++ b/youtube_dl/extractor/collegehumor.py @@ -17,8 +17,9 @@ class CollegeHumorIE(InfoExtractor): 'id': '6902724', 'ext': 'mp4', 'title': 'Comic-Con Cosplay Catastrophe', - 'description': 'Fans get creative this year', + 'description': "Fans get creative this year at San Diego. Too creative. 
And yes, that's really Joss Whedon.", 'age_limit': 13, + 'duration': 187, }, }, { @@ -28,22 +29,22 @@ class CollegeHumorIE(InfoExtractor): 'id': '3505939', 'ext': 'mp4', 'title': 'Font Conference', - 'description': 'This video wasn\'t long enough,', + 'description': "This video wasn't long enough, so we made it double-spaced.", 'age_limit': 10, 'duration': 179, }, }, # embedded youtube video { - 'url': 'http://www.collegehumor.com/embed/6950457', + 'url': 'http://www.collegehumor.com/embed/6950306', 'info_dict': { - 'id': 'W5gMp3ZjYg4', + 'id': 'Z-bao9fg6Yc', 'ext': 'mp4', - 'title': 'Funny Dogs Protecting Babies Compilation 2014 [NEW HD]', - 'uploader': 'Funnyplox TV', - 'uploader_id': 'funnyploxtv', - 'description': 'md5:7ded37421526d54afdf005e25bc2b7a3', - 'upload_date': '20140128', + 'title': 'Young Americans Think President John F. Kennedy Died THIS MORNING IN A CAR ACCIDENT!!!', + 'uploader': 'Mark Dice', + 'uploader_id': 'MarkDice', + 'description': 'md5:62c3dab9351fac7bb44b53b69511d87f', + 'upload_date': '20140127', }, 'params': { 'skip_download': True, @@ -87,6 +88,7 @@ class CollegeHumorIE(InfoExtractor): self._sort_formats(formats) duration = int_or_none(vdata.get('duration'), 1000) + like_count = int_or_none(vdata.get('likes')) return { 'id': video_id, @@ -96,4 +98,5 @@ class CollegeHumorIE(InfoExtractor): 'formats': formats, 'age_limit': age_limit, 'duration': duration, + 'like_count': like_count, } diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index ed3986f..ba4d73a 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -7,21 +7,21 @@ from .mtv import MTVServicesInfoExtractor from ..utils import ( compat_str, compat_urllib_parse, - ExtractorError, + float_or_none, unified_strdate, ) class ComedyCentralIE(MTVServicesInfoExtractor): - _VALID_URL = r'''(?x)https?://(?:www\.)?comedycentral\.com/ + _VALID_URL = r'''(?x)https?://(?:www\.)?(comedycentral|cc)\.com/ 
(video-clips|episodes|cc-studios|video-collections) /(?P<title>.*)''' _FEED_URL = 'http://comedycentral.com/feeds/mrss/' _TEST = { 'url': 'http://www.comedycentral.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother', - 'md5': '4167875aae411f903b751a21f357f1ee', + 'md5': 'c4f48e9eda1b16dd10add0744344b6d8', 'info_dict': { 'id': 'cef0cbb3-e776-4bc9-b62e-8016deccb354', 'ext': 'mp4', @@ -32,31 +32,34 @@ class ComedyCentralIE(MTVServicesInfoExtractor): class ComedyCentralShowsIE(InfoExtractor): - IE_DESC = 'The Daily Show / Colbert Report' + IE_DESC = 'The Daily Show / The Colbert Report' # urls can be abbreviations like :thedailyshow or :colbert # urls for episodes like: # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524 - _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport) - |(https?://)?(www\.)? - (?P<showname>thedailyshow|colbertnation)\.com/ - (full-episodes/(?P<episode>.*)| + _VALID_URL = r'''(?x)^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport) + |https?://(:www\.)? 
+ (?P<showname>thedailyshow|thecolbertreport)\.(?:cc\.)?com/ + ((?:full-)?episodes/(?:[0-9a-z]{6}/)?(?P<episode>.*)| (?P<clip> - (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?)) - |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))| + (?:(?:guests/[^/]+|videos|video-playlists|special-editions)/[^/]+/(?P<videotitle>[^/?#]+)) + |(the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?)) + |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)) + )| (?P<interview> - extended-interviews/(?P<interID>[0-9]+)/playlist_tds_extended_(?P<interview_title>.*?)/.*?))) - $""" + extended-interviews/(?P<interID>[0-9a-z]+)/(?:playlist_tds_extended_)?(?P<interview_title>.*?)(/.*?)?))) + (?:[?#].*|$)''' _TEST = { - 'url': 'http://www.thedailyshow.com/watch/thu-december-13-2012/kristen-stewart', - 'file': '422212.mp4', + 'url': 'http://thedailyshow.cc.com/watch/thu-december-13-2012/kristen-stewart', 'md5': '4e2f5cb088a83cd8cdb7756132f9739d', 'info_dict': { - "upload_date": "20121214", - "description": "Kristen Stewart", - "uploader": "thedailyshow", - "title": "thedailyshow-kristen-stewart part 1" + 'id': 'ab9ab3e7-5a98-4dbe-8b21-551dc0523d55', + 'ext': 'mp4', + 'upload_date': '20121213', + 'description': 'Kristen Stewart learns to let loose in "On the Road."', + 'uploader': 'thedailyshow', + 'title': 'thedailyshow kristen-stewart part 1', } } @@ -79,11 +82,6 @@ class ComedyCentralShowsIE(InfoExtractor): '400': (384, 216), } - @classmethod - def suitable(cls, url): - """Receives a URL and returns True if suitable for this IE.""" - return re.match(cls._VALID_URL, url, re.VERBOSE) is not None - @staticmethod def _transform_rtmp_url(rtmp_video_url): m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp\.comedystor/.*)$', rtmp_video_url) @@ -99,14 +97,16 @@ class ComedyCentralShowsIE(InfoExtractor): if mobj.group('shortname'): if mobj.group('shortname') in ('tds', 'thedailyshow'): - url = 'http://www.thedailyshow.com/full-episodes/' + url = 
'http://thedailyshow.cc.com/full-episodes/' else: - url = 'http://www.colbertnation.com/full-episodes/' + url = 'http://thecolbertreport.cc.com/full-episodes/' mobj = re.match(self._VALID_URL, url, re.VERBOSE) assert mobj is not None if mobj.group('clip'): - if mobj.group('showname') == 'thedailyshow': + if mobj.group('videotitle'): + epTitle = mobj.group('videotitle') + elif mobj.group('showname') == 'thedailyshow': epTitle = mobj.group('tdstitle') else: epTitle = mobj.group('cntitle') @@ -120,9 +120,9 @@ class ComedyCentralShowsIE(InfoExtractor): epTitle = mobj.group('showname') else: epTitle = mobj.group('episode') + show_name = mobj.group('showname') - self.report_extraction(epTitle) - webpage,htmlHandle = self._download_webpage_handle(url, epTitle) + webpage, htmlHandle = self._download_webpage_handle(url, epTitle) if dlNewest: url = htmlHandle.geturl() mobj = re.match(self._VALID_URL, url, re.VERBOSE) @@ -130,71 +130,86 @@ class ComedyCentralShowsIE(InfoExtractor): raise ExtractorError('Invalid redirected URL: ' + url) if mobj.group('episode') == '': raise ExtractorError('Redirected URL is still not specific: ' + url) - epTitle = mobj.group('episode') + epTitle = mobj.group('episode').rpartition('/')[-1] mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage) - if len(mMovieParams) == 0: # The Colbert Report embeds the information in a without # a URL prefix; so extract the alternate reference # and then add the URL prefix manually. 
- altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage) + altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video|playlist).*?:.*?)"', webpage) if len(altMovieParams) == 0: raise ExtractorError('unable to find Flash URL in webpage ' + url) else: mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])] uri = mMovieParams[0][1] - indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri}) - idoc = self._download_xml(indexUrl, epTitle, - 'Downloading show index', - 'unable to download episode index') - - results = [] - - itemEls = idoc.findall('.//item') - for partNum,itemEl in enumerate(itemEls): - mediaId = itemEl.findall('./guid')[0].text - shortMediaId = mediaId.split(':')[-1] - showId = mediaId.split(':')[-2].replace('.com', '') - officialTitle = itemEl.findall('./title')[0].text - officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text) - - configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' 
+ - compat_urllib_parse.urlencode({'uri': mediaId})) - cdoc = self._download_xml(configUrl, epTitle, - 'Downloading configuration for %s' % shortMediaId) + # Correct cc.com in uri + uri = re.sub(r'(episode:[^.]+)(\.cc)?\.com', r'\1.cc.com', uri) + + index_url = 'http://%s.cc.com/feeds/mrss?%s' % (show_name, compat_urllib_parse.urlencode({'uri': uri})) + idoc = self._download_xml( + index_url, epTitle, + 'Downloading show index', 'Unable to download episode index') + + title = idoc.find('./channel/title').text + description = idoc.find('./channel/description').text + + entries = [] + item_els = idoc.findall('.//item') + for part_num, itemEl in enumerate(item_els): + upload_date = unified_strdate(itemEl.findall('./pubDate')[0].text) + thumbnail = itemEl.find('.//{http://search.yahoo.com/mrss/}thumbnail').attrib.get('url') + + content = itemEl.find('.//{http://search.yahoo.com/mrss/}content') + duration = float_or_none(content.attrib.get('duration')) + mediagen_url = content.attrib['url'] + guid = itemEl.find('./guid').text.rpartition(':')[-1] + + cdoc = self._download_xml( + mediagen_url, epTitle, + 'Downloading configuration for segment %d / %d' % (part_num + 1, len(item_els))) turls = [] for rendition in cdoc.findall('.//rendition'): finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text) turls.append(finfo) - if len(turls) == 0: - self._downloader.report_error('unable to download ' + mediaId + ': No videos found') - continue - formats = [] for format, rtmp_video_url in turls: w, h = self._video_dimensions.get(format, (None, None)) formats.append({ + 'format_id': 'vhttp-%s' % format, 'url': self._transform_rtmp_url(rtmp_video_url), 'ext': self._video_extensions.get(format, 'mp4'), - 'format_id': format, 'height': h, 'width': w, }) + formats.append({ + 'format_id': 'rtmp-%s' % format, + 'url': rtmp_video_url.replace('viacomccstrm', 'viacommtvstrm'), + 'ext': self._video_extensions.get(format, 'mp4'), + 'height': h, + 'width': w, + }) + 
self._sort_formats(formats) - effTitle = showId + '-' + epTitle + ' part ' + compat_str(partNum+1) - results.append({ - 'id': shortMediaId, + virtual_id = show_name + ' ' + epTitle + ' part ' + compat_str(part_num + 1) + entries.append({ + 'id': guid, + 'title': virtual_id, 'formats': formats, - 'uploader': showId, - 'upload_date': officialDate, - 'title': effTitle, - 'thumbnail': None, - 'description': compat_str(officialTitle), + 'uploader': show_name, + 'upload_date': upload_date, + 'duration': duration, + 'thumbnail': thumbnail, + 'description': description, }) - return results + return { + '_type': 'playlist', + 'entries': entries, + 'title': show_name + ' ' + title, + 'description': description, + } diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 84fca8b..49e7540 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -74,7 +74,7 @@ class InfoExtractor(object): "http", "https", "rtsp", "rtmp", "m3u8" or so. * preference Order number of this format. If this field is present and not None, the formats get sorted - by this field. + by this field, regardless of all other values. -1 for default (order by other properties), -2 or smaller for less than default. * quality Order number of the video quality of this @@ -88,12 +88,22 @@ class InfoExtractor(object): The following fields are optional: - thumbnails: A list of dictionaries (with the entries "resolution" and - "url") for the varying thumbnails + display_id An alternative identifier for the video, not necessarily + unique, but available before title. Typically, id is + something like "4234987", title "Dancing naked mole rats", + and display_id "dancing-naked-mole-rats" + thumbnails: A list of dictionaries, with the following entries: + * "url" + * "width" (optional, int) + * "height" (optional, int) + * "resolution" (optional, string "{width}x{height"}, + deprecated) thumbnail: Full URL to a video thumbnail image. 
description: One-line video description. uploader: Full name of the video uploader. + timestamp: UNIX timestamp of the moment the video became available. upload_date: Video upload date (YYYYMMDD). + If not explicitly set, calculated from timestamp. uploader_id: Nickname or id of the video uploader. location: Physical location of the video. subtitles: The subtitle file contents as a dictionary in the format @@ -107,6 +117,8 @@ class InfoExtractor(object): webpage_url: The url to the video webpage, if given to youtube-dl it should allow to get the same result again. (It will be set by YoutubeDL if it's missing) + categories: A list of categories that the video falls in, for example + ["Sports", "Berlin"] Unless mentioned otherwise, the fields should be Unicode strings. @@ -114,9 +126,6 @@ class InfoExtractor(object): _real_extract() methods and define a _VALID_URL regexp. Probably, they should also be added to the list of extractors. - _real_extract() must return a *list* of information dictionaries as - described above. - Finally, the _WORKING attribute should be set to False for broken IEs in order to warn the users and skip the tests. 
""" @@ -239,16 +248,31 @@ class InfoExtractor(object): url = url_or_request.get_full_url() except AttributeError: url = url_or_request - if len(url) > 200: - h = u'___' + hashlib.md5(url.encode('utf-8')).hexdigest() - url = url[:200 - len(h)] + h - raw_filename = ('%s_%s.dump' % (video_id, url)) + basen = '%s_%s' % (video_id, url) + if len(basen) > 240: + h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest() + basen = basen[:240 - len(h)] + h + raw_filename = basen + '.dump' filename = sanitize_filename(raw_filename, restricted=True) self.to_screen(u'Saving request to ' + filename) with open(filename, 'wb') as outf: outf.write(webpage_bytes) - content = webpage_bytes.decode(encoding, 'replace') + try: + content = webpage_bytes.decode(encoding, 'replace') + except LookupError: + content = webpage_bytes.decode('utf-8', 'replace') + + if (u'<title>Access to this site is blocked' in content and + u'Websense' in content[:512]): + msg = u'Access to this webpage has been blocked by Websense filtering software in your network.' 
+ blocked_iframe = self._html_search_regex( + r'', start_page, 'xml root', None, False) + + if xml_root is None: + # Probably need to authenticate + start_page = self._login(webpage_url, video_id) + if start_page is None: + self.report_warning('Could not login.') + else: + # Grab the url from the authenticated page + xml_root = self._html_search_regex(r'', start_page, 'xml filename', None, False) + if xml_name is None: + # Fallback to the older format + xml_name = self._html_search_regex(r'', page) - - if not mobj: - raise ExtractorError('No media found') - - video_type = mobj.group('type') video_id = mobj.group('id') + page = self._download_webpage('http://www.vesti.ru/only_video.html?vid=%s' % video_id, video_id, + 'Downloading video page') - json_data = self._download_json( - 'http://player.rutv.ru/iframe/%splay/id/%s' % ('live-' if video_type == 'live' else '', video_id), - video_id, 'Downloading JSON') - - if json_data['errors']: - raise ExtractorError('vesti returned error: %s' % json_data['errors'], expected=True) - - playlist = json_data['data']['playlist'] - medialist = playlist['medialist'] - media = medialist[0] - - if media['errors']: - raise ExtractorError('vesti returned error: %s' % media['errors'], expected=True) - - view_count = playlist.get('count_views') - priority_transport = playlist['priority_transport'] - - thumbnail = media['picture'] - width = media['width'] - height = media['height'] - description = media['anons'] - title = media['title'] - duration = int_or_none(media.get('duration')) - - formats = [] - - for transport, links in media['sources'].items(): - for quality, url in links.items(): - if transport == 'rtmp': - mobj = re.search(r'^(?Prtmp://[^/]+/(?P.+))/(?P.+)$', url) - if not mobj: - continue - fmt = { - 'url': mobj.group('url'), - 'play_path': mobj.group('playpath'), - 'app': mobj.group('app'), - 'page_url': 'http://player.rutv.ru', - 'player_url': 'http://player.rutv.ru/flash2v/osmf.swf?i=22', - 'rtmp_live': True, - 'ext': 
'flv', - 'vbr': int(quality), - } - elif transport == 'm3u8': - fmt = { - 'url': url, - 'ext': 'mp4', - } - else: - fmt = { - 'url': url - } - fmt.update({ - 'width': width, - 'height': height, - 'format_id': '%s-%s' % (transport, quality), - 'preference': -1 if priority_transport == transport else -2, - }) - formats.append(fmt) - - if not formats: - raise ExtractorError('No media links available for %s' % video_id) - - self._sort_formats(formats) + rutv_url = RUTVIE._extract_url(page) + if rutv_url: + return self.url_result(rutv_url, 'RUTV') - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'view_count': view_count, - 'duration': duration, - 'formats': formats, - } \ No newline at end of file + raise ExtractorError('No video found', expected=True) \ No newline at end of file diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index e458ac9..eada13c 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals import re import xml.etree.ElementTree -import datetime from .common import InfoExtractor from ..utils import ( @@ -17,22 +16,55 @@ class VevoIE(InfoExtractor): (currently used by MTVIE) """ _VALID_URL = r'''(?x) - (?:https?://www\.vevo\.com/watch/(?:[^/]+/[^/]+/)?| + (?:https?://www\.vevo\.com/watch/(?:[^/]+/(?:[^/]+/)?)?| https?://cache\.vevo\.com/m/html/embed\.html\?video=| https?://videoplayer\.vevo\.com/embed/embedded\?videoId=| vevo:) (?P[^&?#]+)''' + _TESTS = [{ 'url': 'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280', - 'file': 'GB1101300280.mp4', "md5": "06bea460acb744eab74a9d7dcb4bfd61", 'info_dict': { + 'id': 'GB1101300280', + 'ext': 'mp4', "upload_date": "20130624", "uploader": "Hurts", "title": "Somebody to Die For", "duration": 230.12, "width": 1920, "height": 1080, + # timestamp and upload_date are often incorrect; seem to change randomly + 'timestamp': int, + } + }, { + 'note': 'v3 
SMIL format', + 'url': 'http://www.vevo.com/watch/cassadee-pope/i-wish-i-could-break-your-heart/USUV71302923', + 'md5': '893ec0e0d4426a1d96c01de8f2bdff58', + 'info_dict': { + 'id': 'USUV71302923', + 'ext': 'mp4', + 'upload_date': '20140219', + 'uploader': 'Cassadee Pope', + 'title': 'I Wish I Could Break Your Heart', + 'duration': 226.101, + 'age_limit': 0, + 'timestamp': int, + } + }, { + 'note': 'Age-limited video', + 'url': 'https://www.vevo.com/watch/justin-timberlake/tunnel-vision-explicit/USRV81300282', + 'info_dict': { + 'id': 'USRV81300282', + 'ext': 'mp4', + 'age_limit': 18, + 'title': 'Tunnel Vision (Explicit)', + 'uploader': 'Justin Timberlake', + 'upload_date': 're:2013070[34]', + 'timestamp': int, + }, + 'params': { + 'skip_download': 'true', } }] _SMIL_BASE_URL = 'http://smil.lvl3.vevo.com/' @@ -102,12 +134,40 @@ class VevoIE(InfoExtractor): video_id = mobj.group('id') json_url = 'http://videoplayer.vevo.com/VideoService/AuthenticateVideo?isrc=%s' % video_id - video_info = self._download_json(json_url, video_id)['video'] + response = self._download_json(json_url, video_id) + video_info = response['video'] + + if not video_info: + if 'statusMessage' in response: + raise ExtractorError('%s said: %s' % (self.IE_NAME, response['statusMessage']), expected=True) + raise ExtractorError('Unable to extract videos') formats = self._formats_from_json(video_info) + + is_explicit = video_info.get('isExplicit') + if is_explicit is True: + age_limit = 18 + elif is_explicit is False: + age_limit = 0 + else: + age_limit = None + + # Download SMIL + smil_blocks = sorted(( + f for f in video_info['videoVersions'] + if f['sourceType'] == 13), + key=lambda f: f['version']) + + smil_url = '%s/Video/V2/VFILE/%s/%sr.smil' % ( + self._SMIL_BASE_URL, video_id, video_id.lower()) + if smil_blocks: + smil_url_m = self._search_regex( + r'url="([^"]+)"', smil_blocks[-1]['data'], 'SMIL URL', + fatal=False) + if smil_url_m is not None: + smil_url = smil_url_m + try: - smil_url = 
'%s/Video/V2/VFILE/%s/%sr.smil' % ( - self._SMIL_BASE_URL, video_id, video_id.lower()) smil_xml = self._download_webpage(smil_url, video_id, 'Downloading SMIL info') formats.extend(self._formats_from_smil(smil_xml)) @@ -119,13 +179,14 @@ class VevoIE(InfoExtractor): timestamp_ms = int(self._search_regex( r'/Date\((\d+)\)/', video_info['launchDate'], 'launch date')) - upload_date = datetime.datetime.fromtimestamp(timestamp_ms // 1000) + return { 'id': video_id, 'title': video_info['title'], 'formats': formats, 'thumbnail': video_info['imageUrl'], - 'upload_date': upload_date.strftime('%Y%m%d'), + 'timestamp': timestamp_ms // 1000, 'uploader': video_info['mainArtists'][0]['artistName'], 'duration': video_info['duration'], + 'age_limit': age_limit, } diff --git a/youtube_dl/extractor/vh1.py b/youtube_dl/extractor/vh1.py new file mode 100644 index 0000000..2f77e38 --- /dev/null +++ b/youtube_dl/extractor/vh1.py @@ -0,0 +1,124 @@ +from __future__ import unicode_literals + +from .mtv import MTVIE + +import re +from ..utils import fix_xml_ampersands + + +class VH1IE(MTVIE): + IE_NAME = 'vh1.com' + _FEED_URL = 'http://www.vh1.com/player/embed/AS3/fullepisode/rss/' + _TESTS = [{ + 'url': 'http://www.vh1.com/video/metal-evolution/full-episodes/progressive-metal/1678612/playlist.jhtml', + 'playlist': [ + { + 'md5': '7827a7505f59633983165bbd2c119b52', + 'info_dict': { + 'id': '731565', + 'ext': 'mp4', + 'title': 'Metal Evolution: Ep. 11 Act 1', + 'description': 'Many rock academics have proclaimed that the truly progressive musicianship of the last 20 years has been found right here in the world of heavy metal, rather than obvious locales such as jazz, fusion or progressive rock. It stands to reason then, that much of this jaw-dropping virtuosity occurs within what\'s known as progressive metal, a genre that takes root with the likes of Rush in the \'70s, Queensryche and Fates Warning in the \'80s, and Dream Theater in the \'90s. 
Since then, the genre has exploded with creativity, spawning mind-bending, genre-defying acts such as Tool, Mastodon, Coheed And Cambria, Porcupine Tree, Meshuggah, A Perfect Circle and Opeth. Episode 12 looks at the extreme musicianship of these bands, as well as their often extreme literary prowess and conceptual strength, the end result being a rich level of respect and attention such challenging acts have brought upon the world of heavy metal, from a critical community usually dismissive of the form.' + } + }, + { + 'md5': '34fb4b7321c546b54deda2102a61821f', + 'info_dict': { + 'id': '731567', + 'ext': 'mp4', + 'title': 'Metal Evolution: Ep. 11 Act 2', + 'description': 'Many rock academics have proclaimed that the truly progressive musicianship of the last 20 years has been found right here in the world of heavy metal, rather than obvious locales such as jazz, fusion or progressive rock. It stands to reason then, that much of this jaw-dropping virtuosity occurs within what\'s known as progressive metal, a genre that takes root with the likes of Rush in the \'70s, Queensryche and Fates Warning in the \'80s, and Dream Theater in the \'90s. Since then, the genre has exploded with creativity, spawning mind-bending, genre-defying acts such as Tool, Mastodon, Coheed And Cambria, Porcupine Tree, Meshuggah, A Perfect Circle and Opeth. Episode 11 looks at the extreme musicianship of these bands, as well as their often extreme literary prowess and conceptual strength, the end result being a rich level of respect and attention such challenging acts have brought upon the world of heavy metal, from a critical community usually dismissive of the form.' + } + }, + { + 'md5': '813f38dba4c1b8647196135ebbf7e048', + 'info_dict': { + 'id': '731568', + 'ext': 'mp4', + 'title': 'Metal Evolution: Ep. 
11 Act 3', + 'description': 'Many rock academics have proclaimed that the truly progressive musicianship of the last 20 years has been found right here in the world of heavy metal, rather than obvious locales such as jazz, fusion or progressive rock. It stands to reason then, that much of this jaw-dropping virtuosity occurs within what\'s known as progressive metal, a genre that takes root with the likes of Rush in the \'70s, Queensryche and Fates Warning in the \'80s, and Dream Theater in the \'90s. Since then, the genre has exploded with creativity, spawning mind-bending, genre-defying acts such as Tool, Mastodon, Coheed And Cambria, Porcupine Tree, Meshuggah, A Perfect Circle and Opeth. Episode 11 looks at the extreme musicianship of these bands, as well as their often extreme literary prowess and conceptual strength, the end result being a rich level of respect and attention such challenging acts have brought upon the world of heavy metal, from a critical community usually dismissive of the form.' + } + }, + { + 'md5': '51adb72439dfaed11c799115d76e497f', + 'info_dict': { + 'id': '731569', + 'ext': 'mp4', + 'title': 'Metal Evolution: Ep. 11 Act 4', + 'description': 'Many rock academics have proclaimed that the truly progressive musicianship of the last 20 years has been found right here in the world of heavy metal, rather than obvious locales such as jazz, fusion or progressive rock. It stands to reason then, that much of this jaw-dropping virtuosity occurs within what\'s known as progressive metal, a genre that takes root with the likes of Rush in the \'70s, Queensryche and Fates Warning in the \'80s, and Dream Theater in the \'90s. Since then, the genre has exploded with creativity, spawning mind-bending, genre-defying acts such as Tool, Mastodon, Coheed And Cambria, Porcupine Tree, Meshuggah, A Perfect Circle and Opeth. 
Episode 11 looks at the extreme musicianship of these bands, as well as their often extreme literary prowess and conceptual strength, the end result being a rich level of respect and attention such challenging acts have brought upon the world of heavy metal, from a critical community usually dismissive of the form.' + } + }, + { + 'md5': '93d554aaf79320703b73a95288c76a6e', + 'info_dict': { + 'id': '731570', + 'ext': 'mp4', + 'title': 'Metal Evolution: Ep. 11 Act 5', + 'description': 'Many rock academics have proclaimed that the truly progressive musicianship of the last 20 years has been found right here in the world of heavy metal, rather than obvious locales such as jazz, fusion or progressive rock. It stands to reason then, that much of this jaw-dropping virtuosity occurs within what\'s known as progressive metal, a genre that takes root with the likes of Rush in the \'70s, Queensryche and Fates Warning in the \'80s, and Dream Theater in the \'90s. Since then, the genre has exploded with creativity, spawning mind-bending, genre-defying acts such as Tool, Mastodon, Coheed And Cambria, Porcupine Tree, Meshuggah, A Perfect Circle and Opeth. Episode 11 looks at the extreme musicianship of these bands, as well as their often extreme literary prowess and conceptual strength, the end result being a rich level of respect and attention such challenging acts have brought upon the world of heavy metal, from a critical community usually dismissive of the form.' + } + } + ], + 'skip': 'Blocked outside the US', + }, { + # Clip + 'url': 'http://www.vh1.com/video/misc/706675/metal-evolution-episode-1-pre-metal-show-clip.jhtml#id=1674118', + 'md5': '7d67cf6d9cdc6b4f3d3ac97a55403844', + 'info_dict': { + 'id': '706675', + 'ext': 'mp4', + 'title': 'Metal Evolution: Episode 1 Pre-Metal Show Clip', + 'description': 'The greatest documentary ever made about Heavy Metal begins as our host Sam Dunn travels the globe to seek out the origins and influences that helped create Heavy Metal. 
Sam speaks to legends like Kirk Hammett, Alice Cooper, Slash, Bill Ward, Geezer Butler, Tom Morello, Ace Frehley, Lemmy Kilmister, Dave Davies, and many many more. This episode is the prologue for the 11 hour series, and Sam goes back to the very beginning to reveal how Heavy Metal was created.' + }, + 'skip': 'Blocked outside the US', + }, { + # Short link + 'url': 'http://www.vh1.com/video/play.jhtml?id=1678353', + 'md5': '853192b87ad978732b67dd8e549b266a', + 'info_dict': { + 'id': '730355', + 'ext': 'mp4', + 'title': 'Metal Evolution: Episode 11 Progressive Metal Sneak', + 'description': 'In Metal Evolution\'s finale sneak, Sam sits with Michael Giles of King Crimson and gets feedback from Metallica guitarist Kirk Hammett on why the group was influential.' + }, + 'skip': 'Blocked outside the US', + }, { + 'url': 'http://www.vh1.com/video/macklemore-ryan-lewis/900535/cant-hold-us-ft-ray-dalton.jhtml', + 'md5': 'b1bcb5b4380c9d7f544065589432dee7', + 'info_dict': { + 'id': '900535', + 'ext': 'mp4', + 'title': 'Macklemore & Ryan Lewis - "Can\'t Hold Us ft. Ray Dalton"', + 'description': 'The Heist' + }, + 'skip': 'Blocked outside the US', + }] + + _VALID_URL = r'''(?x) + https?://www\.vh1\.com/video/ + (?: + .+?/full-episodes/.+?/(?P[^/]+)/playlist\.jhtml + | + (?: + play.jhtml\?id=| + misc/.+?/.+?\.jhtml\#id= + ) + (?P[0-9]+)$ + | + [^/]+/(?P[0-9]+)/[^/]+? 
+ ) + ''' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj.group('music_id'): + id_field = 'vid' + video_id = mobj.group('music_id') + else: + video_id = mobj.group('playlist_id') or mobj.group('video_id') + id_field = 'id' + doc_url = '%s?%s=%s' % (self._FEED_URL, id_field, video_id) + + idoc = self._download_xml( + doc_url, video_id, + 'Downloading info', transform_source=fix_xml_ampersands) + return [self._get_video_info(item) for item in idoc.findall('.//item')] diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py deleted file mode 100644 index 87812d6..0000000 --- a/youtube_dl/extractor/vice.py +++ /dev/null @@ -1,38 +0,0 @@ -import re - -from .common import InfoExtractor -from .ooyala import OoyalaIE -from ..utils import ExtractorError - - -class ViceIE(InfoExtractor): - _VALID_URL = r'http://www\.vice\.com/.*?/(?P.+)' - - _TEST = { - u'url': u'http://www.vice.com/Fringes/cowboy-capitalists-part-1', - u'file': u'43cW1mYzpia9IlestBjVpd23Yu3afAfp.mp4', - u'info_dict': { - u'title': u'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov', - }, - u'params': { - # Requires ffmpeg (m3u8 manifest) - u'skip_download': True, - }, - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - name = mobj.group('name') - webpage = self._download_webpage(url, name) - try: - ooyala_url = self._og_search_video_url(webpage) - except ExtractorError: - try: - embed_code = self._search_regex( - r'OO.Player.create\(\'ooyalaplayer\', \'(.+?)\'', webpage, - u'ooyala embed code') - ooyala_url = OoyalaIE._url_for_embed_code(embed_code) - except ExtractorError: - raise ExtractorError(u'The page doesn\'t contain a video', expected=True) - return self.url_result(ooyala_url, ie='Ooyala') - diff --git a/youtube_dl/extractor/videobam.py b/youtube_dl/extractor/videobam.py new file mode 100644 index 0000000..fed95ef --- /dev/null +++ b/youtube_dl/extractor/videobam.py @@ -0,0 +1,81 @@ +from __future__ import 
unicode_literals + +import re +import json + +from .common import InfoExtractor +from ..utils import int_or_none + + +class VideoBamIE(InfoExtractor): + _VALID_URL = r'http://(?:www\.)?videobam\.com/(?:videos/download/)?(?P[a-zA-Z]+)' + + _TESTS = [ + { + 'url': 'http://videobam.com/OiJQM', + 'md5': 'db471f27763a531f10416a0c58b5a1e0', + 'info_dict': { + 'id': 'OiJQM', + 'ext': 'mp4', + 'title': 'Is Alcohol Worse Than Ecstasy?', + 'description': 'md5:d25b96151515c91debc42bfbb3eb2683', + 'uploader': 'frihetsvinge', + }, + }, + { + 'url': 'http://videobam.com/pqLvq', + 'md5': 'd9a565b5379a99126ef94e1d7f9a383e', + 'note': 'HD video', + 'info_dict': { + 'id': 'pqLvq', + 'ext': 'mp4', + 'title': '_', + } + }, + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + page = self._download_webpage('http://videobam.com/%s' % video_id, video_id, 'Downloading page') + + formats = [] + + for preference, format_id in enumerate(['low', 'high']): + mobj = re.search(r"%s: '(?P[^']+)'" % format_id, page) + if not mobj: + continue + formats.append({ + 'url': mobj.group('url'), + 'ext': 'mp4', + 'format_id': format_id, + 'preference': preference, + }) + + if not formats: + player_config = json.loads(self._html_search_regex(r'var player_config = ({.+?});', page, 'player config')) + formats = [{ + 'url': item['url'], + 'ext': 'mp4', + } for item in player_config['playlist'] if 'autoPlay' in item] + + self._sort_formats(formats) + + title = self._og_search_title(page, default='_', fatal=False) + description = self._og_search_description(page, default=None) + thumbnail = self._og_search_thumbnail(page) + uploader = self._html_search_regex(r'Upload by ([^<]+)
    ', page, 'uploader', fatal=False, default=None) + view_count = int_or_none( + self._html_search_regex(r'Views: (\d+) ', page, 'view count', fatal=False)) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'view_count': view_count, + 'formats': formats, + 'age_limit': 18, + } \ No newline at end of file diff --git a/youtube_dl/extractor/videodetective.py b/youtube_dl/extractor/videodetective.py index 265dd5b..ac6c255 100644 --- a/youtube_dl/extractor/videodetective.py +++ b/youtube_dl/extractor/videodetective.py @@ -1,22 +1,23 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor from .internetvideoarchive import InternetVideoArchiveIE -from ..utils import ( - compat_urlparse, -) +from ..utils import compat_urlparse class VideoDetectiveIE(InfoExtractor): _VALID_URL = r'https?://www\.videodetective\.com/[^/]+/[^/]+/(?P\d+)' _TEST = { - u'url': u'http://www.videodetective.com/movies/kick-ass-2/194487', - u'file': u'194487.mp4', - u'info_dict': { - u'title': u'KICK-ASS 2', - u'description': u'md5:65ba37ad619165afac7d432eaded6013', - u'duration': 135, + 'url': 'http://www.videodetective.com/movies/kick-ass-2/194487', + 'info_dict': { + 'id': '194487', + 'ext': 'mp4', + 'title': 'KICK-ASS 2', + 'description': 'md5:65ba37ad619165afac7d432eaded6013', + 'duration': 135, }, } @@ -26,5 +27,4 @@ class VideoDetectiveIE(InfoExtractor): webpage = self._download_webpage(url, video_id) og_video = self._og_search_video_url(webpage) query = compat_urlparse.urlparse(og_video).query - return self.url_result(InternetVideoArchiveIE._build_url(query), - ie=InternetVideoArchiveIE.ie_key()) + return self.url_result(InternetVideoArchiveIE._build_url(query), ie=InternetVideoArchiveIE.ie_key()) diff --git a/youtube_dl/extractor/videolecturesnet.py b/youtube_dl/extractor/videolecturesnet.py new file mode 100644 index 0000000..ebd2a3d --- /dev/null +++ 
b/youtube_dl/extractor/videolecturesnet.py @@ -0,0 +1,70 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + find_xpath_attr, + int_or_none, + parse_duration, + unified_strdate, +) + + +class VideoLecturesNetIE(InfoExtractor): + _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P[^/#?]+)/' + IE_NAME = 'videolectures.net' + + _TEST = { + 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/', + 'info_dict': { + 'id': 'promogram_igor_mekjavic_eng', + 'ext': 'mp4', + 'title': 'Automatics, robotics and biocybernetics', + 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', + 'upload_date': '20130627', + 'duration': 565, + 'thumbnail': 're:http://.*\.jpg', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + smil_url = 'http://videolectures.net/%s/video/1/smil.xml' % video_id + smil = self._download_xml(smil_url, video_id) + + title = find_xpath_attr(smil, './/meta', 'name', 'title').attrib['content'] + description_el = find_xpath_attr(smil, './/meta', 'name', 'abstract') + description = ( + None if description_el is None + else description_el.attrib['content']) + upload_date = unified_strdate( + find_xpath_attr(smil, './/meta', 'name', 'date').attrib['content']) + + switch = smil.find('.//switch') + duration = parse_duration(switch.attrib.get('dur')) + thumbnail_el = find_xpath_attr(switch, './image', 'type', 'thumbnail') + thumbnail = ( + None if thumbnail_el is None else thumbnail_el.attrib.get('src')) + + formats = [{ + 'url': v.attrib['src'], + 'width': int_or_none(v.attrib.get('width')), + 'height': int_or_none(v.attrib.get('height')), + 'filesize': int_or_none(v.attrib.get('size')), + 'tbr': int_or_none(v.attrib.get('systemBitrate')) / 1000.0, + 'ext': v.attrib.get('ext'), + } for v in switch.findall('./video') + if v.attrib.get('proto') == 'http'] + + return { + 'id': video_id, + 'title': title, + 'description': 
description, + 'upload_date': upload_date, + 'duration': duration, + 'thumbnail': thumbnail, + 'formats': formats, + } diff --git a/youtube_dl/extractor/videott.py b/youtube_dl/extractor/videott.py new file mode 100644 index 0000000..b5034b0 --- /dev/null +++ b/youtube_dl/extractor/videott.py @@ -0,0 +1,58 @@ +from __future__ import unicode_literals + +import re +import base64 + +from .common import InfoExtractor +from ..utils import unified_strdate + + +class VideoTtIE(InfoExtractor): + ID_NAME = 'video.tt' + IE_DESC = 'video.tt - Your True Tube' + _VALID_URL = r'http://(?:www\.)?video\.tt/(?:video/|watch_video\.php\?v=)(?P[\da-zA-Z]{9})' + + _TEST = { + 'url': 'http://www.video.tt/watch_video.php?v=amd5YujV8', + 'md5': 'b13aa9e2f267effb5d1094443dff65ba', + 'info_dict': { + 'id': 'amd5YujV8', + 'ext': 'flv', + 'title': 'Motivational video Change your mind in just 2.50 mins', + 'description': '', + 'upload_date': '20130827', + 'uploader': 'joseph313', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + settings = self._download_json( + 'http://www.video.tt/player_control/settings.php?v=%s' % video_id, video_id, + 'Downloading video JSON')['settings'] + + video = settings['video_details']['video'] + + formats = [ + { + 'url': base64.b64decode(res['u']).decode('utf-8'), + 'ext': 'flv', + 'format_id': res['l'], + } for res in settings['res'] if res['u'] + ] + + return { + 'id': video_id, + 'title': video['title'], + 'description': video['description'], + 'thumbnail': settings['config']['thumbnail'], + 'upload_date': unified_strdate(video['added']), + 'uploader': video['owner'], + 'view_count': int(video['view_count']), + 'comment_count': int(video['comment_count']), + 'like_count': int(video['liked']), + 'dislike_count': int(video['disliked']), + 'formats': formats, + } \ No newline at end of file diff --git a/youtube_dl/extractor/videoweed.py b/youtube_dl/extractor/videoweed.py new file mode 100644 index 
0000000..4a08ddd --- /dev/null +++ b/youtube_dl/extractor/videoweed.py @@ -0,0 +1,26 @@ +from __future__ import unicode_literals + +from .novamov import NovaMovIE + + +class VideoWeedIE(NovaMovIE): + IE_NAME = 'videoweed' + IE_DESC = 'VideoWeed' + + _VALID_URL = NovaMovIE._VALID_URL_TEMPLATE % {'host': 'videoweed\.(?:es|com)'} + + _HOST = 'www.videoweed.es' + + _FILE_DELETED_REGEX = r'>This file no longer exists on our servers.<' + _TITLE_REGEX = r'

    ([^<]+)

    ' + + _TEST = { + 'url': 'http://www.videoweed.es/file/b42178afbea14', + 'md5': 'abd31a2132947262c50429e1d16c1bfd', + 'info_dict': { + 'id': 'b42178afbea14', + 'ext': 'flv', + 'title': 'optical illusion dissapeared image magic illusion', + 'description': '' + }, + } \ No newline at end of file diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 2206a06..15f3152 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -1,29 +1,33 @@ +from __future__ import unicode_literals + import re from ..utils import ( ExtractorError, unescapeHTML, unified_strdate, + US_RATINGS, ) from .subtitles import SubtitlesInfoExtractor class VikiIE(SubtitlesInfoExtractor): - IE_NAME = u'viki' + IE_NAME = 'viki' _VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P[0-9]+v)' _TEST = { - u'url': u'http://www.viki.com/videos/1023585v-heirs-episode-14', - u'file': u'1023585v.mp4', - u'md5': u'a21454021c2646f5433514177e2caa5f', - u'info_dict': { - u'title': u'Heirs Episode 14', - u'uploader': u'SBS', - u'description': u'md5:c4b17b9626dd4b143dcc4d855ba3474e', - u'upload_date': u'20131121', - u'age_limit': 13, + 'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14', + 'md5': 'a21454021c2646f5433514177e2caa5f', + 'info_dict': { + 'id': '1023585v', + 'ext': 'mp4', + 'title': 'Heirs Episode 14', + 'uploader': 'SBS', + 'description': 'md5:c4b17b9626dd4b143dcc4d855ba3474e', + 'upload_date': '20131121', + 'age_limit': 13, }, - u'skip': u'Blocked in the US', + 'skip': 'Blocked in the US', } def _real_extract(self, url): @@ -44,28 +48,21 @@ class VikiIE(SubtitlesInfoExtractor): rating_str = self._html_search_regex( r'Rating: \s*([^<]*)<', webpage, - u'rating information', default='').strip() - RATINGS = { - 'G': 0, - 'PG': 10, - 'PG-13': 13, - 'R': 16, - 'NC': 18, - } - age_limit = RATINGS.get(rating_str) + 'rating information', default='').strip() + age_limit = US_RATINGS.get(rating_str) info_url = 
'http://www.viki.com/player5_fragment/%s?action=show&controller=videos' % video_id info_webpage = self._download_webpage( - info_url, video_id, note=u'Downloading info page') + info_url, video_id, note='Downloading info page') if re.match(r'\s*]+src="([^"]+)"', info_webpage, u'video URL') + r']+src="([^"]+)"', info_webpage, 'video URL') upload_date_str = self._html_search_regex( - r'"created_at":"([^"]+)"', info_webpage, u'upload date') + r'"created_at":"([^"]+)"', info_webpage, 'upload date') upload_date = ( unified_strdate(upload_date_str) if upload_date_str is not None diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 4bc2620..2558555 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -8,6 +8,7 @@ import itertools from .common import InfoExtractor from .subtitles import SubtitlesInfoExtractor from ..utils import ( + compat_HTTPError, compat_urllib_parse, compat_urllib_request, clean_html, @@ -16,10 +17,39 @@ from ..utils import ( RegexNotFoundError, std_headers, unsmuggle_url, + urlencode_postdata, + int_or_none, ) -class VimeoIE(SubtitlesInfoExtractor): +class VimeoBaseInfoExtractor(InfoExtractor): + _NETRC_MACHINE = 'vimeo' + _LOGIN_REQUIRED = False + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + if self._LOGIN_REQUIRED: + raise ExtractorError('No login info available, needed for using %s.' 
% self.IE_NAME, expected=True) + return + self.report_login() + login_url = 'https://vimeo.com/log_in' + webpage = self._download_webpage(login_url, None, False) + token = self._search_regex(r'xsrft: \'(.*?)\'', webpage, 'login token') + data = urlencode_postdata({ + 'email': username, + 'password': password, + 'action': 'login', + 'service': 'vimeo', + 'token': token, + }) + login_request = compat_urllib_request.Request(login_url, data) + login_request.add_header('Content-Type', 'application/x-www-form-urlencoded') + login_request.add_header('Cookie', 'xsrft=%s' % token) + self._download_webpage(login_request, None, False, 'Wrong login info') + + +class VimeoIE(VimeoBaseInfoExtractor, SubtitlesInfoExtractor): """Information extractor for vimeo.com.""" # _VALID_URL matches Vimeo URLs @@ -32,53 +62,60 @@ class VimeoIE(SubtitlesInfoExtractor): (?:videos?/)? (?P[0-9]+) /?(?:[?&].*)?(?:[#].*)?$''' - _NETRC_MACHINE = 'vimeo' IE_NAME = 'vimeo' _TESTS = [ { 'url': 'http://vimeo.com/56015672#at=0', - 'file': '56015672.mp4', 'md5': '8879b6cc097e987f02484baf890129e5', 'info_dict': { - "upload_date": "20121220", - "description": "This is a test case for youtube-dl.\nFor more information, see github.com/rg3/youtube-dl\nTest chars: \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", - "uploader_id": "user7108434", - "uploader": "Filippo Valsorda", + 'id': '56015672', + 'ext': 'mp4', + "upload_date": "20121220", + "description": "This is a test case for youtube-dl.\nFor more information, see github.com/rg3/youtube-dl\nTest chars: \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", + "uploader_id": "user7108434", + "uploader": "Filippo Valsorda", "title": "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", + "duration": 10, }, }, { 'url': 'http://vimeopro.com/openstreetmapus/state-of-the-map-us-2013/video/68093876', - 'file': '68093876.mp4', 'md5': '3b5ca6aa22b60dfeeadf50b72e44ed82', 'note': 'Vimeo Pro video (#1197)', 'info_dict': { + 'id': '68093876', + 
'ext': 'mp4', 'uploader_id': 'openstreetmapus', 'uploader': 'OpenStreetMap US', 'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography', + 'duration': 1595, }, }, { 'url': 'http://player.vimeo.com/video/54469442', - 'file': '54469442.mp4', 'md5': '619b811a4417aa4abe78dc653becf511', 'note': 'Videos that embed the url in the player page', 'info_dict': { + 'id': '54469442', + 'ext': 'mp4', 'title': 'Kathy Sierra: Building the minimum Badass User, Business of Software', 'uploader': 'The BLN & Business of Software', 'uploader_id': 'theblnbusinessofsoftware', + 'duration': 3610, }, }, { 'url': 'http://vimeo.com/68375962', - 'file': '68375962.mp4', 'md5': 'aaf896bdb7ddd6476df50007a0ac0ae7', 'note': 'Video protected with password', 'info_dict': { + 'id': '68375962', + 'ext': 'mp4', 'title': 'youtube-dl password protected test video', 'upload_date': '20130614', 'uploader_id': 'user18948128', 'uploader': 'Jaime Marquínez Ferrándiz', + 'duration': 10, }, 'params': { 'videopassword': 'youtube-dl', @@ -96,42 +133,35 @@ class VimeoIE(SubtitlesInfoExtractor): 'upload_date': '20131015', 'uploader_id': 'staff', 'uploader': 'Vimeo Staff', + 'duration': 62, } }, ] - def _login(self): - (username, password) = self._get_login_info() - if username is None: - return - self.report_login() - login_url = 'https://vimeo.com/log_in' - webpage = self._download_webpage(login_url, None, False) - token = self._search_regex(r'xsrft: \'(.*?)\'', webpage, 'login token') - data = compat_urllib_parse.urlencode({'email': username, - 'password': password, - 'action': 'login', - 'service': 'vimeo', - 'token': token, - }) - login_request = compat_urllib_request.Request(login_url, data) - login_request.add_header('Content-Type', 'application/x-www-form-urlencoded') - login_request.add_header('Cookie', 'xsrft=%s' % token) - self._download_webpage(login_request, None, False, 'Wrong login info') + @classmethod + def suitable(cls, url): + if VimeoChannelIE.suitable(url): + # Otherwise channel 
urls like http://vimeo.com/channels/31259 would + # match + return False + else: + return super(VimeoIE, cls).suitable(url) def _verify_video_password(self, url, video_id, webpage): password = self._downloader.params.get('videopassword', None) if password is None: raise ExtractorError('This video is protected by a password, use the --video-password option') token = self._search_regex(r'xsrft: \'(.*?)\'', webpage, 'login token') - data = compat_urllib_parse.urlencode({'password': password, - 'token': token}) + data = compat_urllib_parse.urlencode({ + 'password': password, + 'token': token, + }) # I didn't manage to use the password with https if url.startswith('https'): - pass_url = url.replace('https','http') + pass_url = url.replace('https', 'http') else: pass_url = url - password_request = compat_urllib_request.Request(pass_url+'/password', data) + password_request = compat_urllib_request.Request(pass_url + '/password', data) password_request.add_header('Content-Type', 'application/x-www-form-urlencoded') password_request.add_header('Cookie', 'xsrft=%s' % token) self._download_webpage(password_request, video_id, @@ -171,7 +201,18 @@ class VimeoIE(SubtitlesInfoExtractor): # Retrieve video webpage to extract further information request = compat_urllib_request.Request(url, None, headers) - webpage = self._download_webpage(request, video_id) + try: + webpage = self._download_webpage(request, video_id) + except ExtractorError as ee: + if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: + errmsg = ee.cause.read() + if b'Because of its privacy settings, this video cannot be played here' in errmsg: + raise ExtractorError( + 'Cannot download embed-only video without embedding ' + 'URL. Please call youtube-dl with the URL of the page ' + 'that embeds this video.', + expected=True) + raise # Now we begin extracting as much information as we can from what we # retrieved. 
First we extract the information common to all extractors, @@ -220,13 +261,16 @@ class VimeoIE(SubtitlesInfoExtractor): # Extract video thumbnail video_thumbnail = config["video"].get("thumbnail") if video_thumbnail is None: - _, video_thumbnail = sorted((int(width), t_url) for (width, t_url) in config["video"]["thumbs"].items())[-1] + video_thumbs = config["video"].get("thumbs") + if video_thumbs and isinstance(video_thumbs, dict): + _, video_thumbnail = sorted((int(width), t_url) for (width, t_url) in video_thumbs.items())[-1] # Extract video description video_description = None try: - video_description = get_element_by_attribute("itemprop", "description", webpage) - if video_description: video_description = clean_html(video_description) + video_description = get_element_by_attribute("class", "description_wrapper", webpage) + if video_description: + video_description = clean_html(video_description) except AssertionError as err: # On some pages like (http://player.vimeo.com/video/54469442) the # html tags are not closed, python 2.6 cannot handle it @@ -235,6 +279,9 @@ class VimeoIE(SubtitlesInfoExtractor): else: raise + # Extract video duration + video_duration = int_or_none(config["video"].get("duration")) + # Extract upload date video_upload_date = None mobj = re.search(r'[^/]+)' + _VALID_URL = r'(?:https?://)?vimeo\.com/channels/(?P[^/]+)/?(\?.*)?$' _MORE_PAGES_INDICATOR = r']+?title="(.*?)"' @@ -331,7 +379,7 @@ class VimeoChannelIE(InfoExtractor): video_ids = [] for pagenum in itertools.count(1): webpage = self._download_webpage( - self._page_url(base_url, pagenum) ,list_id, + self._page_url(base_url, pagenum), list_id, 'Downloading page %s' % pagenum) video_ids.extend(re.findall(r'id="clip_(\d+?)"', webpage)) if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None: @@ -347,7 +395,7 @@ class VimeoChannelIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - channel_id = mobj.group('id') + channel_id = 
mobj.group('id') return self._extract_videos(channel_id, 'http://vimeo.com/channels/%s' % channel_id) @@ -414,3 +462,25 @@ class VimeoReviewIE(InfoExtractor): video_id = mobj.group('id') player_url = 'https://player.vimeo.com/player/' + video_id return self.url_result(player_url, 'Vimeo', video_id) + + +class VimeoWatchLaterIE(VimeoBaseInfoExtractor, VimeoChannelIE): + IE_NAME = 'vimeo:watchlater' + IE_DESC = 'Vimeo watch later list, "vimeowatchlater" keyword (requires authentication)' + _VALID_URL = r'https?://vimeo\.com/home/watchlater|:vimeowatchlater' + _LOGIN_REQUIRED = True + _TITLE_RE = r'href="/home/watchlater".*?>(.*?)<' + + def _real_initialize(self): + self._login() + + def _page_url(self, base_url, pagenum): + url = '%s/page:%d/' % (base_url, pagenum) + request = compat_urllib_request.Request(url) + # Set the header to get a partial html page with the ids, + # the normal page doesn't contain them. + request.add_header('X-Requested-With', 'XMLHttpRequest') + return request + + def _real_extract(self, url): + return self._extract_videos('watchlater', 'https://vimeo.com/home/watchlater') diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index e14ff91..076c871 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -1,8 +1,11 @@ from __future__ import unicode_literals import re +import json +import itertools from .common import InfoExtractor +from ..utils import unified_strdate class VineIE(InfoExtractor): @@ -13,31 +16,76 @@ class VineIE(InfoExtractor): 'info_dict': { 'id': 'b9KOOWX7HUx', 'ext': 'mp4', - 'uploader': 'Jack Dorsey', 'title': 'Chicken.', + 'description': 'Chicken.', + 'upload_date': '20130519', + 'uploader': 'Jack Dorsey', + 'uploader_id': '76', }, } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - webpage_url = 'https://vine.co/v/' + video_id - webpage = self._download_webpage(webpage_url, video_id) - self.report_extraction(video_id) + webpage = 
self._download_webpage('https://vine.co/v/' + video_id, video_id) - video_url = self._html_search_meta('twitter:player:stream', webpage, - 'video URL') + data = json.loads(self._html_search_regex( + r'window\.POST_DATA = { %s: ({.+?}) }' % video_id, webpage, 'vine data')) - uploader = self._html_search_regex(r'

    (.*?)

    ', - webpage, 'uploader', fatal=False, flags=re.DOTALL) + formats = [ + { + 'url': data['videoLowURL'], + 'ext': 'mp4', + 'format_id': 'low', + }, + { + 'url': data['videoUrl'], + 'ext': 'mp4', + 'format_id': 'standard', + } + ] return { 'id': video_id, - 'url': video_url, - 'ext': 'mp4', 'title': self._og_search_title(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), - 'uploader': uploader, + 'description': data['description'], + 'thumbnail': data['thumbnailUrl'], + 'upload_date': unified_strdate(data['created']), + 'uploader': data['username'], + 'uploader_id': data['userIdStr'], + 'like_count': data['likes']['count'], + 'comment_count': data['comments']['count'], + 'repost_count': data['reposts']['count'], + 'formats': formats, } + + +class VineUserIE(InfoExtractor): + IE_NAME = 'vine:user' + _VALID_URL = r'(?:https?://)?vine\.co/(?P[^/]+)/?(\?.*)?$' + _VINE_BASE_URL = "https://vine.co/" + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + user = mobj.group('user') + + profile_url = "%sapi/users/profiles/vanity/%s" % ( + self._VINE_BASE_URL, user) + profile_data = self._download_json( + profile_url, user, note='Downloading user profile data') + + user_id = profile_data['data']['userId'] + timeline_data = [] + for pagenum in itertools.count(1): + timeline_url = "%sapi/timelines/users/%s?page=%s" % ( + self._VINE_BASE_URL, user_id, pagenum) + timeline_page = self._download_json( + timeline_url, user, note='Downloading page %d' % pagenum) + timeline_data.extend(timeline_page['data']['records']) + if timeline_page['data']['nextPage'] is None: + break + + entries = [ + self.url_result(e['permalinkUrl'], 'Vine') for e in timeline_data] + return self.playlist_result(entries, user) diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index a293b88..fb082f3 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -16,7 +16,7 @@ from ..utils import ( class VKIE(InfoExtractor): IE_NAME = 'vk.com' - 
_VALID_URL = r'https?://vk\.com/(?:videos.*?\?.*?z=)?video(?P.*?)(?:\?|%2F|$)' + _VALID_URL = r'https?://vk\.com/(?:video_ext\.php\?.*?\boid=(?P-?\d+).*?\bid=(?P\d+)|(?:videos.*?\?.*?z=)?video(?P.*?)(?:\?|%2F|$))' _NETRC_MACHINE = 'vk' _TESTS = [ @@ -37,11 +37,23 @@ class VKIE(InfoExtractor): 'info_dict': { 'id': '163339118', 'ext': 'mp4', - 'uploader': 'Elvira Dzhonik', + 'uploader': 'Elya Iskhakova', 'title': 'Dream Theater - Hollow Years Live at Budokan 720*', 'duration': 558, } }, + { + 'note': 'Embedded video', + 'url': 'http://vk.com/video_ext.php?oid=32194266&id=162925554&hash=7d8c2e0d5e05aeaa&hd=1', + 'md5': 'c7ce8f1f87bec05b3de07fdeafe21a0a', + 'info_dict': { + 'id': '162925554', + 'ext': 'mp4', + 'uploader': 'Vladimir Gavrin', + 'title': 'Lin Dan', + 'duration': 101, + } + }, { 'url': 'http://vk.com/video-8871596_164049491', 'md5': 'a590bcaf3d543576c9bd162812387666', @@ -54,7 +66,7 @@ class VKIE(InfoExtractor): 'duration': 8352, }, 'skip': 'Requires vk account credentials', - } + }, ] def _login(self): @@ -82,7 +94,10 @@ class VKIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = mobj.group('videoid') + + if not video_id: + video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id')) info_url = 'http://vk.com/al_video.php?act=show&al=1&video=%s' % video_id info_page = self._download_webpage(info_url, video_id) @@ -93,7 +108,7 @@ class VKIE(InfoExtractor): m_yt = re.search(r'src="(http://www.youtube.com/.*?)"', info_page) if m_yt is not None: - self.to_screen(u'Youtube video detected') + self.to_screen('Youtube video detected') return self.url_result(m_yt.group(1), 'Youtube') data_json = self._search_regex(r'var vars = ({.*?});', info_page, 'vars') data = json.loads(data_json) diff --git a/youtube_dl/extractor/vube.py b/youtube_dl/extractor/vube.py index fbdff47..7b77865 100644 --- a/youtube_dl/extractor/vube.py +++ b/youtube_dl/extractor/vube.py @@ -1,47 +1,69 @@ from __future__ 
import unicode_literals import re -import datetime from .common import InfoExtractor +from ..utils import int_or_none class VubeIE(InfoExtractor): IE_NAME = 'vube' IE_DESC = 'Vube.com' - _VALID_URL = r'http://vube\.com/[^/]+/(?P[\da-zA-Z]{10})' + _VALID_URL = r'http://vube\.com/(?:[^/]+/)+(?P[\da-zA-Z]{10})\b' - _TEST = { - 'url': 'http://vube.com/Chiara+Grispo+Video+Channel/YL2qNPkqon', - 'md5': 'f81dcf6d0448e3291f54380181695821', - 'info_dict': { - 'id': 'YL2qNPkqon', - 'ext': 'mp4', - 'title': 'Chiara Grispo - Price Tag by Jessie J', - 'description': 'md5:8ea652a1f36818352428cb5134933313', - 'thumbnail': 'http://frame.thestaticvube.com/snap/228x128/102e7e63057-5ebc-4f5c-4065-6ce4ebde131f.jpg', - 'uploader': 'Chiara.Grispo', - 'uploader_id': '1u3hX0znhP', - 'upload_date': '20140103', - 'duration': 170.56 + _TESTS = [ + { + 'url': 'http://vube.com/Chiara+Grispo+Video+Channel/YL2qNPkqon', + 'md5': 'db7aba89d4603dadd627e9d1973946fe', + 'info_dict': { + 'id': 'YL2qNPkqon', + 'ext': 'mp4', + 'title': 'Chiara Grispo - Price Tag by Jessie J', + 'description': 'md5:8ea652a1f36818352428cb5134933313', + 'thumbnail': 'http://frame.thestaticvube.com/snap/228x128/102e7e63057-5ebc-4f5c-4065-6ce4ebde131f.jpg', + 'uploader': 'Chiara.Grispo', + 'uploader_id': '1u3hX0znhP', + 'timestamp': 1388743358, + 'upload_date': '20140103', + 'duration': 170.56 + } + }, + { + 'url': 'http://vube.com/SerainaMusic/my-7-year-old-sister-and-i-singing-alive-by-krewella/UeBhTudbfS?t=s&n=1', + 'md5': '5d4a52492d76f72712117ce6b0d98d08', + 'info_dict': { + 'id': 'UeBhTudbfS', + 'ext': 'mp4', + 'title': 'My 7 year old Sister and I singing "Alive" by Krewella', + 'description': 'md5:40bcacb97796339f1690642c21d56f4a', + 'thumbnail': 'http://frame.thestaticvube.com/snap/228x128/102265d5a9f-0f17-4f6b-5753-adf08484ee1e.jpg', + 'uploader': 'Seraina', + 'uploader_id': 'XU9VE2BQ2q', + 'timestamp': 1396492438, + 'upload_date': '20140403', + 'duration': 240.107 + } } - } + ] def _real_extract(self, url): mobj = 
re.match(self._VALID_URL, url) video_id = mobj.group('id') - video = self._download_json('http://vube.com/api/v2/video/%s' % video_id, - video_id, 'Downloading video JSON') + video = self._download_json( + 'http://vube.com/api/v2/video/%s' % video_id, video_id, 'Downloading video JSON') public_id = video['public_id'] - formats = [{'url': 'http://video.thestaticvube.com/video/%s/%s.mp4' % (fmt['media_resolution_id'], public_id), - 'height': int(fmt['height']), - 'abr': int(fmt['audio_bitrate']), - 'vbr': int(fmt['video_bitrate']), - 'format_id': fmt['media_resolution_id'] - } for fmt in video['mtm'] if fmt['transcoding_status'] == 'processed'] + formats = [ + { + 'url': 'http://video.thestaticvube.com/video/%s/%s.mp4' % (fmt['media_resolution_id'], public_id), + 'height': int(fmt['height']), + 'abr': int(fmt['audio_bitrate']), + 'vbr': int(fmt['video_bitrate']), + 'format_id': fmt['media_resolution_id'] + } for fmt in video['mtm'] if fmt['transcoding_status'] == 'processed' + ] self._sort_formats(formats) @@ -52,16 +74,16 @@ class VubeIE(InfoExtractor): thumbnail = 'http:' + thumbnail uploader = video['user_alias'] uploader_id = video['user_url_id'] - upload_date = datetime.datetime.fromtimestamp(int(video['upload_time'])).strftime('%Y%m%d') + timestamp = int(video['upload_time']) duration = video['duration'] - view_count = video['raw_view_count'] - like_count = video['total_likes'] - dislike_count= video['total_hates'] + view_count = video.get('raw_view_count') + like_count = video.get('total_likes') + dislike_count= video.get('total_hates') - comment = self._download_json('http://vube.com/api/video/%s/comment' % video_id, - video_id, 'Downloading video comment JSON') + comment = self._download_json( + 'http://vube.com/api/video/%s/comment' % video_id, video_id, 'Downloading video comment JSON') - comment_count = comment['total'] + comment_count = int_or_none(comment.get('total')) return { 'id': video_id, @@ -71,10 +93,10 @@ class VubeIE(InfoExtractor): 
'thumbnail': thumbnail, 'uploader': uploader, 'uploader_id': uploader_id, - 'upload_date': upload_date, + 'timestamp': timestamp, 'duration': duration, 'view_count': view_count, 'like_count': like_count, 'dislike_count': dislike_count, 'comment_count': comment_count, - } \ No newline at end of file + } diff --git a/youtube_dl/extractor/vuclip.py b/youtube_dl/extractor/vuclip.py new file mode 100644 index 0000000..fb0600f --- /dev/null +++ b/youtube_dl/extractor/vuclip.py @@ -0,0 +1,66 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse_urlparse, + parse_duration, + qualities, +) + + +class VuClipIE(InfoExtractor): + _VALID_URL = r'http://(?:m\.)?vuclip\.com/w\?.*?cid=(?P[0-9]+)' + + _TEST = { + 'url': 'http://m.vuclip.com/w?cid=843902317&fid=63532&z=1007&nvar&frm=index.html&bu=4757321434', + 'md5': '92ac9d1ccefec4f0bb474661ab144fcf', + 'info_dict': { + 'id': '843902317', + 'ext': '3gp', + 'title': 'Movie Trailer: Noah', + 'duration': 139, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + webpage = self._download_webpage(url, video_id) + ad_m = re.search( + r'''value="No.*?" 
onClick="location.href='([^"']+)'"''', webpage) + if ad_m: + urlr = compat_urllib_parse_urlparse(url) + adfree_url = urlr.scheme + '://' + urlr.netloc + ad_m.group(1) + webpage = self._download_webpage( + adfree_url, video_id, note='Download post-ad page') + + links_code = self._search_regex( + r'(?s)', + webpage, 'duration', fatal=False)) + + view_count = self._html_search_regex(r'Views: ([^<]+)', webpage, 'view count', fatal=False) + if view_count: + view_count = str_to_int(view_count) + + mobj = re.search(r"hint='(?P\d+) Likes / (?P\d+) Dislikes'", webpage) + (like_count, dislike_count) = (mobj.group('likecount'), mobj.group('dislikecount')) if mobj else (None, None) + + mobj = re.search(r'Comments \((?P\d+)\)', webpage) + comment_count = mobj.group('commentcount') if mobj else 0 age_limit = self._rta_search(webpage) hd = is_hd(webpage) + video_url = extract_video_url(webpage) formats = [{ 'url': video_url, 'format_id': 'hd' if hd else 'sd', - 'preference': 0, + 'preference': 1, }] - video_mp4_url = extract_mp4_video_url(webpage) - if video_mp4_url is not None: - formats.append({ - 'url': video_mp4_url, - 'ext': 'mp4', - 'format_id': 'mp4-hd' if hd else 'mp4-sd', - 'preference': 1, - }) - if not hd: - webpage = self._download_webpage( - mrss_url + '?hd', video_id, note='Downloading HD webpage') + mrss_url = self._search_regex(r'(.*?)\s+-\s+XNXX.COM' - VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&' + _VALID_URL = r'^https?://(?:video|www)\.xnxx\.com/video(?P[0-9]+)/(.*)' _TEST = { - u'url': u'http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_', - u'file': u'1135332.flv', - u'md5': u'0831677e2b4761795f68d417e0b7b445', - u'info_dict': { - u"title": u"lida \u00bb Naked Funny Actress (5)", - u"age_limit": 18, + 'url': 'http://video.xnxx.com/video1135332/lida_naked_funny_actress_5_', + 'md5': '0831677e2b4761795f68d417e0b7b445', + 'info_dict': { + 'id': '1135332', + 'ext': 'flv', + 'title': 'lida » Naked Funny Actress (5)', + 'age_limit': 18, } } def 
_real_extract(self, url): mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - video_id = mobj.group(1) + video_id = mobj.group('id') # Get webpage content webpage = self._download_webpage(url, video_id) - video_url = self._search_regex(self.VIDEO_URL_RE, - webpage, u'video URL') + video_url = self._search_regex(r'flv_url=(.*?)&', + webpage, 'video URL') video_url = compat_urllib_parse.unquote(video_url) - video_title = self._html_search_regex(self.VIDEO_TITLE_RE, - webpage, u'title') + video_title = self._html_search_regex(r'(.*?)\s+-\s+XNXX.COM', + webpage, 'title') - video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE, - webpage, u'thumbnail', fatal=False) + video_thumbnail = self._search_regex(r'url_bigthumb=(.*?)&', + webpage, 'thumbnail', fatal=False) - return [{ + return { 'id': video_id, 'url': video_url, - 'uploader': None, - 'upload_date': None, 'title': video_title, 'ext': 'flv', 'thumbnail': video_thumbnail, - 'description': None, 'age_limit': 18, - }] + } diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index 9826199..b293e26 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -1,25 +1,29 @@ from __future__ import unicode_literals -import os import re +import json from .common import InfoExtractor from ..utils import ( - compat_urllib_parse_urlparse, compat_urllib_request, + parse_duration, + str_to_int, ) + class XTubeIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>xtube\.com/watch\.php\?v=(?P<videoid>[^/?&]+))' + _VALID_URL = r'https?://(?:www\.)?(?P<url>xtube\.com/watch\.php\?v=(?P<videoid>[^/?&]+))' _TEST = { 'url': 'http://www.xtube.com/watch.php?v=kVTUy_G222_', - 'file': 'kVTUy_G222_.mp4', 'md5': '092fbdd3cbe292c920ef6fc6a8a9cdab', 'info_dict': { - "title": "strange erotica", - "description": "surreal gay themed erotica...almost an ET kind of thing", - "uploader": "greenshowers", - "age_limit": 18, + 'id': 
'kVTUy_G222_', + 'ext': 'mp4', + 'title': 'strange erotica', + 'description': 'surreal gay themed erotica...almost an ET kind of thing', + 'uploader': 'greenshowers', + 'duration': 450, + 'age_limit': 18, } } @@ -32,25 +36,79 @@ class XTubeIE(InfoExtractor): req.add_header('Cookie', 'age_verified=1') webpage = self._download_webpage(req, video_id) - video_title = self._html_search_regex(r'<div class="p_5px[^>]*>([^<]+)', webpage, 'title') - video_uploader = self._html_search_regex(r'so_s\.addVariable\("owner_u", "([^"]+)', webpage, 'uploader', fatal=False) - video_description = self._html_search_regex(r'<p class="video_description">([^<]+)', webpage, 'description', fatal=False) - video_url= self._html_search_regex(r'var videoMp4 = "([^"]+)', webpage, 'video_url').replace('\\/', '/') - path = compat_urllib_parse_urlparse(video_url).path - extension = os.path.splitext(path)[1][1:] - format = path.split('/')[5].split('_')[:2] - format[0] += 'p' - format[1] += 'k' - format = "-".join(format) + video_title = self._html_search_regex(r'<p class="title">([^<]+)', webpage, 'title') + video_uploader = self._html_search_regex( + r'so_s\.addVariable\("owner_u", "([^"]+)', webpage, 'uploader', fatal=False) + video_description = self._html_search_regex( + r'<p class="fieldsDesc">([^<]+)', webpage, 'description', fatal=False) + duration = parse_duration(self._html_search_regex( + r'<span class="bold">Runtime:</span> ([^<]+)</p>', webpage, 'duration', fatal=False)) + view_count = self._html_search_regex( + r'<span class="bold">Views:</span> ([\d,\.]+)</p>', webpage, 'view count', fatal=False) + if view_count: + view_count = str_to_int(view_count) + comment_count = self._html_search_regex( + r'<div id="commentBar">([\d,\.]+) Comments</div>', webpage, 'comment count', fatal=False) + if comment_count: + comment_count = str_to_int(comment_count) + + player_quality_option = json.loads(self._html_search_regex( + r'playerQualityOption = ({.+?});', webpage, 'player quality option')) + + 
QUALITIES = ['3gp', 'mp4_normal', 'mp4_high', 'flv', 'mp4_ultra', 'mp4_720', 'mp4_1080'] + formats = [ + { + 'url': furl, + 'format_id': format_id, + 'preference': QUALITIES.index(format_id) if format_id in QUALITIES else -1, + } for format_id, furl in player_quality_option.items() + ] + self._sort_formats(formats) return { 'id': video_id, 'title': video_title, 'uploader': video_uploader, 'description': video_description, - 'url': video_url, - 'ext': extension, - 'format': format, - 'format_id': format, + 'duration': duration, + 'view_count': view_count, + 'comment_count': comment_count, + 'formats': formats, 'age_limit': 18, } + +class XTubeUserIE(InfoExtractor): + IE_DESC = 'XTube user profile' + _VALID_URL = r'https?://(?:www\.)?xtube\.com/community/profile\.php\?(.*?)user=(?P<username>[^&#]+)(?:$|[&#])' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + username = mobj.group('username') + + profile_page = self._download_webpage( + url, username, note='Retrieving profile page') + + video_count = int(self._search_regex( + r'<strong>%s\'s Videos \(([0-9]+)\)</strong>'%username, profile_page, + 'video count')) + + PAGE_SIZE = 25 + urls = [] + page_count = (video_count + PAGE_SIZE + 1) // PAGE_SIZE + for n in range(1, page_count + 1): + lpage_url = 'http://www.xtube.com/user_videos.php?page=%d&u=%s' % (n, username) + lpage = self._download_webpage( + lpage_url, username, + note='Downloading page %d/%d' % (n, page_count)) + urls.extend( + re.findall(r'addthis:url="([^"]+)"', lpage)) + + return { + '_type': 'playlist', + 'id': username, + 'entries': [{ + '_type': 'url', + 'url': eurl, + 'ie_key': 'XTube', + } for eurl in urls] + } diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index 85e99e1..7e00448 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -5,18 +5,21 @@ import re from .common import InfoExtractor from ..utils import ( compat_urllib_parse, + ExtractorError, + 
clean_html, ) class XVideosIE(InfoExtractor): _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)' _TEST = { - 'url': 'http://www.xvideos.com/video939581/funny_porns_by_s_-1', - 'file': '939581.flv', - 'md5': '1d0c835822f0a71a7bf011855db929d0', + 'url': 'http://www.xvideos.com/video4588838/biker_takes_his_girl', + 'md5': '4b46ae6ea5e6e9086e714d883313c0c9', 'info_dict': { - "title": "Funny Porns By >>>>S<<<<<< -1", - "age_limit": 18, + 'id': '4588838', + 'ext': 'flv', + 'title': 'Biker Takes his Girl', + 'age_limit': 18, } } @@ -28,6 +31,10 @@ class XVideosIE(InfoExtractor): self.report_extraction(video_id) + mobj = re.search(r'<h1 class="inlineError">(.+?)</h1>', webpage) + if mobj: + raise ExtractorError('%s said: %s' % (self.IE_NAME, clean_html(mobj.group(1))), expected=True) + # Extract video URL video_url = compat_urllib_parse.unquote( self._search_regex(r'flv_url=(.+?)&', webpage, 'video URL')) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index d92d14f..d84be25 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -14,27 +14,39 @@ from ..utils import ( class YahooIE(InfoExtractor): - IE_DESC = 'Yahoo screen' - _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html' + IE_DESC = 'Yahoo screen and movies' + _VALID_URL = r'https?://(?:screen|movies)\.yahoo\.com/.*?-(?P<id>[0-9]+)(?:-[a-z]+)?\.html' _TESTS = [ { 'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', - 'file': '214727115.mp4', 'md5': '4962b075c08be8690a922ee026d05e69', 'info_dict': { + 'id': '2d25e626-2378-391f-ada0-ddaf1417e588', + 'ext': 'mp4', 'title': 'Julian Smith & Travis Legg Watch Julian Smith', 'description': 'Julian and Travis watch Julian Smith', }, }, { 'url': 'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html', - 'file': '103000935.mp4', 'md5': 'd6e6fc6e1313c608f316ddad7b82b306', 'info_dict': { + 'id': 'd1dedf8c-d58c-38c3-8963-e899929ae0a9', + 'ext': 
'mp4', 'title': 'Codefellas - The Cougar Lies with Spanish Moss', 'description': 'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?', }, }, + { + 'url': 'https://movies.yahoo.com/video/world-loves-spider-man-190819223.html', + 'md5': '410b7104aa9893b765bc22787a22f3d9', + 'info_dict': { + 'id': '516ed8e2-2c4f-339f-a211-7a8b49d30845', + 'ext': 'mp4', + 'title': 'The World Loves Spider-Man', + 'description': '''People all over the world are celebrating the release of \"The Amazing Spider-Man 2.\" We're taking a look at the enthusiastic response Spider-Man has received from viewers all over the world.''', + } + } ] def _real_extract(self, url): @@ -42,16 +54,25 @@ class YahooIE(InfoExtractor): video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) - items_json = self._search_regex(r'mediaItems: ({.*?})$', - webpage, 'items', flags=re.MULTILINE) - items = json.loads(items_json) - info = items['mediaItems']['query']['results']['mediaObj'][0] - # The 'meta' field is not always in the video webpage, we request it - # from another page - long_id = info['id'] - return self._get_info(long_id, video_id) - - def _get_info(self, long_id, video_id): + items_json = self._search_regex( + r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE, + default=None) + if items_json is None: + CONTENT_ID_REGEXES = [ + r'YUI\.namespace\("Media"\)\.CONTENT_ID\s*=\s*"([^"]+)"', + r'root\.App\.Cache\.context\.videoCache\.curVideo = \{"([^"]+)"' + ] + long_id = self._search_regex(CONTENT_ID_REGEXES, webpage, 'content ID') + video_id = long_id + else: + items = json.loads(items_json) + info = items['mediaItems']['query']['results']['mediaObj'][0] + # The 'meta' field is not always in the video webpage, we request it + # from another page + long_id = info['id'] + return self._get_info(long_id, 
video_id, webpage) + + def _get_info(self, long_id, video_id, webpage): query = ('SELECT * FROM yahoo.media.video.streams WHERE id="%s"' ' AND plrs="86Gj0vCaSzV_Iuf6hNylf2" AND region="US"' ' AND protocol="http"' % long_id) @@ -60,10 +81,9 @@ class YahooIE(InfoExtractor): 'env': 'prod', 'format': 'json', }) - query_result_json = self._download_webpage( + query_result = self._download_json( 'http://video.query.yahoo.com/v1/public/yql?' + data, video_id, 'Downloading video info') - query_result = json.loads(query_result_json) info = query_result['query']['results']['mediaObj'][0] meta = info['meta'] @@ -86,7 +106,6 @@ class YahooIE(InfoExtractor): else: format_url = compat_urlparse.urljoin(host, path) format_info['url'] = format_url - formats.append(format_info) self._sort_formats(formats) @@ -96,7 +115,7 @@ class YahooIE(InfoExtractor): 'title': meta['title'], 'formats': formats, 'description': clean_html(meta['description']), - 'thumbnail': meta['thumbnail'], + 'thumbnail': meta['thumbnail'] if meta.get('thumbnail') else self._og_search_thumbnail(webpage), } @@ -104,7 +123,7 @@ class YahooNewsIE(YahooIE): IE_NAME = 'yahoo:news' _VALID_URL = r'http://news\.yahoo\.com/video/.*?-(?P<id>\d*?)\.html' - _TEST = { + _TESTS = [{ 'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html', 'md5': '67010fdf3a08d290e060a4dd96baa07b', 'info_dict': { @@ -113,17 +132,14 @@ class YahooNewsIE(YahooIE): 'title': 'China Moses Is Crazy About the Blues', 'description': 'md5:9900ab8cd5808175c7b3fe55b979bed0', }, - } - - # Overwrite YahooIE properties we don't want - _TESTS = [] + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') webpage = self._download_webpage(url, video_id) long_id = self._search_regex(r'contentId: \'(.+?)\',', webpage, 'long id') - return self._get_info(long_id, video_id) + return self._get_info(long_id, video_id, webpage) class YahooSearchIE(SearchInfoExtractor): @@ -134,27 +150,25 @@ class 
YahooSearchIE(SearchInfoExtractor): def _get_n_results(self, query, n): """Get a specified number of results for a query""" - - res = { - '_type': 'playlist', - 'id': query, - 'entries': [] - } - for pagenum in itertools.count(0): + entries = [] + for pagenum in itertools.count(0): result_url = 'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30) - webpage = self._download_webpage(result_url, query, - note='Downloading results page '+str(pagenum+1)) - info = json.loads(webpage) + info = self._download_json(result_url, query, + note='Downloading results page '+str(pagenum+1)) m = info['m'] results = info['results'] for (i, r) in enumerate(results): - if (pagenum * 30) +i >= n: + if (pagenum * 30) + i >= n: break mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r) e = self.url_result('http://' + mobj.group('url'), 'Yahoo') - res['entries'].append(e) - if (pagenum * 30 +i >= n) or (m['last'] >= (m['total'] -1)): + entries.append(e) + if (pagenum * 30 + i >= n) or (m['last'] >= (m['total'] - 1)): break - return res + return { + '_type': 'playlist', + 'id': query, + 'entries': entries, + } diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index 77ad423..d456c4d 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -1,3 +1,6 @@ +from __future__ import unicode_literals + + import json import re import sys @@ -17,24 +20,25 @@ from ..aes import ( class YouPornIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:www\.)?(?P<url>youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+))' + _VALID_URL = r'^(?P<proto>https?://)(?:www\.)?(?P<url>youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+))' _TEST = { - u'url': u'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/', - u'file': u'505835.mp4', - u'md5': u'71ec5fcfddacf80f495efa8b6a8d9a89', - u'info_dict': { - u"upload_date": u"20101221", - 
u"description": u"Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?", - u"uploader": u"Ask Dan And Jennifer", - u"title": u"Sex Ed: Is It Safe To Masturbate Daily?", - u"age_limit": 18, + 'url': 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/', + 'md5': '71ec5fcfddacf80f495efa8b6a8d9a89', + 'info_dict': { + 'id': '505835', + 'ext': 'mp4', + 'upload_date': '20101221', + 'description': 'Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?', + 'uploader': 'Ask Dan And Jennifer', + 'title': 'Sex Ed: Is It Safe To Masturbate Daily?', + 'age_limit': 18, } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('videoid') - url = 'http://www.' + mobj.group('url') + url = mobj.group('proto') + 'www.' + mobj.group('url') req = compat_urllib_request.Request(url) req.add_header('Cookie', 'age_verified=1') @@ -42,7 +46,7 @@ class YouPornIE(InfoExtractor): age_limit = self._rta_search(webpage) # Get JSON parameters - json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters') + json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, 'JSON parameters') try: params = json.loads(json_params) except: @@ -61,7 +65,7 @@ class YouPornIE(InfoExtractor): # Get all of the links from the page DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>' download_list_html = self._search_regex(DOWNLOAD_LIST_RE, - webpage, u'download list').strip() + webpage, 'download list').strip() LINK_RE = r'<a href="([^"]+)">' links = re.findall(LINK_RE, download_list_html) @@ -86,7 +90,7 @@ class YouPornIE(InfoExtractor): resolution = format_parts[0] height = int(resolution[:-len('p')]) bitrate = int(format_parts[1][:-len('k')]) - format = u'-'.join(format_parts) + u'-' + dn + format = '-'.join(format_parts) + '-' + dn formats.append({ 'url': video_url, diff --git 
a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index a810368..7c50881 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -7,13 +7,13 @@ import itertools import json import os.path import re -import string import struct import traceback import zlib from .common import InfoExtractor, SearchInfoExtractor from .subtitles import SubtitlesInfoExtractor +from ..jsinterp import JSInterpreter from ..utils import ( compat_chr, compat_parse_qs, @@ -29,7 +29,6 @@ from ..utils import ( ExtractorError, int_or_none, PagedList, - RegexNotFoundError, unescapeHTML, unified_strdate, orderedSet, @@ -138,19 +137,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/| (?:www\.)?deturl\.com/www\.youtube\.com/| (?:www\.)?pwnyoutube\.com/| + (?:www\.)?yourepeat\.com/| tube\.majestyc\.net/| youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? # handle anchor (#/) redirect urls (?: # the various things that can precede the ID: (?:(?:v|embed|e)/) # v/ or embed/ or e/ |(?: # or the v= param in all its forms - (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) + (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) (?:\?|\#!?) # the params delimiter ? or # or #! (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx) v= ) )) |youtu\.be/ # just youtu.be/xxxx + |https?://(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId= ) )? # all until now is optional -> you can pass the naked ID ([0-9A-Za-z_-]{11}) # here is it! 
the YouTube video ID @@ -176,32 +177,32 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # 3d videos - '82': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20}, - '83': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20}, - '84': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20}, - '85': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': '3D', 'preference': -20}, - '100': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': '3D', 'preference': -20}, - '101': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': '3D', 'preference': -20}, - '102': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': '3D', 'preference': -20}, + '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20}, + '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20}, + '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20}, + '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20}, + '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20}, + '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20}, + '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20}, # Apple HTTP Live Streaming - '92': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'HLS', 'preference': -10}, - '93': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'HLS', 'preference': -10}, - '94': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'HLS', 'preference': -10}, - '95': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'HLS', 'preference': -10}, - '96': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'HLS', 'preference': -10}, - '132': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 
'format_note': 'HLS', 'preference': -10}, - '151': {'ext': 'mp4', 'height': 72, 'resolution': '72p', 'format_note': 'HLS', 'preference': -10}, + '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10}, + '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10}, + '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10}, + '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10}, + '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10}, + '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10}, + '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10}, # DASH mp4 video - '133': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'DASH video', 'preference': -40}, - '134': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'DASH video', 'preference': -40}, - '135': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'DASH video', 'preference': -40}, - '136': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'DASH video', 'preference': -40}, - '137': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40}, - '138': {'ext': 'mp4', 'height': 1081, 'resolution': '>1080p', 'format_note': 'DASH video', 'preference': -40}, - '160': {'ext': 'mp4', 'height': 192, 'resolution': '192p', 'format_note': 'DASH video', 'preference': -40}, - '264': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40}, + '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 
'acodec': 'none', 'preference': -40}, + '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, # Dash mp4 audio '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50}, @@ -209,23 +210,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50}, # Dash webm - '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, - '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, - '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, - '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, - '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, - '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, - '242': {'ext': 'webm', 'height': 240, 'resolution': '240p', 'format_note': 'DASH webm', 'preference': -40}, - '243': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': 'DASH webm', 'preference': -40}, - '244': {'ext': 'webm', 'height': 
480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40}, - '245': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40}, - '246': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40}, - '247': {'ext': 'webm', 'height': 720, 'resolution': '720p', 'format_note': 'DASH webm', 'preference': -40}, - '248': {'ext': 'webm', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH webm', 'preference': -40}, + '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, + '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, + '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, + '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, + '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, + '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, + '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 
'preference': -40}, + '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, # Dash webm audio - '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50}, - '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 256, 'preference': -50}, + '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 48, 'preference': -50}, + '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50}, # RTMP (unnamed) '_rtmp': {'protocol': 'rtmp'}, @@ -241,7 +243,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): u"uploader": u"Philipp Hagemeister", u"uploader_id": u"phihag", u"upload_date": u"20121002", - u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ." + u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .", + u"categories": [u'Science & Technology'], } }, { @@ -251,7 +254,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): u"info_dict": { u"upload_date": u"20120506", u"title": u"Icona Pop - I Love It (feat. 
Charli XCX) [OFFICIAL VIDEO]", - u"description": u"md5:5b292926389560516e384ac437c0ec07", + u"description": u"md5:fea86fda2d5a5784273df5c7cc994d9f", u"uploader": u"Icona Pop", u"uploader_id": u"IconaPop" } @@ -296,6 +299,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): u"format": "141", }, }, + # DASH manifest with encrypted signature + { + u'url': u'https://www.youtube.com/watch?v=IB3lcPjvWLA', + u'info_dict': { + u'id': u'IB3lcPjvWLA', + u'ext': u'm4a', + u'title': u'Afrojack - The Spark ft. Spree Wilson', + u'description': u'md5:9717375db5a9a3992be4668bbf3bc0a8', + u'uploader': u'AfrojackVEVO', + u'uploader_id': u'AfrojackVEVO', + u'upload_date': u'20131011', + }, + u"params": { + u'youtube_include_dash_manifest': True, + u'format': '141', + }, + }, ] @@ -421,113 +441,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): def _parse_sig_js(self, jscode): funcname = self._search_regex( r'signature=([a-zA-Z]+)', jscode, - u'Initial JS player signature function name') - - functions = {} - - def argidx(varname): - return string.lowercase.index(varname) - - def interpret_statement(stmt, local_vars, allow_recursion=20): - if allow_recursion < 0: - raise ExtractorError(u'Recursion limit reached') - - if stmt.startswith(u'var '): - stmt = stmt[len(u'var '):] - ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' 
+ - r'=(?P<expr>.*)$', stmt) - if ass_m: - if ass_m.groupdict().get('index'): - def assign(val): - lvar = local_vars[ass_m.group('out')] - idx = interpret_expression(ass_m.group('index'), - local_vars, allow_recursion) - assert isinstance(idx, int) - lvar[idx] = val - return val - expr = ass_m.group('expr') - else: - def assign(val): - local_vars[ass_m.group('out')] = val - return val - expr = ass_m.group('expr') - elif stmt.startswith(u'return '): - assign = lambda v: v - expr = stmt[len(u'return '):] - else: - raise ExtractorError( - u'Cannot determine left side of statement in %r' % stmt) - - v = interpret_expression(expr, local_vars, allow_recursion) - return assign(v) - - def interpret_expression(expr, local_vars, allow_recursion): - if expr.isdigit(): - return int(expr) - - if expr.isalpha(): - return local_vars[expr] - - m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr) - if m: - member = m.group('member') - val = local_vars[m.group('in')] - if member == 'split("")': - return list(val) - if member == 'join("")': - return u''.join(val) - if member == 'length': - return len(val) - if member == 'reverse()': - return val[::-1] - slice_m = re.match(r'slice\((?P<idx>.*)\)', member) - if slice_m: - idx = interpret_expression( - slice_m.group('idx'), local_vars, allow_recursion-1) - return val[idx:] - - m = re.match( - r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr) - if m: - val = local_vars[m.group('in')] - idx = interpret_expression(m.group('idx'), local_vars, - allow_recursion-1) - return val[idx] - - m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr) - if m: - a = interpret_expression(m.group('a'), - local_vars, allow_recursion) - b = interpret_expression(m.group('b'), - local_vars, allow_recursion) - return a % b - - m = re.match( - r'^(?P<func>[a-zA-Z$]+)\((?P<args>[a-z0-9,]+)\)$', expr) - if m: - fname = m.group('func') - if fname not in functions: - functions[fname] = extract_function(fname) - argvals = [int(v) if v.isdigit() else local_vars[v] - for 
v in m.group('args').split(',')] - return functions[fname](argvals) - raise ExtractorError(u'Unsupported JS expression %r' % expr) - - def extract_function(funcname): - func_m = re.search( - r'function ' + re.escape(funcname) + - r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}', - jscode) - argnames = func_m.group('args').split(',') - - def resf(args): - local_vars = dict(zip(argnames, args)) - for stmt in func_m.group('code').split(';'): - res = interpret_statement(stmt, local_vars) - return res - return resf - - initial_function = extract_function(funcname) + u'Initial JS player signature function name') + + jsi = JSInterpreter(jscode) + initial_function = jsi.extract_function(funcname) return lambda s: initial_function([s]) def _parse_sig_swf(self, file_contents): @@ -1113,14 +1030,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): return self._download_webpage(url, video_id, note=u'Searching for annotations.', errnote=u'Unable to download video annotations.') def _real_extract(self, url): + proto = ( + u'http' if self._downloader.params.get('prefer_insecure', False) + else u'https') + # Extract original video URL from URL with redirection, like age verification, using next_url parameter mobj = re.search(self._NEXT_URL_RE, url) if mobj: - url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/') + url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/') video_id = self.extract_id(url) # Get video webpage - url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id + url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id video_webpage = self._download_webpage(url, video_id) # Attempt to extract SWF player URL @@ -1145,7 +1066,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'asv': 3, 'sts':'1588', }) - video_info_url = 'https://www.youtube.com/get_video_info?' 
+ data + video_info_url = proto + '://www.youtube.com/get_video_info?' + data video_info_webpage = self._download_webpage(video_info_url, video_id, note=False, errnote='unable to download video info webpage') @@ -1153,7 +1074,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): else: age_gate = False for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']: - video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' + video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' % (video_id, el_type)) video_info_webpage = self._download_webpage(video_info_url, video_id, note=False, @@ -1163,9 +1084,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): break if 'token' not in video_info: if 'reason' in video_info: - raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True) + raise ExtractorError( + u'YouTube said: %s' % video_info['reason'][0], + expected=True, video_id=video_id) else: - raise ExtractorError(u'"token" parameter not in video info for unknown reason') + raise ExtractorError( + u'"token" parameter not in video info for unknown reason', + video_id=video_id) if 'view_count' in video_info: view_count = int(video_info['view_count'][0]) @@ -1194,7 +1119,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # title if 'title' in video_info: - video_title = compat_urllib_parse.unquote_plus(video_info['title'][0]) + video_title = video_info['title'][0] else: self._downloader.report_warning(u'Unable to extract video title') video_title = u'_' @@ -1213,11 +1138,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # upload date upload_date = None - mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL) + mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage) + if mobj is None: + mobj = re.search( + 
r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>', + video_webpage) if mobj is not None: upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) upload_date = unified_strdate(upload_date) + m_cat_container = get_element_by_id("eow-category", video_webpage) + if m_cat_container: + category = self._html_search_regex( + r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category', + default=None) + video_categories = None if category is None else [category] + else: + video_categories = None + # description video_description = get_element_by_id("eow-description", video_webpage) if video_description: @@ -1268,11 +1206,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # Decide which formats to download try: - mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage) + mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage) if not mobj: raise ValueError('Could not find vevo ID') - info = json.loads(mobj.group(1)) - args = info['args'] + json_code = uppercase_escape(mobj.group(1)) + ytplayer_config = json.loads(json_code) + args = ytplayer_config['args'] # Easy way to know if the 's' value is in url_encoded_fmt_stream_map # this signatures are encrypted if 'url_encoded_fmt_stream_map' not in args: @@ -1365,12 +1304,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') # Look for the DASH manifest - dash_manifest_url_lst = video_info.get('dashmpd') - if (dash_manifest_url_lst and dash_manifest_url_lst[0] and - self._downloader.params.get('youtube_include_dash_manifest', False)): + if (self._downloader.params.get('youtube_include_dash_manifest', False)): try: + # The DASH manifest used needs to be the one from the original video_webpage. + # The one found in get_video_info seems to be using different signatures. 
+ # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage. + # Luckily, it seems, this case uses some kind of default signature (len == 86), so the + # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here. + if age_gate: + dash_manifest_url = video_info.get('dashmpd')[0] + else: + dash_manifest_url = ytplayer_config['args']['dashmpd'] + def decrypt_sig(mobj): + s = mobj.group(1) + dec_s = self._decrypt_signature(s, video_id, player_url, age_gate) + return '/signature/%s' % dec_s + dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url) dash_doc = self._download_xml( - dash_manifest_url_lst[0], video_id, + dash_manifest_url, video_id, note=u'Downloading DASH manifest', errnote=u'Could not download DASH manifest') for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'): @@ -1411,11 +1362,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'title': video_title, 'thumbnail': video_thumbnail, 'description': video_description, + 'categories': video_categories, 'subtitles': video_subtitles, 'duration': video_duration, 'age_limit': 18 if age_gate else 0, 'annotations': video_annotations, - 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id, + 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id, 'view_count': view_count, 'like_count': like_count, 'dislike_count': dislike_count, @@ -1442,9 +1394,9 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): | ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,}) )""" - _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s' + _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' _MORE_PAGES_INDICATOR = r'data-link-type="next"' - _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)' + _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)' IE_NAME = u'youtube:playlist' def 
_real_initialize(self): @@ -1459,11 +1411,13 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): # the id of the playlist is just 'RD' + video_id url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id) webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix') - title_span = (get_element_by_attribute('class', 'title long-title', webpage) or - get_element_by_attribute('class', 'title ', webpage)) + search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage) + title_span = (search_title('playlist-title') or + search_title('title long-title') or search_title('title')) title = clean_html(title_span) - video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s' % re.escape(playlist_id) - ids = orderedSet(re.findall(video_re, webpage)) + video_re = r'''(?x)data-video-username=".*?".*? + href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id) + ids = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL)) url_results = self._ids_to_results(ids) return self.playlist_result(url_results, playlist_id, title) @@ -1483,7 +1437,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id) return self.url_result(video_id, 'Youtube', video_id=video_id) else: - self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id)) + self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) if playlist_id.startswith('RD'): # Mixes require a custom extraction process @@ -1492,29 +1446,41 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): raise ExtractorError(u'For downloading YouTube.com top lists, use ' u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True) + url = self._TEMPLATE_URL % playlist_id + page = self._download_webpage(url, 
playlist_id) + more_widget_html = content_html = page + + # Check if the playlist exists or is private + if re.search(r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>', page) is not None: + raise ExtractorError( + u'The playlist doesn\'t exist or is private, use --username or ' + '--netrc to access it.', + expected=True) + # Extract the video ids from the playlist pages ids = [] for page_num in itertools.count(1): - url = self._TEMPLATE_URL % (playlist_id, page_num) - page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num) - matches = re.finditer(self._VIDEO_RE, page) + matches = re.finditer(self._VIDEO_RE, content_html) # We remove the duplicates and the link with index 0 # (it's not the first video of the playlist) new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0') ids.extend(new_ids) - if re.search(self._MORE_PAGES_INDICATOR, page) is None: + mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html) + if not mobj: break - try: - playlist_title = self._og_search_title(page) - except RegexNotFoundError: - self.report_warning( - u'Playlist page is missing OpenGraph title, falling back ...', - playlist_id) - playlist_title = self._html_search_regex( - r'<h1 class="pl-header-title">(.*?)</h1>', page, u'title') + more = self._download_json( + 'https://youtube.com/%s' % mobj.group('more'), playlist_id, + 'Downloading page #%s' % page_num, + transform_source=uppercase_escape) + content_html = more['content_html'] + more_widget_html = more['load_more_widget_html'] + + playlist_title = self._html_search_regex( + r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>', + page, u'title') url_results = self._ids_to_results(ids) return self.playlist_result(url_results, playlist_id, playlist_title) @@ -1610,7 +1576,7 @@ class YoutubeChannelIE(InfoExtractor): class YoutubeUserIE(InfoExtractor): IE_DESC = u'YouTube.com user videos (URL or "ytuser" 
keyword)' - _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)' + _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)' _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s' _GDATA_PAGE_SIZE = 50 _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json' @@ -1672,7 +1638,7 @@ class YoutubeUserIE(InfoExtractor): class YoutubeSearchIE(SearchInfoExtractor): IE_DESC = u'YouTube.com searches' - _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc' + _API_URL = u'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc' _MAX_RESULTS = 1000 IE_NAME = u'youtube:search' _SEARCH_KEY = 'ytsearch' @@ -1683,9 +1649,12 @@ class YoutubeSearchIE(SearchInfoExtractor): video_ids = [] pagenum = 0 limit = n + PAGE_SIZE = 50 - while (50 * pagenum) < limit: - result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1) + while (PAGE_SIZE * pagenum) < limit: + result_url = self._API_URL % ( + compat_urllib_parse.quote_plus(query.encode('utf-8')), + (PAGE_SIZE * pagenum) + 1) data_json = self._download_webpage( result_url, video_id=u'query "%s"' % query, note=u'Downloading page %s' % (pagenum + 1), @@ -1709,12 +1678,50 @@ class YoutubeSearchIE(SearchInfoExtractor): for video_id in video_ids] return self.playlist_result(videos, query) + class YoutubeSearchDateIE(YoutubeSearchIE): IE_NAME = YoutubeSearchIE.IE_NAME + ':date' _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published' _SEARCH_KEY = 'ytsearchdate' IE_DESC = u'YouTube.com searches, newest videos first' + +class YoutubeSearchURLIE(InfoExtractor): + IE_DESC = u'YouTube.com search URLs' + IE_NAME 
= u'youtube:search_url' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + query = compat_urllib_parse.unquote_plus(mobj.group('query')) + + webpage = self._download_webpage(url, query) + result_code = self._search_regex( + r'(?s)<ol id="search-results"(.*?)</ol>', webpage, u'result HTML') + + part_codes = re.findall( + r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code) + entries = [] + for part_code in part_codes: + part_title = self._html_search_regex( + r'(?s)title="([^"]+)"', part_code, 'item title', fatal=False) + part_url_snippet = self._html_search_regex( + r'(?s)href="([^"]+)"', part_code, 'item URL') + part_url = compat_urlparse.urljoin( + 'https://www.youtube.com/', part_url_snippet) + entries.append({ + '_type': 'url', + 'url': part_url, + 'title': part_title, + }) + + return { + '_type': 'playlist', + 'entries': entries, + 'title': query, + } + + class YoutubeShowIE(InfoExtractor): IE_DESC = u'YouTube.com (multi-season) shows' _VALID_URL = r'https?://www\.youtube\.com/show/(.*)' @@ -1758,23 +1765,25 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): feed_entries = [] paging = 0 for i in itertools.count(1): - info = self._download_webpage(self._FEED_TEMPLATE % paging, + info = self._download_json(self._FEED_TEMPLATE % paging, u'%s feed' % self._FEED_NAME, u'Downloading page %s' % i) - info = json.loads(info) - feed_html = info['feed_html'] + feed_html = info.get('feed_html') or info.get('content_html') m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html) ids = orderedSet(m.group(1) for m in m_ids) feed_entries.extend( self.url_result(video_id, 'Youtube', video_id=video_id) for video_id in ids) - if info['paging'] is None: + mobj = re.search( + r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)', + feed_html) + if mobj is None: break - paging = info['paging'] + paging = mobj.group('paging') return 
self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE) class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): - IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)' + IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' _FEED_NAME = 'subscriptions' _PLAYLIST_TITLE = u'Youtube Subscriptions' @@ -1815,7 +1824,7 @@ class YoutubeTruncatedURLIE(InfoExtractor): IE_NAME = 'youtube:truncated_url' IE_DESC = False # Do not list _VALID_URL = r'''(?x) - (?:https?://)?[^/]+/watch\?feature=[a-z_]+$| + (?:https?://)?[^/]+/watch\?(?:feature=[a-z_]+)?$| (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$ ''' diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 829f002..3b1ac4e 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -1,4 +1,5 @@ # coding: utf-8 +from __future__ import unicode_literals import re @@ -13,52 +14,42 @@ class ZDFIE(InfoExtractor): _VALID_URL = r'^https?://www\.zdf\.de/ZDFmediathek(?P<hash>#)?/(.*beitrag/(?:video/)?)(?P<video_id>[0-9]+)(?:/[^/?]+)?(?:\?.*)?' _TEST = { - u"url": u"http://www.zdf.de/ZDFmediathek/beitrag/video/2037704/ZDFspezial---Ende-des-Machtpokers--?bc=sts;stt", - u"file": u"2037704.webm", - u"info_dict": { - u"upload_date": u"20131127", - u"description": u"Union und SPD haben sich auf einen Koalitionsvertrag geeinigt. Aber was bedeutet das für die Bürger? Sehen Sie hierzu das ZDFspezial \"Ende des Machtpokers - Große Koalition für Deutschland\".", - u"uploader": u"spezial", - u"title": u"ZDFspezial - Ende des Machtpokers" + 'url': 'http://www.zdf.de/ZDFmediathek/beitrag/video/2037704/ZDFspezial---Ende-des-Machtpokers--?bc=sts;stt', + 'info_dict': { + 'id': '2037704', + 'ext': 'webm', + 'title': 'ZDFspezial - Ende des Machtpokers', + 'description': 'Union und SPD haben sich auf einen Koalitionsvertrag geeinigt. 
Aber was bedeutet das für die Bürger? Sehen Sie hierzu das ZDFspezial "Ende des Machtpokers - Große Koalition für Deutschland".', + 'duration': 1022, + 'uploader': 'spezial', + 'uploader_id': '225948', + 'upload_date': '20131127', }, - u"skip": u"Videos on ZDF.de are depublicised in short order", + 'skip': 'Videos on ZDF.de are depublicised in short order', } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('video_id') - xml_url = u'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id + xml_url = 'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id doc = self._download_xml( xml_url, video_id, - note=u'Downloading video info', - errnote=u'Failed to download video info') + note='Downloading video info', + errnote='Failed to download video info') title = doc.find('.//information/title').text description = doc.find('.//information/detail').text + duration = int(doc.find('.//details/lengthSec').text) uploader_node = doc.find('.//details/originChannelTitle') uploader = None if uploader_node is None else uploader_node.text - duration_str = doc.find('.//details/length').text - duration_m = re.match(r'''(?x)^ - (?P<hours>[0-9]{2}) - :(?P<minutes>[0-9]{2}) - :(?P<seconds>[0-9]{2}) - (?:\.(?P<ms>[0-9]+)?) 
- ''', duration_str) - duration = ( - ( - (int(duration_m.group('hours')) * 60 * 60) + - (int(duration_m.group('minutes')) * 60) + - int(duration_m.group('seconds')) - ) - if duration_m - else None - ) + uploader_id_node = doc.find('.//details/originChannelId') + uploader_id = None if uploader_id_node is None else uploader_id_node.text upload_date = unified_strdate(doc.find('.//details/airtime').text) def xml_to_format(fnode): video_url = fnode.find('url').text - is_available = u'http://www.metafilegenerator' not in video_url + is_available = 'http://www.metafilegenerator' not in video_url format_id = fnode.attrib['basetype'] format_m = re.match(r'''(?x) @@ -71,22 +62,28 @@ class ZDFIE(InfoExtractor): quality = fnode.find('./quality').text abr = int(fnode.find('./audioBitrate').text) // 1000 - vbr = int(fnode.find('./videoBitrate').text) // 1000 + vbr_node = fnode.find('./videoBitrate') + vbr = None if vbr_node is None else int(vbr_node.text) // 1000 - format_note = u'' + width_node = fnode.find('./width') + width = None if width_node is None else int_or_none(width_node.text) + height_node = fnode.find('./height') + height = None if height_node is None else int_or_none(height_node.text) + + format_note = '' if not format_note: format_note = None return { - 'format_id': format_id + u'-' + quality, + 'format_id': format_id + '-' + quality, 'url': video_url, 'ext': ext, 'acodec': format_m.group('acodec'), 'vcodec': format_m.group('vcodec'), 'abr': abr, 'vbr': vbr, - 'width': int_or_none(fnode.find('./width').text), - 'height': int_or_none(fnode.find('./height').text), + 'width': width, + 'height': height, 'filesize': int_or_none(fnode.find('./filesize').text), 'format_note': format_note, 'protocol': proto, @@ -103,9 +100,10 @@ class ZDFIE(InfoExtractor): return { 'id': video_id, 'title': title, - 'formats': formats, 'description': description, - 'uploader': uploader, 'duration': duration, + 'uploader': uploader, + 'uploader_id': uploader_id, 'upload_date': 
upload_date, - } + 'formats': formats, + } \ No newline at end of file diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py new file mode 100644 index 0000000..449482d --- /dev/null +++ b/youtube_dl/jsinterp.py @@ -0,0 +1,116 @@ +from __future__ import unicode_literals + +import re + +from .utils import ( + ExtractorError, +) + + +class JSInterpreter(object): + def __init__(self, code): + self.code = code + self._functions = {} + + def interpret_statement(self, stmt, local_vars, allow_recursion=20): + if allow_recursion < 0: + raise ExtractorError('Recursion limit reached') + + if stmt.startswith('var '): + stmt = stmt[len('var '):] + ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' + + r'=(?P<expr>.*)$', stmt) + if ass_m: + if ass_m.groupdict().get('index'): + def assign(val): + lvar = local_vars[ass_m.group('out')] + idx = self.interpret_expression( + ass_m.group('index'), local_vars, allow_recursion) + assert isinstance(idx, int) + lvar[idx] = val + return val + expr = ass_m.group('expr') + else: + def assign(val): + local_vars[ass_m.group('out')] = val + return val + expr = ass_m.group('expr') + elif stmt.startswith('return '): + assign = lambda v: v + expr = stmt[len('return '):] + else: + raise ExtractorError( + 'Cannot determine left side of statement in %r' % stmt) + + v = self.interpret_expression(expr, local_vars, allow_recursion) + return assign(v) + + def interpret_expression(self, expr, local_vars, allow_recursion): + if expr.isdigit(): + return int(expr) + + if expr.isalpha(): + return local_vars[expr] + + m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr) + if m: + member = m.group('member') + val = local_vars[m.group('in')] + if member == 'split("")': + return list(val) + if member == 'join("")': + return u''.join(val) + if member == 'length': + return len(val) + if member == 'reverse()': + return val[::-1] + slice_m = re.match(r'slice\((?P<idx>.*)\)', member) + if slice_m: + idx = self.interpret_expression( + 
slice_m.group('idx'), local_vars, allow_recursion - 1) + return val[idx:] + + m = re.match( + r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr) + if m: + val = local_vars[m.group('in')] + idx = self.interpret_expression( + m.group('idx'), local_vars, allow_recursion - 1) + return val[idx] + + m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr) + if m: + a = self.interpret_expression( + m.group('a'), local_vars, allow_recursion) + b = self.interpret_expression( + m.group('b'), local_vars, allow_recursion) + return a % b + + m = re.match( + r'^(?P<func>[a-zA-Z$]+)\((?P<args>[a-z0-9,]+)\)$', expr) + if m: + fname = m.group('func') + if fname not in self._functions: + self._functions[fname] = self.extract_function(fname) + argvals = [int(v) if v.isdigit() else local_vars[v] + for v in m.group('args').split(',')] + return self._functions[fname](argvals) + raise ExtractorError('Unsupported JS expression %r' % expr) + + def extract_function(self, funcname): + func_m = re.search( + (r'(?:function %s|%s\s*=\s*function)' % ( + re.escape(funcname), re.escape(funcname))) + + r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}', + self.code) + if func_m is None: + raise ExtractorError('Could not find JS function %r' % funcname) + argnames = func_m.group('args').split(',') + + def resf(args): + local_vars = dict(zip(argnames, args)) + for stmt in func_m.group('code').split(';'): + res = self.interpret_statement(stmt, local_vars) + return res + return resf + diff --git a/youtube_dl/postprocessor/__init__.py b/youtube_dl/postprocessor/__init__.py index 7f19f71..08e6ddd 100644 --- a/youtube_dl/postprocessor/__init__.py +++ b/youtube_dl/postprocessor/__init__.py @@ -1,5 +1,7 @@ +from .atomicparsley import AtomicParsleyPP from .ffmpeg import ( + FFmpegAudioFixPP, FFmpegMergerPP, FFmpegMetadataPP, FFmpegVideoConvertor, @@ -9,6 +11,8 @@ from .ffmpeg import ( from .xattrpp import XAttrMetadataPP __all__ = [ + 'AtomicParsleyPP', + 'FFmpegAudioFixPP', 'FFmpegMergerPP', 'FFmpegMetadataPP', 
'FFmpegVideoConvertor', diff --git a/youtube_dl/postprocessor/atomicparsley.py b/youtube_dl/postprocessor/atomicparsley.py new file mode 100644 index 0000000..765b2d9 --- /dev/null +++ b/youtube_dl/postprocessor/atomicparsley.py @@ -0,0 +1,56 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + + +import os +import subprocess + +from .common import PostProcessor + +from ..utils import ( + check_executable, + compat_urlretrieve, + encodeFilename, + PostProcessingError, + prepend_extension, + shell_quote +) + + +class AtomicParsleyPPError(PostProcessingError): + pass + + +class AtomicParsleyPP(PostProcessor): + def run(self, info): + if not check_executable('AtomicParsley', ['-v']): + raise AtomicParsleyPPError('AtomicParsley was not found. Please install.') + + filename = info['filepath'] + temp_filename = prepend_extension(filename, 'temp') + temp_thumbnail = prepend_extension(filename, 'thumb') + + if not info.get('thumbnail'): + raise AtomicParsleyPPError('Thumbnail was not found. 
Nothing to do.') + + compat_urlretrieve(info['thumbnail'], temp_thumbnail) + + cmd = ['AtomicParsley', filename, '--artwork', temp_thumbnail, '-o', temp_filename] + + self._downloader.to_screen('[atomicparsley] Adding thumbnail to "%s"' % filename) + + if self._downloader.params.get('verbose', False): + self._downloader.to_screen('[debug] AtomicParsley command line: %s' % shell_quote(cmd)) + + p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = p.communicate() + + if p.returncode != 0: + msg = stderr.decode('utf-8', 'replace').strip() + raise AtomicParsleyPPError(msg) + + os.remove(encodeFilename(filename)) + os.remove(encodeFilename(temp_thumbnail)) + os.rename(encodeFilename(temp_filename), encodeFilename(filename)) + + return True, info diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index c22f2cd..45328ed 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -9,6 +9,7 @@ from .common import AudioConversionError, PostProcessor from ..utils import ( check_executable, compat_subprocess_get_DEVNULL, + encodeArgument, encodeFilename, PostProcessingError, prepend_extension, @@ -48,13 +49,13 @@ class FFmpegPostProcessor(PostProcessor): for path in input_paths: files_cmd.extend(['-i', encodeFilename(path, True)]) cmd = ([self._get_executable(), '-y'] + files_cmd - + opts + + + [encodeArgument(o) for o in opts] + [encodeFilename(self._ffmpeg_filename_argument(out_path), True)]) if self._downloader.params.get('verbose', False): self._downloader.to_screen(u'[debug] ffmpeg command line: %s' % shell_quote(cmd)) p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - stdout,stderr = p.communicate() + stdout, stderr = p.communicate() if p.returncode != 0: stderr = stderr.decode('utf-8', 'replace') msg = stderr.strip().split('\n')[-1] @@ -464,7 +465,11 @@ class FFmpegMetadataPP(FFmpegPostProcessor): filename = info['filepath'] temp_filename = 
prepend_extension(filename, 'temp') - options = ['-c', 'copy'] + if info['ext'] == u'm4a': + options = ['-vn', '-acodec', 'copy'] + else: + options = ['-c', 'copy'] + for (name, value) in metadata.items(): options.extend(['-metadata', '%s=%s' % (name, value)]) @@ -483,3 +488,17 @@ class FFmpegMergerPP(FFmpegPostProcessor): self.run_ffmpeg_multiple_files(info['__files_to_merge'], filename, args) return True, info + +class FFmpegAudioFixPP(FFmpegPostProcessor): + def run(self, info): + filename = info['filepath'] + temp_filename = prepend_extension(filename, 'temp') + + options = ['-vn', '-acodec', 'copy'] + self._downloader.to_screen(u'[ffmpeg] Fixing audio file "%s"' % filename) + self.run_ffmpeg(filename, temp_filename, options) + + os.remove(encodeFilename(filename)) + os.rename(encodeFilename(temp_filename), encodeFilename(filename)) + + return True, info diff --git a/youtube_dl/postprocessor/xattrpp.py b/youtube_dl/postprocessor/xattrpp.py index 1897924..f694094 100644 --- a/youtube_dl/postprocessor/xattrpp.py +++ b/youtube_dl/postprocessor/xattrpp.py @@ -6,6 +6,7 @@ from .common import PostProcessor from ..utils import ( check_executable, hyphenate_date, + subprocess_check_output ) @@ -57,7 +58,7 @@ class XAttrMetadataPP(PostProcessor): elif user_has_xattr: cmd = ['xattr', '-w', key, value, path] - subprocess.check_output(cmd) + subprocess_check_output(cmd) else: # On Unix, and can't find pyxattr, setfattr, or xattr. 
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 057cd20..b97e62a 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1,10 +1,14 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- +import calendar +import codecs +import contextlib import ctypes import datetime import email.utils import errno +import getpass import gzip import itertools import io @@ -21,6 +25,7 @@ import struct import subprocess import sys import traceback +import xml.etree.ElementTree import zlib try: @@ -174,6 +179,11 @@ try: except NameError: compat_chr = chr +try: + from xml.etree.ElementTree import ParseError as compat_xml_parse_error +except ImportError: # Python 2.6 + from xml.parsers.expat import ExpatError as compat_xml_parse_error + def compat_ord(c): if type(c) is int: return c else: return ord(c) @@ -493,13 +503,13 @@ def orderedSet(iterable): res.append(el) return res + def unescapeHTML(s): - """ - @param s a string - """ - assert type(s) == type(u'') + if s is None: + return None + assert type(s) == compat_str - result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s) + result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s) return result @@ -531,6 +541,15 @@ def encodeFilename(s, for_subprocess=False): return s.encode(encoding, 'ignore') +def encodeArgument(s): + if not isinstance(s, compat_str): + # Legacy code that uses byte strings + # Uncomment the following line after fixing all post processors + #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s)) + s = s.decode('ascii') + return encodeFilename(s, True) + + def decodeOption(optval): if optval is None: return optval @@ -585,13 +604,15 @@ def make_HTTPS_handler(opts_no_check_certificate, **kwargs): class ExtractorError(Exception): """Error during info extraction.""" - def __init__(self, msg, tb=None, expected=False, cause=None): + def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None): """ tb, if given, is the original traceback (so that it can be 
printed out). If expected is set, this is a normal error message and most likely not a bug in youtube-dl. """ if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError): expected = True + if video_id is not None: + msg = video_id + ': ' + msg if not expected: msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.' super(ExtractorError, self).__init__(msg) @@ -599,6 +620,7 @@ class ExtractorError(Exception): self.traceback = tb self.exc_info = sys.exc_info() # preserve original exception self.cause = cause + self.video_id = video_id def format_traceback(self): if self.traceback is None: @@ -753,8 +775,37 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): https_response = http_response +def parse_iso8601(date_str): + """ Return a UNIX timestamp from the given date """ + + if date_str is None: + return None + + m = re.search( + r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$', + date_str) + if not m: + timezone = datetime.timedelta() + else: + date_str = date_str[:-len(m.group(0))] + if not m.group('sign'): + timezone = datetime.timedelta() + else: + sign = 1 if m.group('sign') == '+' else -1 + timezone = datetime.timedelta( + hours=sign * int(m.group('hours')), + minutes=sign * int(m.group('minutes'))) + + dt = datetime.datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S') - timezone + return calendar.timegm(dt.timetuple()) + + def unified_strdate(date_str): """Return a string with the date in the format YYYYMMDD""" + + if date_str is None: + return None + upload_date = None #Replace commas date_str = date_str.replace(',', ' ') @@ -766,14 +817,17 @@ def unified_strdate(date_str): '%B %d %Y', '%b %d %Y', '%Y-%m-%d', + '%d.%m.%Y', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S', '%Y-%m-%d %H:%M:%S', '%d.%m.%Y %H:%M', + '%d.%m.%Y %H.%M', '%Y-%m-%dT%H:%M:%SZ', 
'%Y-%m-%dT%H:%M:%S.%fZ', '%Y-%m-%dT%H:%M:%S.%f0Z', '%Y-%m-%dT%H:%M:%S', + '%Y-%m-%dT%H:%M:%S.%f', '%Y-%m-%dT%H:%M', ] for expression in format_expressions: @@ -869,25 +923,97 @@ def platform_name(): return res -def write_string(s, out=None): +def _windows_write_string(s, out): + """ Returns True if the string was written using special methods, + False if it has yet to be written out.""" + # Adapted from http://stackoverflow.com/a/3259271/35070 + + import ctypes + import ctypes.wintypes + + WIN_OUTPUT_IDS = { + 1: -11, + 2: -12, + } + + try: + fileno = out.fileno() + except AttributeError: + # If the output stream doesn't have a fileno, it's virtual + return False + if fileno not in WIN_OUTPUT_IDS: + return False + + GetStdHandle = ctypes.WINFUNCTYPE( + ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)( + ("GetStdHandle", ctypes.windll.kernel32)) + h = GetStdHandle(WIN_OUTPUT_IDS[fileno]) + + WriteConsoleW = ctypes.WINFUNCTYPE( + ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR, + ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD), + ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32)) + written = ctypes.wintypes.DWORD(0) + + GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32)) + FILE_TYPE_CHAR = 0x0002 + FILE_TYPE_REMOTE = 0x8000 + GetConsoleMode = ctypes.WINFUNCTYPE( + ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, + ctypes.POINTER(ctypes.wintypes.DWORD))( + ("GetConsoleMode", ctypes.windll.kernel32)) + INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value + + def not_a_console(handle): + if handle == INVALID_HANDLE_VALUE or handle is None: + return True + return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR + or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0) + + if not_a_console(h): + return False + + def next_nonbmp_pos(s): + try: + return next(i for i, c in enumerate(s) if ord(c) > 0xffff) + except StopIteration: + 
return len(s) + + while s: + count = min(next_nonbmp_pos(s), 1024) + + ret = WriteConsoleW( + h, s, count if count else 2, ctypes.byref(written), None) + if ret == 0: + raise OSError('Failed to write string') + if not count: # We just wrote a non-BMP character + assert written.value == 2 + s = s[1:] + else: + assert written.value > 0 + s = s[written.value:] + return True + + +def write_string(s, out=None, encoding=None): if out is None: out = sys.stderr assert type(s) == compat_str + if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'): + if _windows_write_string(s, out): + return + if ('b' in getattr(out, 'mode', '') or sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr - s = s.encode(preferredencoding(), 'ignore') - try: + byt = s.encode(encoding or preferredencoding(), 'ignore') + out.write(byt) + elif hasattr(out, 'buffer'): + enc = encoding or getattr(out, 'encoding', None) or preferredencoding() + byt = s.encode(enc, 'ignore') + out.buffer.write(byt) + else: out.write(s) - except UnicodeEncodeError: - # In Windows shells, this can fail even when the codec is just charmap!? 
- # See https://wiki.python.org/moin/PrintFails#Issue - if sys.platform == 'win32' and hasattr(out, 'encoding'): - s = s.encode(out.encoding, 'ignore').decode(out.encoding) - out.write(s) - else: - raise - out.flush() @@ -1111,11 +1237,11 @@ def setproctitle(title): libc = ctypes.cdll.LoadLibrary("libc.so.6") except OSError: return - title = title - buf = ctypes.create_string_buffer(len(title) + 1) - buf.value = title.encode('utf-8') + title_bytes = title.encode('utf-8') + buf = ctypes.create_string_buffer(len(title_bytes)) + buf.value = title_bytes try: - libc.prctl(15, ctypes.byref(buf), 0, 0, 0) + libc.prctl(15, buf, 0, 0, 0) except AttributeError: return # Strange libc, just skip this @@ -1136,8 +1262,15 @@ class HEADRequest(compat_urllib_request.Request): return "HEAD" -def int_or_none(v, scale=1): - return v if v is None else (int(v) // scale) +def int_or_none(v, scale=1, default=None, get_attr=None): + if get_attr: + if v is not None: + v = getattr(v, get_attr, None) + return default if v is None else (int(v) // scale) + + +def float_or_none(v, scale=1, default=None): + return default if v is None else (float(v) / scale) def parse_duration(s): @@ -1145,7 +1278,7 @@ def parse_duration(s): return None m = re.match( - r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?$', s) + r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s) if not m: return None res = int(m.group('secs')) @@ -1219,9 +1352,11 @@ class PagedList(object): def uppercase_escape(s): + unicode_escape = codecs.getdecoder('unicode_escape') return re.sub( - r'\\U([0-9a-fA-F]{8})', - lambda m: compat_chr(int(m.group(1), base=16)), s) + r'\\U[0-9a-fA-F]{8}', + lambda m: unicode_escape(m.group(0))[0], + s) try: struct.pack(u'!I', 0) @@ -1239,3 +1374,80 @@ except TypeError: else: struct_pack = struct.pack struct_unpack = struct.unpack + + +def read_batch_urls(batch_fd): + def fixup(url): + if not isinstance(url, compat_str): + url = 
url.decode('utf-8', 'replace') + BOM_UTF8 = u'\xef\xbb\xbf' + if url.startswith(BOM_UTF8): + url = url[len(BOM_UTF8):] + url = url.strip() + if url.startswith(('#', ';', ']')): + return False + return url + + with contextlib.closing(batch_fd) as fd: + return [url for url in map(fixup, fd) if url] + + +def urlencode_postdata(*args, **kargs): + return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii') + + +def parse_xml(s): + class TreeBuilder(xml.etree.ElementTree.TreeBuilder): + def doctype(self, name, pubid, system): + pass # Ignore doctypes + + parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder()) + kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {} + return xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs) + + +if sys.version_info < (3, 0) and sys.platform == 'win32': + def compat_getpass(prompt, *args, **kwargs): + if isinstance(prompt, compat_str): + prompt = prompt.encode(preferredencoding()) + return getpass.getpass(prompt, *args, **kwargs) +else: + compat_getpass = getpass.getpass + + +US_RATINGS = { + 'G': 0, + 'PG': 10, + 'PG-13': 13, + 'R': 16, + 'NC': 18, +} + + +def strip_jsonp(code): + return re.sub(r'(?s)^[a-zA-Z_]+\s*\(\s*(.*)\);\s*?\s*$', r'\1', code) + + +def qualities(quality_ids): + """ Get a numeric quality value out of a list of possible values """ + def q(qid): + try: + return quality_ids.index(qid) + except ValueError: + return -1 + return q + + +DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s' + +try: + subprocess_check_output = subprocess.check_output +except AttributeError: + def subprocess_check_output(*args, **kwargs): + assert 'input' not in kwargs + p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs) + output, _ = p.communicate() + ret = p.poll() + if ret: + raise subprocess.CalledProcessError(ret, p.args, output=output) + return output diff --git a/youtube_dl/version.py b/youtube_dl/version.py index a9fead9..6fe7c7b 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 
+1,2 @@ -__version__ = '2014.02.17' +__version__ = '2014.06.07'