From: Francois Marier Date: Sun, 24 Sep 2017 23:07:45 +0000 (-0700) Subject: New upstream version 2017.09.24 X-Git-Url: https://git.rapsys.eu/youtubedl/commitdiff_plain/00368b4c3a5d4e909e1b7ecfc4030bf28da020f3?hp=--cc New upstream version 2017.09.24 --- 00368b4c3a5d4e909e1b7ecfc4030bf28da020f3 diff --git a/ChangeLog b/ChangeLog index 1637876..da60c1b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,509 @@ +version 2017.09.24 + +Core ++ [options] Accept lrc as a subtitle conversion target format (#14292) +* [utils] Fix handling raw TTML subtitles (#14191) + +Extractors +* [24video] Fix timestamp extraction and make non fatal (#14295) ++ [24video] Add support for 24video.adult (#14295) ++ [kakao] Add support for tv.kakao.com (#12298, #14007) ++ [twitter] Add support for URLs without user id (#14270) ++ [americastestkitchen] Add support for americastestkitchen.com (#10764, + #13996) +* [generic] Fix support for multiple HTML5 videos on one page (#14080) +* [mixcloud] Fix extraction (#14088, #14132) ++ [lynda] Add support for educourse.ga (#14286) +* [beeg] Fix extraction (#14275) +* [nbcsports:vplayer] Correct theplatform URL (#13873) +* [twitter] Fix duration extraction (#14141) +* [tvplay] Bypass geo restriction ++ [heise] Add support for YouTube embeds (#14109) ++ [popcorntv] Add support for popcorntv.it (#5914, #14211) +* [viki] Update app data (#14181) +* [morningstar] Relax URL regular expression (#14222) +* [openload] Fix extraction (#14225, #14257) +* [noovo] Fix extraction (#14214) +* [dailymotion:playlist] Relax URL regular expression (#14219) ++ [twitch] Add support for go.twitch.tv URLs (#14215) +* [vgtv] Relax URL regular expression (#14223) + + +version 2017.09.15 + +Core +* [downloader/fragment] Restart inconsistent incomplete fragment downloads + (#13731) +* [YoutubeDL] Download raw subtitles files (#12909, #14191) + +Extractors +* [condenast] Fix extraction (#14196, #14207) ++ [orf] Add support for f4m stories +* [tv4] Relax URL regular expression 
(#14206) +* [animeondemand] Bypass geo restriction ++ [animeondemand] Add support for flash videos (#9944) + + +version 2017.09.11 + +Extractors +* [rutube:playlist] Fix suitable (#14166) + + +version 2017.09.10 + +Core ++ [utils] Introduce bool_or_none +* [YoutubeDL] Ensure dir existence for each requested format (#14116) + +Extractors +* [fox] Fix extraction (#14147) +* [rutube] Use bool_or_none +* [rutube] Rework and generalize playlist extractors (#13565) ++ [rutube:playlist] Add support for playlists (#13534, #13565) ++ [radiocanada] Add fallback for title extraction (#14145) +* [vk] Use dedicated YouTube embeds extraction routine +* [vice] Use dedicated YouTube embeds extraction routine +* [cracked] Use dedicated YouTube embeds extraction routine +* [chilloutzone] Use dedicated YouTube embeds extraction routine +* [abcnews] Use dedicated YouTube embeds extraction routine +* [youtube] Separate methods for embeds extraction +* [redtube] Fix formats extraction (#14122) +* [arte] Relax unavailability check (#14112) ++ [manyvids] Add support for preview videos from manyvids.com (#14053, #14059) +* [vidme:user] Relax URL regular expression (#14054) +* [bpb] Fix extraction (#14043, #14086) +* [soundcloud] Fix download URL with private tracks (#14093) +* [aliexpress:live] Add support for live.aliexpress.com (#13698, #13707) +* [viidea] Capture and output lecture error message (#14099) +* [radiocanada] Skip unsupported platforms (#14100) + + +version 2017.09.02 + +Extractors +* [youtube] Force old layout for each webpage (#14068, #14072, #14074, #14076, + #14077, #14079, #14082, #14083, #14094, #14095, #14096) +* [youtube] Fix upload date extraction (#14065) ++ [charlierose] Add support for episodes (#14062) ++ [bbccouk] Add support for w-prefixed ids (#14056) +* [googledrive] Extend URL regular expression (#9785) ++ [googledrive] Add support for source format (#14046) +* [pornhd] Fix extraction (#14005) + + +version 2017.08.27.1 + +Extractors + +* [youtube] Fix 
extraction with --youtube-skip-dash-manifest enabled (#14037) + + +version 2017.08.27 + +Core ++ [extractor/common] Extract height and format id for HTML5 videos (#14034) +* [downloader/http] Rework HTTP downloader (#506, #809, #2849, #4240, #6023, + #8625, #9483) + * Simplify code and split into separate routines to facilitate maintaining + * Make retry mechanism work on errors during actual download not only + during connection establishment phase + * Retry on ECONNRESET and ETIMEDOUT during reading data from network + * Retry on content too short + * Show error description on retry + +Extractors +* [generic] Lower preference for extraction from LD-JSON +* [rai] Fix audio formats extraction (#14024) +* [youtube] Fix controversy videos extraction (#14027, #14029) +* [mixcloud] Fix extraction (#14015, #14020) + + +version 2017.08.23 + +Core ++ [extractor/common] Introduce _parse_xml +* [extractor/common] Make HLS and DASH extraction in _parse_html5_media_entries + non fatal (#13970) +* [utils] Fix unescapeHTML for malformed string like "&a&quot;" (#13935) + +Extractors +* [cbc:watch] Bypass geo restriction (#13993) +* [toutv] Relax DRM check (#13994) ++ [googledrive] Add support for subtitles (#13619, #13638) +* [pornhub] Relax uploader regular expression (#13906, #13975) +* [bandcamp:album] Extract track titles (#13962) ++ [bbccouk] Add support for events URLs (#13893) ++ [liveleak] Support multi-video pages (#6542) ++ [liveleak] Support another liveleak embedding pattern (#13336) +* [cda] Fix extraction (#13935) ++ [laola1tv] Add support for tv.ittf.com (#13965) +* [mixcloud] Fix extraction (#13958, #13974, #13980, #14003) + + +version 2017.08.18 + +Core +* [YoutubeDL] Sanitize byte string format URLs (#13951) ++ [extractor/common] Add support for float durations in _parse_mpd_formats + (#13919) + +Extractors +* [arte] Detect unavailable videos (#13945) +* [generic] Convert redirect URLs to unicode strings (#13951) +* [udemy] Fix paid course detection (#13943) +* 
[pluralsight] Use RPC API for course extraction (#13937) ++ [clippit] Add support for clippituser.tv ++ [qqmusic] Support new URL schemes (#13805) +* [periscope] Renew HLS extraction (#13917) +* [mixcloud] Extract decrypt key + + +version 2017.08.13 + +Core +* [YoutubeDL] Make sure format id is not empty +* [extractor/common] Make _family_friendly_search optional +* [extractor/common] Respect source's type attribute for HTML5 media (#13892) + +Extractors +* [pornhub:playlistbase] Skip videos from drop-down menu (#12819, #13902) ++ [fourtube] Add support pornerbros.com (#6022) ++ [fourtube] Add support porntube.com (#7859, #13901) ++ [fourtube] Add support fux.com +* [limelight] Improve embeds detection (#13895) ++ [reddit] Add support for v.redd.it and reddit.com (#13847) +* [aparat] Extract all formats (#13887) +* [mixcloud] Fix play info decryption (#13885) ++ [generic] Add support for vzaar embeds (#13876) + + +version 2017.08.09 + +Core +* [utils] Skip missing params in cli_bool_option (#13865) + +Extractors +* [xxxymovies] Fix title extraction (#13868) ++ [nick] Add support for nick.com.pl (#13860) +* [mixcloud] Fix play info decryption (#13867) +* [20min] Fix embeds extraction (#13852) +* [dplayit] Fix extraction (#13851) ++ [niconico] Support videos with multiple formats (#13522) ++ [niconico] Support HTML5-only videos (#13806) + + +version 2017.08.06 + +Core +* Use relative paths for DASH fragments (#12990) + +Extractors +* [pluralsight] Fix format selection +- [mpora] Remove extractor (#13826) ++ [voot] Add support for voot.com (#10255, #11644, #11814, #12350, #13218) +* [vlive:channel] Limit number of videos per page to 100 (#13830) +* [podomatic] Extend URL regular expression (#13827) +* [cinchcast] Extend URL regular expression +* [yandexdisk] Relax URL regular expression (#13824) +* [vidme] Extract DASH and HLS formats +- [teamfour] Remove extractor (#13782) +* [pornhd] Fix extraction (#13783) +* [udemy] Fix subtitles extraction (#13812) +* [mlb] 
Extend URL regular expression (#13740, #13773) ++ [pbs] Add support for new URL schema (#13801) +* [nrktv] Update API host (#13796) + + +version 2017.07.30.1 + +Core +* [downloader/hls] Use redirect URL as manifest base (#13755) +* [options] Correctly hide login info from debug outputs (#13696) + +Extractors ++ [watchbox] Add support for watchbox.de (#13739) +- [clipfish] Remove extractor ++ [youjizz] Fix extraction (#13744) ++ [generic] Add support for another ooyala embed pattern (#13727) ++ [ard] Add support for lives (#13771) +* [soundcloud] Update client id ++ [soundcloud:trackstation] Add support for track stations (#13733) +* [svtplay] Use geo verification proxy for API request +* [svtplay] Update API URL (#13767) ++ [yandexdisk] Add support for yadi.sk (#13755) ++ [megaphone] Add support for megaphone.fm +* [amcnetworks] Make rating optional (#12453) +* [cloudy] Fix extraction (#13737) ++ [nickru] Add support for nickelodeon.ru +* [mtv] Improve thumbnail extraction +* [nick] Automate geo-restriction bypass (#13711) +* [niconico] Improve error reporting (#13696) + + +version 2017.07.23 + +Core +* [YoutubeDL] Improve default format specification (#13704) +* [YoutubeDL] Do not override id, extractor and extractor_key for + url_transparent entities +* [extractor/common] Fix playlist_from_matches + +Extractors +* [itv] Fix production id extraction (#13671, #13703) +* [vidio] Make duration non fatal and fix typo +* [mtv] Skip missing video parts (#13690) +* [sportbox:embed] Fix extraction ++ [npo] Add support for npo3.nl URLs (#13695) +* [dramafever] Remove video id from title (#13699) ++ [egghead:lesson] Add support for lessons (#6635) +* [funnyordie] Extract more metadata (#13677) +* [youku:show] Fix playlist extraction (#13248) ++ [dispeak] Recognize sevt subdomain (#13276) +* [adn] Improve error reporting (#13663) +* [crunchyroll] Relax series and season regular expression (#13659) ++ [spiegel:article] Add support for nexx iframe embeds (#13029) ++ 
[nexx:embed] Add support for iframe embeds +* [nexx] Improve JS embed extraction ++ [pearvideo] Add support for pearvideo.com (#13031) + + +version 2017.07.15 + +Core +* [YoutubeDL] Don't expand environment variables in meta fields (#13637) + +Extractors +* [spiegeltv] Delegate extraction to nexx extractor (#13159) ++ [nexx] Add support for nexx.cloud (#10807, #13465) +* [generic] Fix rutube embeds extraction (#13641) +* [karrierevideos] Fix title extraction (#13641) +* [youtube] Don't capture YouTube Red ad for creator meta field (#13621) +* [slideshare] Fix extraction (#13617) ++ [5tv] Add another video URL pattern (#13354, #13606) +* [drtv] Make HLS and HDS extraction non fatal +* [ted] Fix subtitles extraction (#13628, #13629) +* [vine] Make sure the title won't be empty ++ [twitter] Support HLS streams in vmap URLs ++ [periscope] Support pscp.tv URLs in embedded frames +* [twitter] Extract mp4 urls via mobile API (#12726) +* [niconico] Fix authentication error handling (#12486) +* [giantbomb] Extract m3u8 formats (#13626) ++ [vlive:playlist] Add support for playlists (#13613) + + +version 2017.07.09 + +Core ++ [extractor/common] Add support for AMP tags in _parse_html5_media_entries ++ [utils] Support attributes with no values in get_elements_by_attribute + +Extractors ++ [dailymail] Add support for embeds ++ [joj] Add support for joj.sk (#13268) +* [abc.net.au:iview] Extract more formats (#13492, #13489) +* [egghead:course] Fix extraction (#6635, #13370) ++ [cjsw] Add support for cjsw.com (#13525) ++ [eagleplatform] Add support for referrer protected videos (#13557) ++ [eagleplatform] Add support for another embed pattern (#13557) +* [veoh] Extend URL regular expression (#13601) +* [npo:live] Fix live stream id extraction (#13568, #13605) +* [googledrive] Fix height extraction (#13603) ++ [dailymotion] Add support for new layout (#13580) +- [yam] Remove extractor +* [xhamster] Extract all formats and fix duration extraction (#13593) ++ [xhamster] Add support 
for new URL schema (#13593) +* [espn] Extend URL regular expression (#13244, #13549) +* [kaltura] Fix typo in subtitles extraction (#13569) +* [vier] Adapt extraction to redesign (#13575) + + +version 2017.07.02 + +Core +* [extractor/common] Improve _json_ld + +Extractors ++ [thisoldhouse] Add more fallbacks for video id +* [thisoldhouse] Fix video id extraction (#13540, #13541) +* [xfileshare] Extend format regular expression (#13536) +* [ted] Fix extraction (#13535) ++ [tastytrade] Add support for tastytrade.com (#13521) +* [dplayit] Relax video id regular expression (#13524) ++ [generic] Extract more generic metadata (#13527) ++ [bbccouk] Capture and output error message (#13501, #13518) +* [cbsnews] Relax video info regular expression (#13284, #13503) ++ [facebook] Add support for plugin video embeds and multiple embeds (#13493) +* [soundcloud] Switch to https for API requests (#13502) +* [pandatv] Switch to https for API and download URLs ++ [pandatv] Add support for https URLs (#13491) ++ [niconico] Support sp subdomain (#13494) + + +version 2017.06.25 + +Core ++ [adobepass] Add support for DIRECTV NOW (mso ATTOTT) (#13472) +* [YoutubeDL] Skip malformed formats for better extraction robustness + +Extractors ++ [wsj] Add support for barrons.com (#13470) ++ [ign] Add another video id pattern (#13328) ++ [raiplay:live] Add support for live streams (#13414) ++ [redbulltv] Add support for live videos and segments (#13486) ++ [onetpl] Add support for videos embedded via pulsembed (#13482) +* [ooyala] Make more robust +* [ooyala] Skip empty format URLs (#13471, #13476) +* [hgtv.com:show] Fix typo + + +version 2017.06.23 + +Core +* [adobepass] Fix extraction on older python 2.6 + +Extractors +* [youtube] Adapt to new automatic captions rendition (#13467) +* [hgtv.com:show] Relax video config regular expression (#13279, #13461) +* [drtuber] Fix formats extraction (#12058) +* [youporn] Fix upload date extraction +* [youporn] Improve formats extraction +* [youporn] Fix 
title extraction (#13456) +* [googledrive] Fix formats sorting (#13443) +* [watchindianporn] Fix extraction (#13411, #13415) ++ [vimeo] Add fallback mp4 extension for original format ++ [ruv] Add support for ruv.is (#13396) +* [viu] Fix extraction on older python 2.6 +* [pandora.tv] Fix upload_date extraction (#12846) ++ [asiancrush] Add support for asiancrush.com (#13420) + + +version 2017.06.18 + +Core +* [downloader/common] Use utils.shell_quote for debug command line +* [utils] Use compat_shlex_quote in shell_quote +* [postprocessor/execafterdownload] Encode command line (#13407) +* [compat] Fix compat_shlex_quote on Windows (#5889, #10254) +* [postprocessor/metadatafromtitle] Fix missing optional meta fields processing + in --metadata-from-title (#13408) +* [extractor/common] Fix json dumping with --geo-bypass ++ [extractor/common] Improve jwplayer subtitles extraction ++ [extractor/common] Improve jwplayer formats extraction (#13379) + +Extractors +* [polskieradio] Fix extraction (#13392) ++ [xfileshare] Add support for fastvideo.me (#13385) +* [bilibili] Fix extraction of videos with double quotes in titles (#13387) +* [4tube] Fix extraction (#13381, #13382) ++ [disney] Add support for disneychannel.de (#13383) +* [npo] Improve URL regular expression (#13376) ++ [corus] Add support for showcase.ca ++ [corus] Add support for history.ca (#13359) + + +version 2017.06.12 + +Core +* [utils] Handle compat_HTMLParseError in extract_attributes (#13349) ++ [compat] Introduce compat_HTMLParseError +* [utils] Improve unified_timestamp +* [extractor/generic] Ensure format id is unicode string +* [extractor/common] Return unicode string from _match_id ++ [YoutubeDL] Sanitize more fields (#13313) + +Extractors ++ [xfileshare] Add support for rapidvideo.tv (#13348) +* [xfileshare] Modernize and pass Referer ++ [rutv] Add support for testplayer.vgtrk.com (#13347) ++ [newgrounds] Extract more metadata (#13232) ++ [newgrounds:playlist] Add support for playlists (#10611) +* 
[newgrounds] Improve formats and uploader extraction (#13346) +* [msn] Fix formats extraction +* [turbo] Ensure format id is string +* [sexu] Ensure height is int +* [jove] Ensure comment count is int +* [golem] Ensure format id is string +* [gfycat] Ensure filesize is int +* [foxgay] Ensure height is int +* [flickr] Ensure format id is string +* [sohu] Fix numeric fields +* [safari] Improve authentication detection (#13319) +* [liveleak] Ensure height is int (#13313) +* [streamango] Make title optional (#13292) +* [rtlnl] Improve URL regular expression (#13295) +* [tvplayer] Fix extraction (#13291) + + +version 2017.06.05 + +Core +* [YoutubeDL] Don't emit ANSI escape codes on Windows (#13270) + +Extractors ++ [bandcamp:weekly] Add support for bandcamp weekly (#12758) +* [pornhub:playlist] Fix extraction (#13281) +- [godtv] Remove extractor (#13175) +* [safari] Fix typo (#13252) +* [youtube] Improve chapters extraction (#13247) +* [1tv] Lower preference for HTTP formats (#13246) +* [francetv] Relax URL regular expression +* [drbonanza] Fix extraction (#13231) +* [packtpub] Fix authentication (#13240) + + +version 2017.05.29 + +Extractors +* [youtube] Fix DASH MPD extraction for videos with non-encrypted format URLs + (#13211) +* [xhamster] Fix uploader and like/dislike count extraction (#13216) ++ [xhamster] Extract categories (#11728) ++ [abcnews] Add support for embed URLs (#12851) +* [gaskrank] Fix extraction (#12493) +* [medialaan] Fix videos with missing videoUrl (#12774) +* [dvtv] Fix playlist support ++ [dvtv] Add support for DASH and HLS formats (#3063) ++ [beam:vod] Add support for beam.pro/mixer.com VODs (#13032) +* [cbsinteractive] Relax URL regular expression (#13213) +* [adn] Fix formats extraction ++ [youku] Extract more metadata (#10433) +* [cbsnews] Fix extraction (#13205) + + +version 2017.05.26 + +Core ++ [utils] strip_jsonp() can recognize more patterns +* [postprocessor/ffmpeg] Fix metadata filename handling on Python 2 (#13182) + +Extractors 
++ [youtube] DASH MPDs with cipher signatures are recognized now (#11381) ++ [bbc] Add support for authentication +* [tudou] Merge into youku extractor (#12214) +* [youku:show] Fix extraction +* [youku] Fix extraction (#13191) +* [udemy] Fix extraction for outputs' format entries without URL (#13192) +* [vimeo] Fix formats' sorting (#13189) +* [cbsnews] Fix extraction for 60 Minutes videos (#12861) + + +version 2017.05.23 + +Core ++ [downloader/external] Pass -loglevel to ffmpeg downloader (#13183) ++ [adobepass] Add support for Bright House Networks (#13149) + +Extractors ++ [streamcz] Add support for subtitles (#13174) +* [youtube] Fix DASH manifest signature decryption (#8944, #13156) +* [toggle] Relax URL regular expression (#13172) +* [toypics] Fix extraction (#13077) +* [njpwworld] Fix extraction (#13162, #13169) ++ [hitbox] Add support for smashcast.tv (#13154) +* [mitele] Update app key regular expression (#13158) + + version 2017.05.18.1 Core diff --git a/Makefile b/Makefile index 0235563..c74eea7 100644 --- a/Makefile +++ b/Makefile @@ -46,8 +46,15 @@ tar: youtube-dl.tar.gz pypi-files: youtube-dl.bash-completion README.txt youtube-dl.1 youtube-dl.fish youtube-dl: youtube_dl/*.py youtube_dl/*/*.py - zip --quiet youtube-dl youtube_dl/*.py youtube_dl/*/*.py - zip --quiet --junk-paths youtube-dl youtube_dl/__main__.py + mkdir -p zip + for d in youtube_dl youtube_dl/downloader youtube_dl/extractor youtube_dl/postprocessor ; do \ + mkdir -p zip/$$d ;\ + cp -pPR $$d/*.py zip/$$d/ ;\ + done + touch -t 200001010101 zip/youtube_dl/*.py zip/youtube_dl/*/*.py + mv zip/youtube_dl/__main__.py zip/ + cd zip ; zip -q ../youtube-dl youtube_dl/*.py youtube_dl/*/*.py __main__.py + rm -rf zip echo '#!$(PYTHON)' > youtube-dl cat youtube-dl.zip >> youtube-dl rm youtube-dl.zip @@ -101,7 +108,7 @@ youtube-dl.tar.gz: youtube-dl README.md README.txt youtube-dl.1 youtube-dl.bash- --exclude '*.pyc' \ --exclude '*.pyo' \ --exclude '*~' \ - --exclude '__pycache' \ + --exclude 
'__pycache__' \ --exclude '.git' \ --exclude 'testdata' \ --exclude 'docs/_build' \ diff --git a/README.md b/README.md index dc0be1f..7818e58 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ If you do not have curl, you can alternatively use a recent wget: sudo wget https://yt-dl.org/downloads/latest/youtube-dl -O /usr/local/bin/youtube-dl sudo chmod a+rx /usr/local/bin/youtube-dl -Windows users can [download an .exe file](https://yt-dl.org/latest/youtube-dl.exe) and place it in any location on their [PATH](http://en.wikipedia.org/wiki/PATH_%28variable%29) except for `%SYSTEMROOT%\System32` (e.g. **do not** put in `C:\Windows\System32`). +Windows users can [download an .exe file](https://yt-dl.org/latest/youtube-dl.exe) and place it in any location on their [PATH](https://en.wikipedia.org/wiki/PATH_%28variable%29) except for `%SYSTEMROOT%\System32` (e.g. **do not** put in `C:\Windows\System32`). You can also use pip: @@ -33,7 +33,7 @@ You can also use pip: This command will update youtube-dl if you have already installed it. See the [pypi page](https://pypi.python.org/pypi/youtube_dl) for more information. -OS X users can install youtube-dl with [Homebrew](http://brew.sh/): +OS X users can install youtube-dl with [Homebrew](https://brew.sh/): brew install youtube-dl @@ -145,18 +145,18 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo --max-views COUNT Do not download any videos with more than COUNT views --match-filter FILTER Generic video filter. Specify any key (see - help for -o for a list of available keys) - to match if the key is present, !key to - check if the key is not present, key > - NUMBER (like "comment_count > 12", also - works with >=, <, <=, !=, =) to compare - against a number, key = 'LITERAL' (like - "uploader = 'Mike Smith'", also works with - !=) to match against a string literal and & - to require multiple matches. Values which - are not known are excluded unless you put a - question mark (?) 
after the operator. For - example, to only match videos that have + the "OUTPUT TEMPLATE" for a list of + available keys) to match if the key is + present, !key to check if the key is not + present, key > NUMBER (like "comment_count + > 12", also works with >=, <, <=, !=, =) to + compare against a number, key = 'LITERAL' + (like "uploader = 'Mike Smith'", also works + with !=) to match against a string literal + and & to require multiple matches. Values + which are not known are excluded unless you + put a question mark (?) after the operator. + For example, to only match videos that have been liked more than 100 times and disliked less than 50 times (or the dislike functionality is not available at the given @@ -277,8 +277,8 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo --get-filename Simulate, quiet but print output filename --get-format Simulate, quiet but print output format -j, --dump-json Simulate, quiet but print JSON information. - See --output for a description of available - keys. + See the "OUTPUT TEMPLATE" for a description + of available keys. -J, --dump-single-json Simulate, quiet but print JSON information for each command-line argument. If the URL refers to a playlist, dump the whole @@ -427,7 +427,7 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo syntax. 
Example: --exec 'adb push {} /sdcard/Music/ && rm {}' --convert-subs FORMAT Convert the subtitles to other format - (currently supported: srt|ass|vtt) + (currently supported: srt|ass|vtt|lrc) # CONFIGURATION @@ -458,7 +458,7 @@ You can also use `--config-location` if you want to use custom configuration fil ### Authentication with `.netrc` file -You may also want to configure automatic credentials storage for extractors that support authentication (by providing login and password with `--username` and `--password`) in order not to pass credentials as command line arguments on every youtube-dl execution and prevent tracking plain text passwords in the shell command history. You can achieve this using a [`.netrc` file](http://stackoverflow.com/tags/.netrc/info) on a per extractor basis. For that you will need to create a `.netrc` file in your `$HOME` and restrict permissions to read/write by only you: +You may also want to configure automatic credentials storage for extractors that support authentication (by providing login and password with `--username` and `--password`) in order not to pass credentials as command line arguments on every youtube-dl execution and prevent tracking plain text passwords in the shell command history. You can achieve this using a [`.netrc` file](https://stackoverflow.com/tags/.netrc/info) on a per extractor basis. For that you will need to create a `.netrc` file in your `$HOME` and restrict permissions to read/write by only you: ``` touch $HOME/.netrc chmod a-rwx,u+rw $HOME/.netrc @@ -474,7 +474,10 @@ machine twitch login my_twitch_account_name password my_twitch_password ``` To activate authentication with the `.netrc` file you should pass `--netrc` to youtube-dl or place it in the [configuration file](#configuration). -On Windows you may also need to setup the `%HOME%` environment variable manually. +On Windows you may also need to setup the `%HOME%` environment variable manually. 
For example: +``` +set HOME=%USERPROFILE% +``` # OUTPUT TEMPLATE @@ -482,7 +485,7 @@ The `-o` option allows users to indicate a template for the output file names. **tl;dr:** [navigate me to examples](#output-template-examples). -The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dl -o funny_video.flv "http://some/video"`. However, it may contain special sequences that will be replaced when downloading each video. The special sequences may be formatted according to [python string formatting operations](https://docs.python.org/2/library/stdtypes.html#string-formatting). For example, `%(NAME)s` or `%(NAME)05d`. To clarify, that is a percent symbol followed by a name in parentheses, followed by a formatting operations. Allowed names along with sequence type are: +The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dl -o funny_video.flv "https://some/video"`. However, it may contain special sequences that will be replaced when downloading each video. The special sequences may be formatted according to [python string formatting operations](https://docs.python.org/2/library/stdtypes.html#string-formatting). For example, `%(NAME)s` or `%(NAME)05d`. To clarify, that is a percent symbol followed by a name in parentheses, followed by formatting operations. 
Allowed names along with sequence type are: - `id` (string): Video identifier - `title` (string): Video title @@ -532,13 +535,14 @@ The basic usage is not to set any template arguments when downloading a single f - `playlist_id` (string): Playlist identifier - `playlist_title` (string): Playlist title - Available for the video that belongs to some logical chapter or section: + - `chapter` (string): Name or title of the chapter the video belongs to - `chapter_number` (numeric): Number of the chapter the video belongs to - `chapter_id` (string): Id of the chapter the video belongs to Available for the video that is an episode of some series or programme: + - `series` (string): Title of the series or programme the video episode belongs to - `season` (string): Title of the season the video episode belongs to - `season_number` (numeric): Number of the season the video episode belongs to @@ -548,6 +552,7 @@ Available for the video that is an episode of some series or programme: - `episode_id` (string): Id of the video episode Available for the media that is a track or a part of a music album: + - `track` (string): Title of the track - `track_number` (numeric): Number of the track within an album or a disc - `track_id` (string): Id of the track @@ -579,7 +584,7 @@ If you are using an output template inside a Windows batch file then you must es #### Output template examples -Note on Windows you may need to use double quotes instead of single. +Note that on Windows you may need to use double quotes instead of single. 
```bash $ youtube-dl --get-filename -o '%(title)s.%(ext)s' BaW_jenozKc @@ -598,7 +603,7 @@ $ youtube-dl -o '%(uploader)s/%(playlist)s/%(playlist_index)s - %(title)s.%(ext) $ youtube-dl -u user -p password -o '~/MyVideos/%(playlist)s/%(chapter_number)s - %(chapter)s/%(title)s.%(ext)s' https://www.udemy.com/java-tutorial/ # Download entire series season keeping each series and each season in separate directory under C:/MyVideos -$ youtube-dl -o "C:/MyVideos/%(series)s/%(season_number)s - %(season)s/%(episode_number)s - %(episode)s.%(ext)s" http://videomore.ru/kino_v_detalayah/5_sezon/367617 +$ youtube-dl -o "C:/MyVideos/%(series)s/%(season_number)s - %(season)s/%(episode_number)s - %(episode)s.%(ext)s" https://videomore.ru/kino_v_detalayah/5_sezon/367617 # Stream the video being downloaded to stdout $ youtube-dl -o - BaW_jenozKc @@ -649,7 +654,7 @@ Also filtering work for comparisons `=` (equals), `!=` (not equals), `^=` (begin - `acodec`: Name of the audio codec in use - `vcodec`: Name of the video codec in use - `container`: Name of the container format - - `protocol`: The protocol that will be used for the actual download, lower-case (`http`, `https`, `rtsp`, `rtmp`, `rtmpe`, `mms`, `f4m`, `ism`, `m3u8`, or `m3u8_native`) + - `protocol`: The protocol that will be used for the actual download, lower-case (`http`, `https`, `rtsp`, `rtmp`, `rtmpe`, `mms`, `f4m`, `ism`, `http_dash_segments`, `m3u8`, or `m3u8_native`) - `format_id`: A short description of the format Note that none of the aforementioned meta fields are guaranteed to be present since this solely depends on the metadata obtained by particular extractor, i.e. the metadata offered by the video hoster. @@ -666,7 +671,7 @@ If you want to preserve the old format selection behavior (prior to youtube-dl 2 #### Format selection examples -Note on Windows you may need to use double quotes instead of single. +Note that on Windows you may need to use double quotes instead of single. 
```bash # Download best mp4 format available or any other best if no mp4 available @@ -711,17 +716,17 @@ $ youtube-dl --dateafter 20000101 --datebefore 20091231 ### How do I update youtube-dl? -If you've followed [our manual installation instructions](http://rg3.github.io/youtube-dl/download.html), you can simply run `youtube-dl -U` (or, on Linux, `sudo youtube-dl -U`). +If you've followed [our manual installation instructions](https://rg3.github.io/youtube-dl/download.html), you can simply run `youtube-dl -U` (or, on Linux, `sudo youtube-dl -U`). If you have used pip, a simple `sudo pip install -U youtube-dl` is sufficient to update. -If you have installed youtube-dl using a package manager like *apt-get* or *yum*, use the standard system update mechanism to update. Note that distribution packages are often outdated. As a rule of thumb, youtube-dl releases at least once a month, and often weekly or even daily. Simply go to http://yt-dl.org/ to find out the current version. Unfortunately, there is nothing we youtube-dl developers can do if your distribution serves a really outdated version. You can (and should) complain to your distribution in their bugtracker or support forum. +If you have installed youtube-dl using a package manager like *apt-get* or *yum*, use the standard system update mechanism to update. Note that distribution packages are often outdated. As a rule of thumb, youtube-dl releases at least once a month, and often weekly or even daily. Simply go to https://yt-dl.org to find out the current version. Unfortunately, there is nothing we youtube-dl developers can do if your distribution serves a really outdated version. You can (and should) complain to your distribution in their bugtracker or support forum. As a last resort, you can also uninstall the version installed by your package manager and follow our manual installation instructions. 
For that, remove the distribution's package, with a line like sudo apt-get remove -y youtube-dl -Afterwards, simply follow [our manual installation instructions](http://rg3.github.io/youtube-dl/download.html): +Afterwards, simply follow [our manual installation instructions](https://rg3.github.io/youtube-dl/download.html): ``` sudo wget https://yt-dl.org/latest/youtube-dl -O /usr/local/bin/youtube-dl @@ -761,11 +766,11 @@ Apparently YouTube requires you to pass a CAPTCHA test if you download too much. youtube-dl works fine on its own on most sites. However, if you want to convert video/audio, you'll need [avconv](https://libav.org/) or [ffmpeg](https://www.ffmpeg.org/). On some sites - most notably YouTube - videos can be retrieved in a higher quality format without sound. youtube-dl will detect whether avconv/ffmpeg is present and automatically pick the best option. -Videos or video formats streamed via RTMP protocol can only be downloaded when [rtmpdump](https://rtmpdump.mplayerhq.hu/) is installed. Downloading MMS and RTSP videos requires either [mplayer](http://mplayerhq.hu/) or [mpv](https://mpv.io/) to be installed. +Videos or video formats streamed via RTMP protocol can only be downloaded when [rtmpdump](https://rtmpdump.mplayerhq.hu/) is installed. Downloading MMS and RTSP videos requires either [mplayer](https://mplayerhq.hu/) or [mpv](https://mpv.io/) to be installed. ### I have downloaded a video but how can I play it? -Once the video is fully downloaded, use any video player, such as [mpv](https://mpv.io/), [vlc](http://www.videolan.org/) or [mplayer](http://www.mplayerhq.hu/). +Once the video is fully downloaded, use any video player, such as [mpv](https://mpv.io/), [vlc](https://www.videolan.org/) or [mplayer](https://www.mplayerhq.hu/). ### I extracted a video URL with `-g`, but it does not play on another machine / in my web browser. 
@@ -840,10 +845,10 @@ Use the `-o` to specify an [output template](#output-template), for example `-o ### How do I download a video starting with a `-`? -Either prepend `http://www.youtube.com/watch?v=` or separate the ID from the options with `--`: +Either prepend `https://www.youtube.com/watch?v=` or separate the ID from the options with `--`: youtube-dl -- -wNyEUrxzFU - youtube-dl "http://www.youtube.com/watch?v=-wNyEUrxzFU" + youtube-dl "https://www.youtube.com/watch?v=-wNyEUrxzFU" ### How do I pass cookies to youtube-dl? @@ -857,9 +862,9 @@ Passing cookies to youtube-dl is a good way to workaround login when a particula ### How do I stream directly to media player? -You will first need to tell youtube-dl to stream media to stdout with `-o -`, and also tell your media player to read from stdin (it must be capable of this for streaming) and then pipe former to latter. For example, streaming to [vlc](http://www.videolan.org/) can be achieved with: +You will first need to tell youtube-dl to stream media to stdout with `-o -`, and also tell your media player to read from stdin (it must be capable of this for streaming) and then pipe former to latter. For example, streaming to [vlc](https://www.videolan.org/) can be achieved with: - youtube-dl -o - "http://www.youtube.com/watch?v=BaW_jenozKcj" | vlc - + youtube-dl -o - "https://www.youtube.com/watch?v=BaW_jenozKcj" | vlc - ### How do I download only new videos from a playlist? @@ -879,7 +884,7 @@ When youtube-dl detects an HLS video, it can download it either with the built-i When youtube-dl knows that one particular downloader works better for a given website, that downloader will be picked. Otherwise, youtube-dl will pick the best downloader for general compatibility, which at the moment happens to be ffmpeg. This choice may change in future versions of youtube-dl, with improvements of the built-in downloader and/or ffmpeg. 
+In particular, the generic extractor (used when your website is not in the [list of supported sites by youtube-dl](https://rg3.github.io/youtube-dl/supportedsites.html)) cannot mandate one specific downloader.
You may be tempted to disable, exclude, or remove the generic extractor, but the generic extractor not only allows users to extract videos from lots of websites that embed a video from another service, but may also be used to extract video from a service that it's hosting itself. Therefore, we neither recommend nor support disabling, excluding, or removing the generic extractor. @@ -919,7 +924,7 @@ youtube-dl is an open-source project manned by too few volunteers, so we'd rathe # DEVELOPER INSTRUCTIONS -Most users do not need to build youtube-dl and can [download the builds](http://rg3.github.io/youtube-dl/download.html) or get them from their distribution. +Most users do not need to build youtube-dl and can [download the builds](https://rg3.github.io/youtube-dl/download.html) or get them from their distribution. To run youtube-dl as a developer, you don't need to build anything either. Simply execute @@ -931,6 +936,8 @@ To run the test, simply invoke your favorite test runner, or execute a test file python test/test_download.py nosetests +See item 6 of [new extractor tutorial](#adding-support-for-a-new-site) for how to run extractor specific test cases. + If you want to create a build of youtube-dl yourself, you'll need * python @@ -967,7 +974,7 @@ After you have ensured this site is distributing its content legally, you can fo class YourExtractorIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P[0-9]+)' _TEST = { - 'url': 'http://yourextractor.com/watch/42', + 'url': 'https://yourextractor.com/watch/42', 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', 'info_dict': { 'id': '42', @@ -998,10 +1005,10 @@ After you have ensured this site is distributing its content legally, you can fo } ``` 5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/extractors.py). -6. Run `python test/test_download.py TestDownload.test_YourExtractor`. 
This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. +6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in. 7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L74-L252). Add tests and code for as many as you want. -8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](http://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. -9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this: +8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). 
Also make sure your code works under all [Python](https://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+. +9. When the tests pass, [add](https://git-scm.com/docs/git-add) the new files and [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this: $ git add youtube_dl/extractor/extractors.py $ git add youtube_dl/extractor/yourextractor.py @@ -1157,7 +1164,7 @@ import youtube_dl ydl_opts = {} with youtube_dl.YoutubeDL(ydl_opts) as ydl: - ydl.download(['http://www.youtube.com/watch?v=BaW_jenozKc']) + ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc']) ``` Most likely, you'll want to use various options. For a list of options available, have a look at [`youtube_dl/YoutubeDL.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/YoutubeDL.py#L129-L279). For a start, if you want to intercept youtube-dl's output, set a `logger` object. @@ -1196,19 +1203,19 @@ ydl_opts = { 'progress_hooks': [my_hook], } with youtube_dl.YoutubeDL(ydl_opts) as ydl: - ydl.download(['http://www.youtube.com/watch?v=BaW_jenozKc']) + ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc']) ``` # BUGS -Bugs and suggestions should be reported at: . Unless you were prompted to or there is another pertinent reason (e.g. GitHub fails to accept the bug report), please do not send bug reports via personal email. For discussions, join us in the IRC channel [#youtube-dl](irc://chat.freenode.net/#youtube-dl) on freenode ([webchat](http://webchat.freenode.net/?randomnick=1&channels=youtube-dl)). +Bugs and suggestions should be reported at: . Unless you were prompted to or there is another pertinent reason (e.g. GitHub fails to accept the bug report), please do not send bug reports via personal email. For discussions, join us in the IRC channel [#youtube-dl](irc://chat.freenode.net/#youtube-dl) on freenode ([webchat](https://webchat.freenode.net/?randomnick=1&channels=youtube-dl)). 
**Please include the full output of youtube-dl when run with `-v`**, i.e. **add** `-v` flag to **your command line**, copy the **whole** output and post it in the issue body wrapped in \`\`\` for better formatting. It should look similar to this: ``` $ youtube-dl -v [debug] System config: [] [debug] User config: [] -[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] +[debug] Command-line args: [u'-v', u'https://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 [debug] youtube-dl version 2015.12.06 [debug] Git HEAD: 135392e @@ -1239,7 +1246,7 @@ For bug reports, this means that your report should contain the *complete* outpu If your server has multiple IPs or you suspect censorship, adding `--call-home` may be a good idea to get more diagnostics. If the error is `ERROR: Unable to extract ...` and you cannot reproduce it from multiple countries, add `--dump-pages` (warning: this will yield a rather large output, redirect it to the file `log.txt` by adding `>log.txt 2>&1` to your command-line) or upload the `.dump` files you get when you add `--write-pages` [somewhere](https://gist.github.com/). -**Site support requests must contain an example URL**. An example URL is a URL you might want to download, like `http://www.youtube.com/watch?v=BaW_jenozKc`. There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. `http://www.youtube.com/`) is *not* an example URL. +**Site support requests must contain an example URL**. An example URL is a URL you might want to download, like `https://www.youtube.com/watch?v=BaW_jenozKc`. There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. `https://www.youtube.com/`) is *not* an example URL. ### Are you using the latest version? 
diff --git a/README.txt b/README.txt index 129756d..a42d837 100644 --- a/README.txt +++ b/README.txt @@ -172,18 +172,18 @@ Video Selection: --max-views COUNT Do not download any videos with more than COUNT views --match-filter FILTER Generic video filter. Specify any key (see - help for -o for a list of available keys) - to match if the key is present, !key to - check if the key is not present, key > - NUMBER (like "comment_count > 12", also - works with >=, <, <=, !=, =) to compare - against a number, key = 'LITERAL' (like - "uploader = 'Mike Smith'", also works with - !=) to match against a string literal and & - to require multiple matches. Values which - are not known are excluded unless you put a - question mark (?) after the operator. For - example, to only match videos that have + the "OUTPUT TEMPLATE" for a list of + available keys) to match if the key is + present, !key to check if the key is not + present, key > NUMBER (like "comment_count + > 12", also works with >=, <, <=, !=, =) to + compare against a number, key = 'LITERAL' + (like "uploader = 'Mike Smith'", also works + with !=) to match against a string literal + and & to require multiple matches. Values + which are not known are excluded unless you + put a question mark (?) after the operator. + For example, to only match videos that have been liked more than 100 times and disliked less than 50 times (or the dislike functionality is not available at the given @@ -312,8 +312,8 @@ Verbosity / Simulation Options: --get-filename Simulate, quiet but print output filename --get-format Simulate, quiet but print output format -j, --dump-json Simulate, quiet but print JSON information. - See --output for a description of available - keys. + See the "OUTPUT TEMPLATE" for a description + of available keys. -J, --dump-single-json Simulate, quiet but print JSON information for each command-line argument. If the URL refers to a playlist, dump the whole @@ -474,7 +474,7 @@ Post-processing Options: syntax. 
Example: --exec 'adb push {} /sdcard/Music/ && rm {}' --convert-subs FORMAT Convert the subtitles to other format - (currently supported: srt|ass|vtt) + (currently supported: srt|ass|vtt|lrc) @@ -545,7 +545,9 @@ To activate authentication with the .netrc file you should pass --netrc to youtube-dl or place it in the configuration file. On Windows you may also need to setup the %HOME% environment variable -manually. +manually. For example: + + set HOME=%USERPROFILE% @@ -558,7 +560,7 @@ names. TL;DR: navigate me to examples. The basic usage is not to set any template arguments when downloading a -single file, like in youtube-dl -o funny_video.flv "http://some/video". +single file, like in youtube-dl -o funny_video.flv "https://some/video". However, it may contain special sequences that will be replaced when downloading each video. The special sequences may be formatted according to python string formatting operations. For example, %(NAME)s or @@ -623,28 +625,38 @@ with sequence type are: - playlist_title (string): Playlist title Available for the video that belongs to some logical chapter or section: -- chapter (string): Name or title of the chapter the video belongs to - -chapter_number (numeric): Number of the chapter the video belongs to - -chapter_id (string): Id of the chapter the video belongs to + +- chapter (string): Name or title of the chapter the video belongs to +- chapter_number (numeric): Number of the chapter the video belongs to +- chapter_id (string): Id of the chapter the video belongs to Available for the video that is an episode of some series or programme: -- series (string): Title of the series or programme the video episode -belongs to - season (string): Title of the season the video episode -belongs to - season_number (numeric): Number of the season the video -episode belongs to - season_id (string): Id of the season the video -episode belongs to - episode (string): Title of the video episode - -episode_number (numeric): Number of the video 
episode within a season - -episode_id (string): Id of the video episode - -Available for the media that is a track or a part of a music album: - -track (string): Title of the track - track_number (numeric): Number of -the track within an album or a disc - track_id (string): Id of the track -- artist (string): Artist(s) of the track - genre (string): Genre(s) of -the track - album (string): Title of the album the track belongs to - -album_type (string): Type of the album - album_artist (string): List of -all artists appeared on the album - disc_number (numeric): Number of the -disc or other physical medium the track belongs to - release_year -(numeric): Year (YYYY) when the album was released + +- series (string): Title of the series or programme the video episode + belongs to +- season (string): Title of the season the video episode belongs to +- season_number (numeric): Number of the season the video episode + belongs to +- season_id (string): Id of the season the video episode belongs to +- episode (string): Title of the video episode +- episode_number (numeric): Number of the video episode within a + season +- episode_id (string): Id of the video episode + +Available for the media that is a track or a part of a music album: + +- track (string): Title of the track +- track_number (numeric): Number of the track within an album or a + disc +- track_id (string): Id of the track +- artist (string): Artist(s) of the track +- genre (string): Genre(s) of the track +- album (string): Title of the album the track belongs to +- album_type (string): Type of the album +- album_artist (string): List of all artists appeared on the album +- disc_number (numeric): Number of the disc or other physical medium + the track belongs to +- release_year (numeric): Year (YYYY) when the album was released Each aforementioned sequence when referenced in an output template will be replaced by the actual value corresponding to the sequence name. 
Note @@ -688,7 +700,8 @@ should stay intact: -o "C:\%HOMEPATH%\Desktop\%%(title)s.%%(ext)s". Output template examples -Note on Windows you may need to use double quotes instead of single. +Note that on Windows you may need to use double quotes instead of +single. $ youtube-dl --get-filename -o '%(title)s.%(ext)s' BaW_jenozKc youtube-dl test video ''_ä↭𝕐.mp4 # All kinds of weird characters @@ -706,7 +719,7 @@ Note on Windows you may need to use double quotes instead of single. $ youtube-dl -u user -p password -o '~/MyVideos/%(playlist)s/%(chapter_number)s - %(chapter)s/%(title)s.%(ext)s' https://www.udemy.com/java-tutorial/ # Download entire series season keeping each series and each season in separate directory under C:/MyVideos - $ youtube-dl -o "C:/MyVideos/%(series)s/%(season_number)s - %(season)s/%(episode_number)s - %(episode)s.%(ext)s" http://videomore.ru/kino_v_detalayah/5_sezon/367617 + $ youtube-dl -o "C:/MyVideos/%(series)s/%(season_number)s - %(season)s/%(episode_number)s - %(episode)s.%(ext)s" https://videomore.ru/kino_v_detalayah/5_sezon/367617 # Stream the video being downloaded to stdout $ youtube-dl -o - BaW_jenozKc @@ -787,8 +800,8 @@ fields: - ext: File extension - acodec: Name of the audio codec in use - vcodec: Name of the video codec in use - container: Name of the container format - protocol: The protocol that will be used for the actual download, lower-case (http, https, rtsp, rtmp, rtmpe, mms, f4m, -ism, m3u8, or m3u8_native) - format_id: A short description of the -format +ism, http_dash_segments, m3u8, or m3u8_native) - format_id: A short +description of the format Note that none of the aforementioned meta fields are guaranteed to be present since this solely depends on the metadata obtained by particular @@ -835,7 +848,8 @@ file in order not to type it every time you run youtube-dl. Format selection examples -Note on Windows you may need to use double quotes instead of single. 
+Note that on Windows you may need to use double quotes instead of +single. # Download best mp4 format available or any other best if no mp4 available $ youtube-dl -f 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best' @@ -895,7 +909,7 @@ If you have installed youtube-dl using a package manager like _apt-get_ or _yum_, use the standard system update mechanism to update. Note that distribution packages are often outdated. As a rule of thumb, youtube-dl releases at least once a month, and often weekly or even daily. Simply -go to http://yt-dl.org/ to find out the current version. Unfortunately, +go to https://yt-dl.org to find out the current version. Unfortunately, there is nothing we youtube-dl developers can do if your distribution serves a really outdated version. You can (and should) complain to your distribution in their bugtracker or support forum. @@ -1100,11 +1114,11 @@ all of your downloads, put the option into your configuration file. How do I download a video starting with a -? -Either prepend http://www.youtube.com/watch?v= or separate the ID from +Either prepend https://www.youtube.com/watch?v= or separate the ID from the options with --: youtube-dl -- -wNyEUrxzFU - youtube-dl "http://www.youtube.com/watch?v=-wNyEUrxzFU" + youtube-dl "https://www.youtube.com/watch?v=-wNyEUrxzFU" How do I pass cookies to youtube-dl? @@ -1135,7 +1149,7 @@ You will first need to tell youtube-dl to stream media to stdout with capable of this for streaming) and then pipe former to latter. For example, streaming to vlc can be achieved with: - youtube-dl -o - "http://www.youtube.com/watch?v=BaW_jenozKcj" | vlc - + youtube-dl -o - "https://www.youtube.com/watch?v=BaW_jenozKcj" | vlc - How do I download only new videos from a playlist? @@ -1232,7 +1246,7 @@ How can I detect whether a given URL is supported by youtube-dl? For one, have a look at the list of supported sites. 
Note that it can sometimes happen that the site changes its URL scheme (say, from -http://example.com/video/1234567 to http://example.com/v/1234567 ) and +https://example.com/video/1234567 to https://example.com/v/1234567 ) and youtube-dl reports an URL of a service in that list as unsupported. In that case, simply report a bug. @@ -1296,6 +1310,9 @@ test file directly; any of the following work: python test/test_download.py nosetests +See item 6 of new extractor tutorial for how to run extractor specific +test cases. + If you want to create a build of youtube-dl yourself, you'll need - python @@ -1337,7 +1354,7 @@ yourextractor): class YourExtractorIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P[0-9]+)' _TEST = { - 'url': 'http://yourextractor.com/watch/42', + 'url': 'https://yourextractor.com/watch/42', 'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)', 'info_dict': { 'id': '42', @@ -1374,7 +1391,8 @@ yourextractor): _TEST to _TESTS and make it into a list of dictionaries. The tests will then be named TestDownload.test_YourExtractor, TestDownload.test_YourExtractor_1, - TestDownload.test_YourExtractor_2, etc. + TestDownload.test_YourExtractor_2, etc. Note that tests with + only_matching key in test's dict are not counted in. 7. Have a look at youtube_dl/extractor/common.py for possible helper methods and a detailed description of what your extractor should and may return. Add tests and code for as many as you want. @@ -1572,7 +1590,7 @@ fashion, like this: ydl_opts = {} with youtube_dl.YoutubeDL(ydl_opts) as ydl: - ydl.download(['http://www.youtube.com/watch?v=BaW_jenozKc']) + ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc']) Most likely, you'll want to use various options. For a list of options available, have a look at youtube_dl/YoutubeDL.py. 
For a start, if you @@ -1613,7 +1631,7 @@ downloads/converts the video to an mp3 file: 'progress_hooks': [my_hook], } with youtube_dl.YoutubeDL(ydl_opts) as ydl: - ydl.download(['http://www.youtube.com/watch?v=BaW_jenozKc']) + ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc']) @@ -1635,7 +1653,7 @@ to this: $ youtube-dl -v [debug] System config: [] [debug] User config: [] - [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] + [debug] Command-line args: [u'-v', u'https://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 [debug] youtube-dl version 2015.12.06 [debug] Git HEAD: 135392e @@ -1690,9 +1708,9 @@ command-line) or upload the .dump files you get when you add SITE SUPPORT REQUESTS MUST CONTAIN AN EXAMPLE URL. An example URL is a URL you might want to download, like -http://www.youtube.com/watch?v=BaW_jenozKc. There should be an obvious +https://www.youtube.com/watch?v=BaW_jenozKc. There should be an obvious video present. Except under very special circumstances, the main page of -a video service (e.g. http://www.youtube.com/) is _not_ an example URL. +a video service (e.g. https://www.youtube.com/) is _not_ an example URL. Are you using the latest version? 
diff --git a/devscripts/check-porn.py b/devscripts/check-porn.py index 7a219eb..72b2ee4 100644 --- a/devscripts/check-porn.py +++ b/devscripts/check-porn.py @@ -14,7 +14,7 @@ import os import sys sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from test.helper import get_testcases +from test.helper import gettestcases from youtube_dl.utils import compat_urllib_parse_urlparse from youtube_dl.utils import compat_urllib_request @@ -24,7 +24,7 @@ if len(sys.argv) > 1: else: METHOD = 'EURISTIC' -for test in get_testcases(): +for test in gettestcases(): if METHOD == 'EURISTIC': try: webpage = compat_urllib_request.urlopen(test['url'], timeout=10).read() diff --git a/devscripts/prepare_manpage.py b/devscripts/prepare_manpage.py index f9fe63f..76bf873 100644 --- a/devscripts/prepare_manpage.py +++ b/devscripts/prepare_manpage.py @@ -8,7 +8,7 @@ import re ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) README_FILE = os.path.join(ROOT_DIR, 'README.md') -PREFIX = '''%YOUTUBE-DL(1) +PREFIX = r'''%YOUTUBE-DL(1) # NAME diff --git a/docs/supportedsites.md b/docs/supportedsites.md index aa6c118..d36a07c 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -38,11 +38,13 @@ - **afreecatv**: afreecatv.com - **afreecatv:global**: afreecatv.com - **AirMozilla** + - **AliExpressLive** - **AlJazeera** - **Allocine** - **AlphaPorno** - **AMCNetworks** - - **anderetijden**: npo.nl and ntr.nl + - **AmericasTestKitchen** + - **anderetijden**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - **AnimeOnDemand** - **anitube.se** - **Anvato** @@ -67,6 +69,8 @@ - **arte.tv:info** - **arte.tv:magazine** - **arte.tv:playlist** + - **AsianCrush** + - **AsianCrushPlaylist** - **AtresPlayer** - **ATTTechChannel** - **ATVAt** @@ -87,13 +91,13 @@ - **bambuser:channel** - **Bandcamp** - **Bandcamp:album** + - **Bandcamp:weekly** - **bangumi.bilibili.com**: BiliBili番剧 - **bbc**: BBC - **bbc.co.uk**: BBC iPlayer - 
**bbc.co.uk:article**: BBC articles - **bbc.co.uk:iplayer:playlist** - **bbc.co.uk:playlist** - - **Beam:live** - **Beatport** - **Beeg** - **BehindKink** @@ -152,8 +156,9 @@ - **chirbit** - **chirbit:profile** - **Cinchcast** - - **Clipfish** + - **CJSW** - **cliphunter** + - **Clippit** - **ClipRs** - **Clipsyndicate** - **CloserToTruth** @@ -235,6 +240,7 @@ - **EbaumsWorld** - **EchoMsk** - **egghead:course**: egghead.io course + - **egghead:lesson**: egghead.io lesson - **eHow** - **Einthusan** - **eitb.tv** @@ -291,6 +297,7 @@ - **Funimation** - **FunnyOrDie** - **Fusion** + - **Fux** - **FXNetworks** - **GameInformer** - **GameOne** @@ -311,7 +318,6 @@ - **Go** - **Go90** - **GodTube** - - **GodTV** - **Golem** - **GoogleDrive** - **Goshgay** @@ -359,6 +365,7 @@ - **IPrima** - **iqiyi**: 爱奇艺 - **Ir90Tv** + - **ITTF** - **ITV** - **ivi**: ivi.ru - **ivi:compilation**: ivi.ru compilations @@ -368,9 +375,11 @@ - **Jamendo** - **JamendoAlbum** - **JeuxVideo** + - **Joj** - **Jove** - **jpopsuki.tv** - **JWPlatform** + - **Kakao** - **Kaltura** - **Kamcord** - **KanalPlay**: Kanal 5/9/11 Play @@ -414,6 +423,7 @@ - **limelight:channel_list** - **LiTV** - **LiveLeak** + - **LiveLeakEmbed** - **livestream** - **livestream:original** - **LnkGo** @@ -430,12 +440,14 @@ - **MakerTV** - **mangomolo:live** - **mangomolo:video** + - **ManyVids** - **MatchTV** - **MDR**: MDR.DE and KiKA - **media.ccc.de** - **Medialaan** - **Mediaset** - **Medici** + - **megaphone.fm**: megaphone.fm embedded players - **Meipai**: 美拍 - **MelonVOD** - **META** @@ -453,6 +465,8 @@ - **mixcloud:playlist** - **mixcloud:stream** - **mixcloud:user** + - **Mixer:live** + - **Mixer:vod** - **MLB** - **Mnet** - **MoeVideo**: LetitBit video services: moevideo.net, playreplay.net and videochart.net @@ -466,7 +480,6 @@ - **MovieFap** - **Moviezine** - **MovingImage** - - **MPORA** - **MSN** - **mtg**: MTG services - **mtv** @@ -511,10 +524,13 @@ - **netease:song**: 网易云音乐 - **Netzkino** - **Newgrounds** + 
- **NewgroundsPlaylist** - **Newstube** - **NextMedia**: 蘋果日報 - **NextMediaActionNews**: 蘋果日報 - 動新聞 - **NextTV**: 壹電視 + - **Nexx** + - **NexxEmbed** - **nfb**: National Film Board of Canada - **nfl.com** - **NhkVod** @@ -524,6 +540,7 @@ - **nhl.com:videocenter:category**: NHL videocenter category - **nick.com** - **nick.de** + - **nickelodeonru** - **nicknight** - **niconico**: ニコニコ動画 - **NiconicoPlaylist** @@ -545,7 +562,7 @@ - **NowTVList** - **nowvideo**: NowVideo - **Noz** - - **npo**: npo.nl and ntr.nl + - **npo**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - **npo.nl:live** - **npo.nl:radio** - **npo.nl:radio:fragment** @@ -578,6 +595,7 @@ - **Openload** - **OraTV** - **orf:fm4**: radio FM4 + - **orf:fm4:story**: fm4.orf.at stories - **orf:iptv**: iptv.ORF.at - **orf:oe1**: Radio Österreich 1 - **orf:tvthek**: ORF TVthek @@ -589,6 +607,7 @@ - **Patreon** - **pbs**: Public Broadcasting Service (PBS) and member stations: PBS: Public Broadcasting Service, APT - Alabama Public Television (WBIQ), GPB/Georgia Public Broadcasting (WGTV), Mississippi Public Broadcasting (WMPN), Nashville Public Television (WNPT), WFSU-TV (WFSU), WSRE (WSRE), WTCI (WTCI), WPBA/Channel 30 (WPBA), Alaska Public Media (KAKM), Arizona PBS (KAET), KNME-TV/Channel 5 (KNME), Vegas PBS (KLVX), AETN/ARKANSAS ETV NETWORK (KETS), KET (WKLE), WKNO/Channel 10 (WKNO), LPB/LOUISIANA PUBLIC BROADCASTING (WLPB), OETA (KETA), Ozarks Public Television (KOZK), WSIU Public Broadcasting (WSIU), KEET TV (KEET), KIXE/Channel 9 (KIXE), KPBS San Diego (KPBS), KQED (KQED), KVIE Public Television (KVIE), PBS SoCal/KOCE (KOCE), ValleyPBS (KVPT), CONNECTICUT PUBLIC TELEVISION (WEDH), KNPB Channel 5 (KNPB), SOPTV (KSYS), Rocky Mountain PBS (KRMA), KENW-TV3 (KENW), KUED Channel 7 (KUED), Wyoming PBS (KCWC), Colorado Public Television / KBDI 12 (KBDI), KBYU-TV (KBYU), Thirteen/WNET New York (WNET), WGBH/Channel 2 (WGBH), WGBY (WGBY), NJTV Public Media NJ (WNJT), WLIW21 (WLIW), mpt/Maryland Public Television 
(WMPB), WETA Television and Radio (WETA), WHYY (WHYY), PBS 39 (WLVT), WVPT - Your Source for PBS and More! (WVPT), Howard University Television (WHUT), WEDU PBS (WEDU), WGCU Public Media (WGCU), WPBT2 (WPBT), WUCF TV (WUCF), WUFT/Channel 5 (WUFT), WXEL/Channel 42 (WXEL), WLRN/Channel 17 (WLRN), WUSF Public Broadcasting (WUSF), ETV (WRLK), UNC-TV (WUNC), PBS Hawaii - Oceanic Cable Channel 10 (KHET), Idaho Public Television (KAID), KSPS (KSPS), OPB (KOPB), KWSU/Channel 10 & KTNW/Channel 31 (KWSU), WILL-TV (WILL), Network Knowledge - WSEC/Springfield (WSEC), WTTW11 (WTTW), Iowa Public Television/IPTV (KDIN), Nine Network (KETC), PBS39 Fort Wayne (WFWA), WFYI Indianapolis (WFYI), Milwaukee Public Television (WMVS), WNIN (WNIN), WNIT Public Television (WNIT), WPT (WPNE), WVUT/Channel 22 (WVUT), WEIU/Channel 51 (WEIU), WQPT-TV (WQPT), WYCC PBS Chicago (WYCC), WIPB-TV (WIPB), WTIU (WTIU), CET (WCET), ThinkTVNetwork (WPTD), WBGU-TV (WBGU), WGVU TV (WGVU), NET1 (KUON), Pioneer Public Television (KWCM), SDPB Television (KUSD), TPT (KTCA), KSMQ (KSMQ), KPTS/Channel 8 (KPTS), KTWU/Channel 11 (KTWU), East Tennessee PBS (WSJK), WCTE-TV (WCTE), WLJT, Channel 11 (WLJT), WOSU TV (WOSU), WOUB/WOUC (WOUB), WVPB (WVPB), WKYU-PBS (WKYU), KERA 13 (KERA), MPBN (WCBB), Mountain Lake PBS (WCFE), NHPTV (WENH), Vermont PBS (WETK), witf (WITF), WQED Multimedia (WQED), WMHT Educational Telecommunications (WMHT), Q-TV (WDCQ), WTVS Detroit Public TV (WTVS), CMU Public Television (WCMU), WKAR-TV (WKAR), WNMU-TV Public TV 13 (WNMU), WDSE - WRPT (WDSE), WGTE TV (WGTE), Lakeland Public Television (KAWE), KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS), MontanaPBS (KUSM), KRWG/Channel 22 (KRWG), KACV (KACV), KCOS/Channel 13 (KCOS), WCNY/Channel 24 (WCNY), WNED (WNED), WPBS (WPBS), WSKG Public TV (WSKG), WXXI (WXXI), WPSU (WPSU), WVIA Public Media Studios (WVIA), WTVI (WTVI), Western Reserve PBS (WNEO), WVIZ/PBS ideastream (WVIZ), KCTS 9 (KCTS), Basin PBS (KPBT), KUHT / Channel 8 (KUHT), KLRN (KLRN), KLRU 
(KLRU), WTJX Channel 12 (WTJX), WCVE PBS (WCVE), KBTC Public Television (KBTC) - **pcmag** + - **PearVideo** - **People** - **periscope**: Periscope - **periscope:user**: Periscope user videos @@ -610,7 +629,9 @@ - **Pokemon** - **PolskieRadio** - **PolskieRadioCategory** + - **PopcornTV** - **PornCom** + - **PornerBros** - **PornFlip** - **PornHd** - **PornHub**: PornHub and Thumbzilla @@ -619,6 +640,7 @@ - **Pornotube** - **PornoVoisines** - **PornoXO** + - **PornTube** - **PressTV** - **PrimeShareTV** - **PromptFile** @@ -640,9 +662,12 @@ - **RadioJavan** - **Rai** - **RaiPlay** + - **RaiPlayLive** - **RBMARadio** - **RDS**: RDS.ca - **RedBullTV** + - **Reddit** + - **RedditR** - **RedTube** - **RegioTV** - **RENTV** @@ -682,8 +707,10 @@ - **rutube:embed**: Rutube embedded videos - **rutube:movie**: Rutube movies - **rutube:person**: Rutube person videos + - **rutube:playlist**: Rutube playlists - **RUTV**: RUTV.RU - **Ruutu** + - **Ruv** - **safari**: safaribooksonline.com online video - **safari:api** - **safari:course**: safaribooksonline.com online courses @@ -722,6 +749,7 @@ - **soundcloud:playlist** - **soundcloud:search**: Soundcloud search - **soundcloud:set** + - **soundcloud:trackstation** - **soundcloud:user** - **soundgasm** - **soundgasm:profile** @@ -762,13 +790,13 @@ - **Tagesschau** - **tagesschau:player** - **Tass** - - **TBS** + - **TastyTrade** + - **TBS** (Currently broken) - **TDSLifeway** - **teachertube**: teachertube.com videos - **teachertube:user:collection**: teachertube.com user and collection videos - **TeachingChannel** - **Teamcoco** - - **TeamFourStar** - **TechTalks** - **techtv.mit.edu** - **ted** @@ -803,16 +831,13 @@ - **ToonGoggles** - **Tosh**: Tosh.0 - **tou.tv** - - **Toypics**: Toypics user profile + - **Toypics**: Toypics video - **ToypicsUser**: Toypics user profile - **TrailerAddict** (Currently broken) - **Trilulilu** - **TruTV** - **Tube8** - **TubiTv** - - **tudou** - - **tudou:album** - - **tudou:playlist** - 
**Tumblr** - **tunein:clip** - **tunein:program** @@ -936,13 +961,15 @@ - **vk:wallpost** - **vlive** - **vlive:channel** + - **vlive:playlist** - **Vodlocker** - **VODPl** - **VODPlatform** - **VoiceRepublic** + - **Voot** - **VoxMedia** - **Vporn** - - **vpro**: npo.nl and ntr.nl + - **vpro**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - **Vrak** - **VRT**: deredactie.be, sporza.be, cobra.be and cobra.canvas.be - **vrv** @@ -957,6 +984,7 @@ - **washingtonpost** - **washingtonpost:article** - **wat.tv** + - **WatchBox** - **WatchIndianPorn**: Watch Indian Porn - **WDR** - **wdr:mobile** @@ -968,7 +996,7 @@ - **wholecloud**: WholeCloud - **Wimp** - **Wistia** - - **wnl**: npo.nl and ntr.nl + - **wnl**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - **WorldStarHipHop** - **wrzuta.pl** - **wrzuta.pl:playlist** @@ -976,7 +1004,7 @@ - **WSJArticle** - **XBef** - **XboxClips** - - **XFileShare**: XFileShare based sites: DaClips, FileHoot, GorillaVid, MovPod, PowerWatch, Rapidvideo.ws, TheVideoBee, Vidto, Streamin.To, XVIDSTAGE, Vid ABC, VidBom, vidlo + - **XFileShare**: XFileShare based sites: DaClips, FileHoot, GorillaVid, MovPod, PowerWatch, Rapidvideo.ws, TheVideoBee, Vidto, Streamin.To, XVIDSTAGE, Vid ABC, VidBom, vidlo, RapidVideo.TV, FastVideo.me - **XHamster** - **XHamsterEmbed** - **xiami:album**: 虾米音乐 - 专辑 @@ -992,7 +1020,7 @@ - **XVideos** - **XXXYMovies** - **Yahoo**: Yahoo screen and movies - - **Yam**: 蕃薯藤yam天空部落 + - **YandexDisk** - **yandexmusic:album**: Яндекс.Музыка - Альбом - **yandexmusic:playlist**: Яндекс.Музыка - Плейлист - **yandexmusic:track**: Яндекс.Музыка - Трек diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index 6f52e11..f18a823 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -10,6 +10,7 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from test.helper import FakeYDL, expect_dict, expect_value +from youtube_dl.compat import 
compat_etree_fromstring from youtube_dl.extractor.common import InfoExtractor from youtube_dl.extractor import YoutubeIE, get_info_extractor from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError, RegexNotFoundError @@ -488,6 +489,91 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ self.ie._sort_formats(formats) expect_value(self, formats, expected_formats, None) + def test_parse_mpd_formats(self): + _TEST_CASES = [ + ( + # https://github.com/rg3/youtube-dl/issues/13919 + 'float_duration', + 'http://unknown/manifest.mpd', + [{ + 'manifest_url': 'http://unknown/manifest.mpd', + 'ext': 'mp4', + 'format_id': '318597', + 'format_note': 'DASH video', + 'protocol': 'http_dash_segments', + 'acodec': 'none', + 'vcodec': 'avc1.42001f', + 'tbr': 318.597, + 'width': 340, + 'height': 192, + }, { + 'manifest_url': 'http://unknown/manifest.mpd', + 'ext': 'mp4', + 'format_id': '638590', + 'format_note': 'DASH video', + 'protocol': 'http_dash_segments', + 'acodec': 'none', + 'vcodec': 'avc1.42001f', + 'tbr': 638.59, + 'width': 512, + 'height': 288, + }, { + 'manifest_url': 'http://unknown/manifest.mpd', + 'ext': 'mp4', + 'format_id': '1022565', + 'format_note': 'DASH video', + 'protocol': 'http_dash_segments', + 'acodec': 'none', + 'vcodec': 'avc1.4d001f', + 'tbr': 1022.565, + 'width': 688, + 'height': 384, + }, { + 'manifest_url': 'http://unknown/manifest.mpd', + 'ext': 'mp4', + 'format_id': '2046506', + 'format_note': 'DASH video', + 'protocol': 'http_dash_segments', + 'acodec': 'none', + 'vcodec': 'avc1.4d001f', + 'tbr': 2046.506, + 'width': 1024, + 'height': 576, + }, { + 'manifest_url': 'http://unknown/manifest.mpd', + 'ext': 'mp4', + 'format_id': '3998017', + 'format_note': 'DASH video', + 'protocol': 'http_dash_segments', + 'acodec': 'none', + 'vcodec': 'avc1.640029', + 'tbr': 3998.017, + 'width': 1280, + 'height': 720, + }, { + 'manifest_url': 'http://unknown/manifest.mpd', + 'ext': 'mp4', + 'format_id': '5997485', + 
'format_note': 'DASH video', + 'protocol': 'http_dash_segments', + 'acodec': 'none', + 'vcodec': 'avc1.640032', + 'tbr': 5997.485, + 'width': 1920, + 'height': 1080, + }] + ), + ] + + for mpd_file, mpd_url, expected_formats in _TEST_CASES: + with io.open('./test/testdata/mpd/%s.mpd' % mpd_file, + mode='r', encoding='utf-8') as f: + formats = self.ie._parse_mpd_formats( + compat_etree_fromstring(f.read().encode('utf-8')), + mpd_url=mpd_url) + self.ie._sort_formats(formats) + expect_value(self, formats, expected_formats, None) + if __name__ == '__main__': unittest.main() diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 75945e3..e70cbcd 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -41,6 +41,7 @@ def _make_result(formats, **kwargs): 'id': 'testid', 'title': 'testttitle', 'extractor': 'testex', + 'extractor_key': 'TestEx', } res.update(**kwargs) return res @@ -370,6 +371,19 @@ class TestFormatSelection(unittest.TestCase): ydl = YDL({'format': 'best[height>360]'}) self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy()) + def test_format_selection_issue_10083(self): + # See https://github.com/rg3/youtube-dl/issues/10083 + formats = [ + {'format_id': 'regular', 'height': 360, 'url': TEST_URL}, + {'format_id': 'video', 'height': 720, 'acodec': 'none', 'url': TEST_URL}, + {'format_id': 'audio', 'vcodec': 'none', 'url': TEST_URL}, + ] + info_dict = _make_result(formats) + + ydl = YDL({'format': 'best[height>360]/bestvideo[height>360]+bestaudio'}) + ydl.process_ie_result(info_dict.copy()) + self.assertEqual(ydl.downloaded_info_dicts[0]['format_id'], 'video+audio') + def test_invalid_format_specs(self): def assert_syntax_error(format_spec): ydl = YDL({'format': format_spec}) @@ -448,6 +462,17 @@ class TestFormatSelection(unittest.TestCase): pass self.assertEqual(ydl.downloaded_info_dicts, []) + def test_default_format_spec(self): + ydl = YDL({'simulate': True}) + self.assertEqual(ydl._default_format_spec({}), 
'bestvideo+bestaudio/best') + + ydl = YDL({'outtmpl': '-'}) + self.assertEqual(ydl._default_format_spec({}), 'best') + + ydl = YDL({}) + self.assertEqual(ydl._default_format_spec({}, download=False), 'bestvideo+bestaudio/best') + self.assertEqual(ydl._default_format_spec({'is_live': True}), 'best') + class TestYoutubeDL(unittest.TestCase): def test_subtitles(self): @@ -527,6 +552,8 @@ class TestYoutubeDL(unittest.TestCase): 'ext': 'mp4', 'width': None, 'height': 1080, + 'title1': '$PATH', + 'title2': '%PATH%', } def fname(templ): @@ -545,10 +572,14 @@ class TestYoutubeDL(unittest.TestCase): self.assertEqual(fname('%(height)0 6d.%(ext)s'), ' 01080.mp4') self.assertEqual(fname('%(height)0 6d.%(ext)s'), ' 01080.mp4') self.assertEqual(fname('%(height) 0 6d.%(ext)s'), ' 01080.mp4') + self.assertEqual(fname('%%'), '%') + self.assertEqual(fname('%%%%'), '%%') self.assertEqual(fname('%%(height)06d.%(ext)s'), '%(height)06d.mp4') self.assertEqual(fname('%(width)06d.%(ext)s'), 'NA.mp4') self.assertEqual(fname('%(width)06d.%%(ext)s'), 'NA.%(ext)s') self.assertEqual(fname('%%(width)06d.%(ext)s'), '%(width)06d.mp4') + self.assertEqual(fname('Hello %(title1)s'), 'Hello $PATH') + self.assertEqual(fname('Hello %(title2)s'), 'Hello %PATH%') def test_format_note(self): ydl = YoutubeDL() @@ -755,7 +786,8 @@ class TestYoutubeDL(unittest.TestCase): '_type': 'url_transparent', 'url': 'foo2:', 'ie_key': 'Foo2', - 'title': 'foo1 title' + 'title': 'foo1 title', + 'id': 'foo1_id', } class Foo2IE(InfoExtractor): @@ -781,6 +813,9 @@ class TestYoutubeDL(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] self.assertEqual(downloaded['url'], TEST_URL) self.assertEqual(downloaded['title'], 'foo1 title') + self.assertEqual(downloaded['id'], 'testid') + self.assertEqual(downloaded['extractor'], 'testex') + self.assertEqual(downloaded['extractor_key'], 'TestEx') if __name__ == '__main__': diff --git a/test/test_options.py b/test/test_options.py new file mode 100644 index 0000000..3a25a6b 
--- /dev/null +++ b/test/test_options.py @@ -0,0 +1,26 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.options import _hide_login_info + + +class TestOptions(unittest.TestCase): + def test_hide_login_info(self): + self.assertEqual(_hide_login_info(['-u', 'foo', '-p', 'bar']), + ['-u', 'PRIVATE', '-p', 'PRIVATE']) + self.assertEqual(_hide_login_info(['-u']), ['-u']) + self.assertEqual(_hide_login_info(['-u', 'foo', '-u', 'bar']), + ['-u', 'PRIVATE', '-u', 'PRIVATE']) + self.assertEqual(_hide_login_info(['--username=foo']), + ['--username=PRIVATE']) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index f31559e..efa73d0 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -98,6 +98,7 @@ from youtube_dl.compat import ( compat_chr, compat_etree_fromstring, compat_getenv, + compat_os_name, compat_setenv, compat_urlparse, compat_parse_qs, @@ -278,6 +279,7 @@ class TestUtil(unittest.TestCase): self.assertEqual(unescapeHTML('/'), '/') self.assertEqual(unescapeHTML('é'), 'é') self.assertEqual(unescapeHTML('�'), '�') + self.assertEqual(unescapeHTML('&a"'), '&a"') # HTML5 entities self.assertEqual(unescapeHTML('.''), '.\'') @@ -340,6 +342,7 @@ class TestUtil(unittest.TestCase): self.assertEqual(unified_timestamp('May 16, 2016 11:15 PM'), 1463440500) self.assertEqual(unified_timestamp('Feb 7, 2016 at 6:35 pm'), 1454870100) self.assertEqual(unified_timestamp('2017-03-30T17:52:41Q'), 1490896361) + self.assertEqual(unified_timestamp('Sep 11, 2013 | 5:49 AM'), 1378878540) def test_determine_ext(self): self.assertEqual(determine_ext('http://example.com/foo/bar.mp4/?download'), 'mp4') @@ -447,7 +450,9 @@ class TestUtil(unittest.TestCase): def test_shell_quote(self): args = ['ffmpeg', '-i', encodeFilename('ñ€ß\'.mp4')] - 
self.assertEqual(shell_quote(args), """ffmpeg -i 'ñ€ß'"'"'.mp4'""") + self.assertEqual( + shell_quote(args), + """ffmpeg -i 'ñ€ß'"'"'.mp4'""" if compat_os_name != 'nt' else '''ffmpeg -i "ñ€ß'.mp4"''') def test_str_to_int(self): self.assertEqual(str_to_int('123,456'), 123456) @@ -678,6 +683,14 @@ class TestUtil(unittest.TestCase): d = json.loads(stripped) self.assertEqual(d, {'status': 'success'}) + stripped = strip_jsonp('window.cb && window.cb({"status": "success"});') + d = json.loads(stripped) + self.assertEqual(d, {'status': 'success'}) + + stripped = strip_jsonp('window.cb && cb({"status": "success"});') + d = json.loads(stripped) + self.assertEqual(d, {'status': 'success'}) + def test_uppercase_escape(self): self.assertEqual(uppercase_escape('aä'), 'aä') self.assertEqual(uppercase_escape('\\U0001d550'), '𝕐') @@ -907,6 +920,8 @@ class TestUtil(unittest.TestCase): supports_outside_bmp = False if supports_outside_bmp: self.assertEqual(extract_attributes(''), {'x': 'Smile \U0001f600!'}) + # Malformed HTML should not break attributes extraction on older Python + self.assertEqual(extract_attributes(''), {}) def test_clean_html(self): self.assertEqual(clean_html('a:\nb'), 'a: b') @@ -921,7 +936,7 @@ class TestUtil(unittest.TestCase): def test_args_to_str(self): self.assertEqual( args_to_str(['foo', 'ba/r', '-baz', '2 be', '']), - 'foo ba/r -baz \'2 be\' \'\'' + 'foo ba/r -baz \'2 be\' \'\'' if compat_os_name != 'nt' else 'foo ba/r -baz "2 be" ""' ) def test_parse_filesize(self): @@ -1049,7 +1064,7 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4')

Ignored, three

- ''' + '''.encode('utf-8') srt_data = '''1 00:00:00,000 --> 00:00:01,000 The following line contains Chinese characters and special symbols @@ -1074,7 +1089,7 @@ Line

The first line

- ''' + '''.encode('utf-8') srt_data = '''1 00:00:00,000 --> 00:00:01,000 The first line @@ -1100,7 +1115,7 @@ The first line

inner
style

-''' +'''.encode('utf-8') srt_data = '''1 00:00:02,080 --> 00:00:05,839 default stylecustom style @@ -1123,6 +1138,26 @@ part 3 ''' self.assertEqual(dfxp2srt(dfxp_data_with_style), srt_data) + dfxp_data_non_utf8 = ''' + + +
+

Line 1

+

第二行

+
+ +
'''.encode('utf-16') + srt_data = '''1 +00:00:00,000 --> 00:00:01,000 +Line 1 + +2 +00:00:01,000 --> 00:00:02,000 +第二行 + +''' + self.assertEqual(dfxp2srt(dfxp_data_non_utf8), srt_data) + def test_cli_option(self): self.assertEqual(cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy'), ['--proxy', '127.0.0.1:3128']) self.assertEqual(cli_option({'proxy': None}, '--proxy', 'proxy'), []) @@ -1168,6 +1203,10 @@ part 3 cli_bool_option( {'nocheckcertificate': False}, '--check-certificate', 'nocheckcertificate', 'false', 'true', '='), ['--check-certificate=true']) + self.assertEqual( + cli_bool_option( + {}, '--check-certificate', 'nocheckcertificate', 'false', 'true', '='), + []) def test_ohdave_rsa_encrypt(self): N = 0xab86b6371b5318aaa1d3c9e612a9f1264f372323c8c0f19875b5fc3b3fd3afcc1e5bec527aa94bfa85bffc157e4245aebda05389a5357b75115ac94f074aefcd @@ -1217,6 +1256,12 @@ part 3 self.assertEqual(get_element_by_attribute('class', 'foo', html), None) self.assertEqual(get_element_by_attribute('class', 'no-such-foo', html), None) + html = ''' + + ''' + + self.assertEqual(get_element_by_attribute('itemprop', 'author', html), 'foo') + def test_get_elements_by_class(self): html = ''' nicealso nice diff --git a/test/test_youtube_chapters.py b/test/test_youtube_chapters.py index cb12f83..324ca85 100644 --- a/test/test_youtube_chapters.py +++ b/test/test_youtube_chapters.py @@ -254,6 +254,13 @@ class TestYoutubeChapters(unittest.TestCase): 'title': '3 - Из серпов луны...[Iz serpov luny]', }] ), + ( + # https://www.youtube.com/watch?v=xZW70zEasOk + # time point more than duration + '''● LCS Spring finals: Saturday and Sunday from 13:30 outside the venue!
● PAX East: Fri, Sat & Sun - more info in tomorrows video on the main channel!''', + 283, + [] + ), ] def test_youtube_chapters(self): diff --git a/youtube-dl b/youtube-dl index bf4b72a..b87f23e 100755 Binary files a/youtube-dl and b/youtube-dl differ diff --git a/youtube-dl.1 b/youtube-dl.1 index 49818dc..6c8c7bc 100644 --- a/youtube-dl.1 +++ b/youtube-dl.1 @@ -242,11 +242,11 @@ Do not download any videos with more than COUNT views .TP .B \-\-match\-filter \f[I]FILTER\f[] Generic video filter. -Specify any key (see help for \-o for a list of available keys) to match -if the key is present, !key to check if the key is not present, key > -NUMBER (like "comment_count > 12", also works with >=, <, <=, !=, =) to -compare against a number, key = \[aq]LITERAL\[aq] (like "uploader = -\[aq]Mike Smith\[aq]", also works with !=) to match against a string +Specify any key (see the "OUTPUT TEMPLATE" for a list of available keys) +to match if the key is present, !key to check if the key is not present, +key > NUMBER (like "comment_count > 12", also works with >=, <, <=, !=, +=) to compare against a number, key = \[aq]LITERAL\[aq] (like "uploader += \[aq]Mike Smith\[aq]", also works with !=) to match against a string literal and & to require multiple matches. Values which are not known are excluded unless you put a question mark (?) after the operator. @@ -550,7 +550,7 @@ Simulate, quiet but print output format .TP .B \-j, \-\-dump\-json Simulate, quiet but print JSON information. -See \-\-output for a description of available keys. +See the "OUTPUT TEMPLATE" for a description of available keys. 
.RS .RE .TP @@ -894,7 +894,8 @@ Example: \-\-exec \[aq]adb push {} /sdcard/Music/ && rm {}\[aq] .RE .TP .B \-\-convert\-subs \f[I]FORMAT\f[] -Convert the subtitles to other format (currently supported: srt|ass|vtt) +Convert the subtitles to other format (currently supported: +srt|ass|vtt|lrc) .RS .RE .SH CONFIGURATION @@ -952,7 +953,7 @@ pass credentials as command line arguments on every youtube\-dl execution and prevent tracking plain text passwords in the shell command history. You can achieve this using a \f[C]\&.netrc\f[] -file (http://stackoverflow.com/tags/.netrc/info) on a per extractor +file (https://stackoverflow.com/tags/.netrc/info) on a per extractor basis. For that you will need to create a \f[C]\&.netrc\f[] file in your \f[C]$HOME\f[] and restrict permissions to read/write by only you: @@ -989,6 +990,13 @@ file (#configuration). .PP On Windows you may also need to setup the \f[C]%HOME%\f[] environment variable manually. +For example: +.IP +.nf +\f[C] +set\ HOME=%USERPROFILE% +\f[] +.fi .SH OUTPUT TEMPLATE .PP The \f[C]\-o\f[] option allows users to indicate a template for the @@ -998,7 +1006,7 @@ output file names. .PP The basic usage is not to set any template arguments when downloading a single file, like in -\f[C]youtube\-dl\ \-o\ funny_video.flv\ "http://some/video"\f[]. +\f[C]youtube\-dl\ \-o\ funny_video.flv\ "https://some/video"\f[]. However, it may contain special sequences that will be replaced when downloading each video. 
The special sequences may be formatted according to python string @@ -1114,32 +1122,60 @@ padded with leading zeros according to the total length of the playlist \f[C]playlist_title\f[] (string): Playlist title .PP Available for the video that belongs to some logical chapter or section: -\- \f[C]chapter\f[] (string): Name or title of the chapter the video -belongs to \- \f[C]chapter_number\f[] (numeric): Number of the chapter -the video belongs to \- \f[C]chapter_id\f[] (string): Id of the chapter -the video belongs to +.IP \[bu] 2 +\f[C]chapter\f[] (string): Name or title of the chapter the video +belongs to +.IP \[bu] 2 +\f[C]chapter_number\f[] (numeric): Number of the chapter the video +belongs to +.IP \[bu] 2 +\f[C]chapter_id\f[] (string): Id of the chapter the video belongs to .PP Available for the video that is an episode of some series or programme: -\- \f[C]series\f[] (string): Title of the series or programme the video -episode belongs to \- \f[C]season\f[] (string): Title of the season the -video episode belongs to \- \f[C]season_number\f[] (numeric): Number of -the season the video episode belongs to \- \f[C]season_id\f[] (string): -Id of the season the video episode belongs to \- \f[C]episode\f[] -(string): Title of the video episode \- \f[C]episode_number\f[] -(numeric): Number of the video episode within a season \- +.IP \[bu] 2 +\f[C]series\f[] (string): Title of the series or programme the video +episode belongs to +.IP \[bu] 2 +\f[C]season\f[] (string): Title of the season the video episode belongs +to +.IP \[bu] 2 +\f[C]season_number\f[] (numeric): Number of the season the video episode +belongs to +.IP \[bu] 2 +\f[C]season_id\f[] (string): Id of the season the video episode belongs +to +.IP \[bu] 2 +\f[C]episode\f[] (string): Title of the video episode +.IP \[bu] 2 +\f[C]episode_number\f[] (numeric): Number of the video episode within a +season +.IP \[bu] 2 \f[C]episode_id\f[] (string): Id of the video episode .PP -Available for the media that is 
a track or a part of a music album: \- -\f[C]track\f[] (string): Title of the track \- \f[C]track_number\f[] -(numeric): Number of the track within an album or a disc \- -\f[C]track_id\f[] (string): Id of the track \- \f[C]artist\f[] (string): -Artist(s) of the track \- \f[C]genre\f[] (string): Genre(s) of the track -\- \f[C]album\f[] (string): Title of the album the track belongs to \- -\f[C]album_type\f[] (string): Type of the album \- \f[C]album_artist\f[] -(string): List of all artists appeared on the album \- +Available for the media that is a track or a part of a music album: +.IP \[bu] 2 +\f[C]track\f[] (string): Title of the track +.IP \[bu] 2 +\f[C]track_number\f[] (numeric): Number of the track within an album or +a disc +.IP \[bu] 2 +\f[C]track_id\f[] (string): Id of the track +.IP \[bu] 2 +\f[C]artist\f[] (string): Artist(s) of the track +.IP \[bu] 2 +\f[C]genre\f[] (string): Genre(s) of the track +.IP \[bu] 2 +\f[C]album\f[] (string): Title of the album the track belongs to +.IP \[bu] 2 +\f[C]album_type\f[] (string): Type of the album +.IP \[bu] 2 +\f[C]album_artist\f[] (string): List of all artists appeared on the +album +.IP \[bu] 2 \f[C]disc_number\f[] (numeric): Number of the disc or other physical -medium the track belongs to \- \f[C]release_year\f[] (numeric): Year -(YYYY) when the album was released +medium the track belongs to +.IP \[bu] 2 +\f[C]release_year\f[] (numeric): Year (YYYY) when the album was released .PP Each aforementioned sequence when referenced in an output template will be replaced by the actual value corresponding to the sequence name. @@ -1185,7 +1221,8 @@ environment variables for expansion should stay intact: \f[C]\-o\ "C:\\%HOMEPATH%\\Desktop\\%%(title)s.%%(ext)s"\f[]. .SS Output template examples .PP -Note on Windows you may need to use double quotes instead of single. +Note that on Windows you may need to use double quotes instead of +single. 
.IP .nf \f[C] @@ -1205,7 +1242,7 @@ $\ youtube\-dl\ \-o\ \[aq]%(uploader)s/%(playlist)s/%(playlist_index)s\ \-\ %(ti $\ youtube\-dl\ \-u\ user\ \-p\ password\ \-o\ \[aq]~/MyVideos/%(playlist)s/%(chapter_number)s\ \-\ %(chapter)s/%(title)s.%(ext)s\[aq]\ https://www.udemy.com/java\-tutorial/ #\ Download\ entire\ series\ season\ keeping\ each\ series\ and\ each\ season\ in\ separate\ directory\ under\ C:/MyVideos -$\ youtube\-dl\ \-o\ "C:/MyVideos/%(series)s/%(season_number)s\ \-\ %(season)s/%(episode_number)s\ \-\ %(episode)s.%(ext)s"\ http://videomore.ru/kino_v_detalayah/5_sezon/367617 +$\ youtube\-dl\ \-o\ "C:/MyVideos/%(series)s/%(season_number)s\ \-\ %(season)s/%(episode_number)s\ \-\ %(episode)s.%(ext)s"\ https://videomore.ru/kino_v_detalayah/5_sezon/367617 #\ Stream\ the\ video\ being\ downloaded\ to\ stdout $\ youtube\-dl\ \-o\ \-\ BaW_jenozKc @@ -1303,9 +1340,9 @@ in use \- \f[C]vcodec\f[]: Name of the video codec in use \- \f[C]container\f[]: Name of the container format \- \f[C]protocol\f[]: The protocol that will be used for the actual download, lower\-case (\f[C]http\f[], \f[C]https\f[], \f[C]rtsp\f[], \f[C]rtmp\f[], -\f[C]rtmpe\f[], \f[C]mms\f[], \f[C]f4m\f[], \f[C]ism\f[], \f[C]m3u8\f[], -or \f[C]m3u8_native\f[]) \- \f[C]format_id\f[]: A short description of -the format +\f[C]rtmpe\f[], \f[C]mms\f[], \f[C]f4m\f[], \f[C]ism\f[], +\f[C]http_dash_segments\f[], \f[C]m3u8\f[], or \f[C]m3u8_native\f[]) \- +\f[C]format_id\f[]: A short description of the format .PP Note that none of the aforementioned meta fields are guaranteed to be present since this solely depends on the metadata obtained by particular @@ -1360,7 +1397,8 @@ You may want to add it to the configuration file (#configuration) in order not to type it every time you run youtube\-dl. .SS Format selection examples .PP -Note on Windows you may need to use double quotes instead of single. +Note that on Windows you may need to use double quotes instead of +single. 
.IP .nf \f[C] @@ -1412,7 +1450,7 @@ $\ youtube\-dl\ \-\-dateafter\ 20000101\ \-\-datebefore\ 20091231 .SS How do I update youtube\-dl? .PP If you\[aq]ve followed our manual installation -instructions (http://rg3.github.io/youtube-dl/download.html), you can +instructions (https://rg3.github.io/youtube-dl/download.html), you can simply run \f[C]youtube\-dl\ \-U\f[] (or, on Linux, \f[C]sudo\ youtube\-dl\ \-U\f[]). .PP @@ -1425,7 +1463,7 @@ mechanism to update. Note that distribution packages are often outdated. As a rule of thumb, youtube\-dl releases at least once a month, and often weekly or even daily. -Simply go to http://yt\-dl.org/ to find out the current version. +Simply go to https://yt\-dl.org to find out the current version. Unfortunately, there is nothing we youtube\-dl developers can do if your distribution serves a really outdated version. You can (and should) complain to your distribution in their bugtracker @@ -1442,7 +1480,7 @@ sudo\ apt\-get\ remove\ \-y\ youtube\-dl .fi .PP Afterwards, simply follow our manual installation -instructions (http://rg3.github.io/youtube-dl/download.html): +instructions (https://rg3.github.io/youtube-dl/download.html): .IP .nf \f[C] @@ -1525,12 +1563,13 @@ automatically pick the best option. Videos or video formats streamed via RTMP protocol can only be downloaded when rtmpdump (https://rtmpdump.mplayerhq.hu/) is installed. Downloading MMS and RTSP videos requires either -mplayer (http://mplayerhq.hu/) or mpv (https://mpv.io/) to be installed. +mplayer (https://mplayerhq.hu/) or mpv (https://mpv.io/) to be +installed. .SS I have downloaded a video but how can I play it? .PP Once the video is fully downloaded, use any video player, such as -mpv (https://mpv.io/), vlc (http://www.videolan.org/) or -mplayer (http://www.mplayerhq.hu/). +mpv (https://mpv.io/), vlc (https://www.videolan.org/) or +mplayer (https://www.mplayerhq.hu/). 
.SS I extracted a video URL with \f[C]\-g\f[], but it does not play on another machine / in my web browser. .PP @@ -1663,13 +1702,13 @@ If you want this for all of your downloads, put the option into your configuration file (#configuration). .SS How do I download a video starting with a \f[C]\-\f[]? .PP -Either prepend \f[C]http://www.youtube.com/watch?v=\f[] or separate the +Either prepend \f[C]https://www.youtube.com/watch?v=\f[] or separate the ID from the options with \f[C]\-\-\f[]: .IP .nf \f[C] youtube\-dl\ \-\-\ \-wNyEUrxzFU -youtube\-dl\ "http://www.youtube.com/watch?v=\-wNyEUrxzFU" +youtube\-dl\ "https://www.youtube.com/watch?v=\-wNyEUrxzFU" \f[] .fi .SS How do I pass cookies to youtube\-dl? @@ -1708,12 +1747,12 @@ YouTube, CloudFlare). You will first need to tell youtube\-dl to stream media to stdout with \f[C]\-o\ \-\f[], and also tell your media player to read from stdin (it must be capable of this for streaming) and then pipe former to latter. -For example, streaming to vlc (http://www.videolan.org/) can be achieved -with: +For example, streaming to vlc (https://www.videolan.org/) can be +achieved with: .IP .nf \f[C] -youtube\-dl\ \-o\ \-\ "http://www.youtube.com/watch?v=BaW_jenozKcj"\ |\ vlc\ \- +youtube\-dl\ \-o\ \-\ "https://www.youtube.com/watch?v=BaW_jenozKcj"\ |\ vlc\ \- \f[] .fi .SS How do I download only new videos from a playlist? @@ -1761,8 +1800,8 @@ improvements of the built\-in downloader and/or ffmpeg. .PP In particular, the generic extractor (used when your website is not in the list of supported sites by -youtube\-dl (http://rg3.github.io/youtube-dl/supportedsites.html) cannot -mandate one specific downloader. +youtube\-dl (https://rg3.github.io/youtube-dl/supportedsites.html) +cannot mandate one specific downloader. 
.PP If you put either \f[C]\-\-hls\-prefer\-native\f[] or \f[C]\-\-hls\-prefer\-ffmpeg\f[] into your configuration, a different @@ -1829,8 +1868,8 @@ Please do not declare your issue as \f[C]important\f[] or For one, have a look at the list of supported sites (docs/supportedsites.md). Note that it can sometimes happen that the site changes its URL scheme -(say, from http://example.com/video/1234567 to -http://example.com/v/1234567 ) and youtube\-dl reports an URL of a +(say, from https://example.com/video/1234567 to +https://example.com/v/1234567 ) and youtube\-dl reports an URL of a service in that list as unsupported. In that case, simply report a bug. .PP @@ -1875,7 +1914,7 @@ such as checking that your version of youtube\-dl is current. .SH DEVELOPER INSTRUCTIONS .PP Most users do not need to build youtube\-dl and can download the -builds (http://rg3.github.io/youtube-dl/download.html) or get them from +builds (https://rg3.github.io/youtube-dl/download.html) or get them from their distribution. .PP To run youtube\-dl as a developer, you don\[aq]t need to build anything @@ -1899,6 +1938,9 @@ nosetests \f[] .fi .PP +See item 6 of new extractor tutorial (#adding-support-for-a-new-site) +for how to run extractor specific test cases. 
+.PP If you want to create a build of youtube\-dl yourself, you\[aq]ll need .IP \[bu] 2 python @@ -1960,7 +2002,7 @@ from\ .common\ import\ InfoExtractor class\ YourExtractorIE(InfoExtractor): \ \ \ \ _VALID_URL\ =\ r\[aq]https?://(?:www\\.)?yourextractor\\.com/watch/(?P[0\-9]+)\[aq] \ \ \ \ _TEST\ =\ { -\ \ \ \ \ \ \ \ \[aq]url\[aq]:\ \[aq]http://yourextractor.com/watch/42\[aq], +\ \ \ \ \ \ \ \ \[aq]url\[aq]:\ \[aq]https://yourextractor.com/watch/42\[aq], \ \ \ \ \ \ \ \ \[aq]md5\[aq]:\ \[aq]TODO:\ md5\ sum\ of\ the\ first\ 10241\ bytes\ of\ the\ video\ file\ (use\ \-\-test)\[aq], \ \ \ \ \ \ \ \ \[aq]info_dict\[aq]:\ { \ \ \ \ \ \ \ \ \ \ \ \ \[aq]id\[aq]:\ \[aq]42\[aq], @@ -2005,6 +2047,8 @@ If you decide to add more than one test, then rename \f[C]_TEST\f[] to The tests will then be named \f[C]TestDownload.test_YourExtractor\f[], \f[C]TestDownload.test_YourExtractor_1\f[], \f[C]TestDownload.test_YourExtractor_2\f[], etc. +Note that tests with \f[C]only_matching\f[] key in test\[aq]s dict are +not counted in. .IP " 7." 4 Have a look at \f[C]youtube_dl/extractor/common.py\f[] (https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) @@ -2016,12 +2060,13 @@ Add tests and code for as many as you want. Make sure your code follows youtube\-dl coding conventions (#youtube-dl-coding-conventions) and check the code with flake8 (https://pypi.python.org/pypi/flake8). -Also make sure your code works under all Python (http://www.python.org/) -versions claimed supported by youtube\-dl, namely 2.6, 2.7, and 3.2+. +Also make sure your code works under all +Python (https://www.python.org/) versions claimed supported by +youtube\-dl, namely 2.6, 2.7, and 3.2+. .IP " 9." 
4 -When the tests pass, add (http://git-scm.com/docs/git-add) the new files -and commit (http://git-scm.com/docs/git-commit) them and -push (http://git-scm.com/docs/git-push) the result, like this: +When the tests pass, add (https://git-scm.com/docs/git-add) the new +files and commit (https://git-scm.com/docs/git-commit) them and +push (https://git-scm.com/docs/git-push) the result, like this: .RS 4 .IP .nf @@ -2272,7 +2317,7 @@ import\ youtube_dl ydl_opts\ =\ {} with\ youtube_dl.YoutubeDL(ydl_opts)\ as\ ydl: -\ \ \ \ ydl.download([\[aq]http://www.youtube.com/watch?v=BaW_jenozKc\[aq]]) +\ \ \ \ ydl.download([\[aq]https://www.youtube.com/watch?v=BaW_jenozKc\[aq]]) \f[] .fi .PP @@ -2319,7 +2364,7 @@ ydl_opts\ =\ { \ \ \ \ \[aq]progress_hooks\[aq]:\ [my_hook], } with\ youtube_dl.YoutubeDL(ydl_opts)\ as\ ydl: -\ \ \ \ ydl.download([\[aq]http://www.youtube.com/watch?v=BaW_jenozKc\[aq]]) +\ \ \ \ ydl.download([\[aq]https://www.youtube.com/watch?v=BaW_jenozKc\[aq]]) \f[] .fi .SH BUGS @@ -2331,7 +2376,7 @@ GitHub fails to accept the bug report), please do not send bug reports via personal email. For discussions, join us in the IRC channel #youtube\-dl (irc://chat.freenode.net/#youtube-dl) on freenode -(webchat (http://webchat.freenode.net/?randomnick=1&channels=youtube-dl)). +(webchat (https://webchat.freenode.net/?randomnick=1&channels=youtube-dl)). .PP \f[B]Please include the full output of youtube\-dl when run with \f[C]\-v\f[]\f[], i.e. 
@@ -2345,7 +2390,7 @@ It should look similar to this: $\ youtube\-dl\ \-v\ [debug]\ System\ config:\ [] [debug]\ User\ config:\ [] -[debug]\ Command\-line\ args:\ [u\[aq]\-v\[aq],\ u\[aq]http://www.youtube.com/watch?v=BaW_jenozKcj\[aq]] +[debug]\ Command\-line\ args:\ [u\[aq]\-v\[aq],\ u\[aq]https://www.youtube.com/watch?v=BaW_jenozKcj\[aq]] [debug]\ Encodings:\ locale\ cp1251,\ fs\ mbcs,\ out\ cp866,\ pref\ cp1251 [debug]\ youtube\-dl\ version\ 2015.12.06 [debug]\ Git\ HEAD:\ 135392e @@ -2410,11 +2455,11 @@ command\-line) or upload the \f[C]\&.dump\f[] files you get when you add .PP \f[B]Site support requests must contain an example URL\f[]. An example URL is a URL you might want to download, like -\f[C]http://www.youtube.com/watch?v=BaW_jenozKc\f[]. +\f[C]https://www.youtube.com/watch?v=BaW_jenozKc\f[]. There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. -\f[C]http://www.youtube.com/\f[]) is \f[I]not\f[] an example URL. +\f[C]https://www.youtube.com/\f[]) is \f[I]not\f[] an example URL. .SS Are you using the latest version? .PP Before reporting any issue, type \f[C]youtube\-dl\ \-U\f[]. diff --git a/youtube-dl.fish b/youtube-dl.fish index 3778979..00d1845 100644 --- a/youtube-dl.fish +++ b/youtube-dl.fish @@ -38,7 +38,7 @@ complete --command youtube-dl --long-option datebefore --description 'Download o complete --command youtube-dl --long-option dateafter --description 'Download only videos uploaded on or after this date (i.e. inclusive)' complete --command youtube-dl --long-option min-views --description 'Do not download any videos with less than COUNT views' complete --command youtube-dl --long-option max-views --description 'Do not download any videos with more than COUNT views' -complete --command youtube-dl --long-option match-filter --description 'Generic video filter. 
Specify any key (see help for -o for a list of available keys) to match if the key is present, !key to check if the key is not present, key > NUMBER (like "comment_count > 12", also works with >=, <, <=, !=, =) to compare against a number, key = '"'"'LITERAL'"'"' (like "uploader = '"'"'Mike Smith'"'"'", also works with !=) to match against a string literal and & to require multiple matches. Values which are not known are excluded unless you put a question mark (?) after the operator. For example, to only match videos that have been liked more than 100 times and disliked less than 50 times (or the dislike functionality is not available at the given service), but who also have a description, use --match-filter "like_count > 100 & dislike_count NUMBER (like "comment_count > 12", also works with >=, <, <=, !=, =) to compare against a number, key = '"'"'LITERAL'"'"' (like "uploader = '"'"'Mike Smith'"'"'", also works with !=) to match against a string literal and & to require multiple matches. Values which are not known are excluded unless you put a question mark (?) after the operator. For example, to only match videos that have been liked more than 100 times and disliked less than 50 times (or the dislike functionality is not available at the given service), but who also have a description, use --match-filter "like_count > 100 & dislike_count 3.4 + + # HTMLParseError has been deprecated in Python 3.3 and removed in + # Python 3.5. 
Introducing dummy exception for Python >3.5 for compatible + # and uniform cross-version exception handling + class compat_HTMLParseError(Exception): + pass + try: from subprocess import DEVNULL compat_subprocess_get_DEVNULL = lambda: DEVNULL @@ -2604,14 +2617,22 @@ except ImportError: # Python 2 parsed_result[name] = [value] return parsed_result -try: - from shlex import quote as compat_shlex_quote -except ImportError: # Python < 3.3 + +compat_os_name = os._name if os.name == 'java' else os.name + + +if compat_os_name == 'nt': def compat_shlex_quote(s): - if re.match(r'^[-_\w./]+$', s): - return s - else: - return "'" + s.replace("'", "'\"'\"'") + "'" + return s if re.match(r'^[-_\w./]+$', s) else '"%s"' % s.replace('"', '\\"') +else: + try: + from shlex import quote as compat_shlex_quote + except ImportError: # Python < 3.3 + def compat_shlex_quote(s): + if re.match(r'^[-_\w./]+$', s): + return s + else: + return "'" + s.replace("'", "'\"'\"'") + "'" try: @@ -2636,9 +2657,6 @@ def compat_ord(c): return ord(c) -compat_os_name = os._name if os.name == 'java' else os.name - - if sys.version_info >= (3, 0): compat_getenv = os.getenv compat_expanduser = os.path.expanduser @@ -2880,8 +2898,16 @@ else: compat_struct_pack = struct.pack compat_struct_unpack = struct.unpack +try: + from future_builtins import zip as compat_zip +except ImportError: # not 2.6+ or is 3.x + try: + from itertools import izip as compat_zip # < 2.5 or 3.x + except ImportError: + compat_zip = zip __all__ = [ + 'compat_HTMLParseError', 'compat_HTMLParser', 'compat_HTTPError', 'compat_basestring', @@ -2929,5 +2955,6 @@ __all__ = [ 'compat_urlretrieve', 'compat_xml_parse_error', 'compat_xpath', + 'compat_zip', 'workaround_optparse_bug9161', ] diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 5d66211..75b8166 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -8,10 +8,11 @@ import random from ..compat import compat_os_name from
..utils import ( + decodeArgument, encodeFilename, error_to_compat_str, - decodeArgument, format_bytes, + shell_quote, timeconvert, ) @@ -303,11 +304,11 @@ class FileDownloader(object): """Report attempt to resume at given byte.""" self.to_screen('[download] Resuming download at byte %s' % resume_len) - def report_retry(self, count, retries): + def report_retry(self, err, count, retries): """Report retry in case of HTTP error 5xx""" self.to_screen( - '[download] Got server HTTP error. Retrying (attempt %d of %s)...' - % (count, self.format_retries(retries))) + '[download] Got server HTTP error: %s. Retrying (attempt %d of %s)...' + % (error_to_compat_str(err), count, self.format_retries(retries))) def report_file_already_downloaded(self, file_name): """Report file has already been fully downloaded.""" @@ -381,10 +382,5 @@ class FileDownloader(object): if exe is None: exe = os.path.basename(str_args[0]) - try: - import pipes - shell_quote = lambda args: ' '.join(map(pipes.quote, str_args)) - except ImportError: - shell_quote = repr self.to_screen('[debug] %s command line: %s' % ( exe, shell_quote(str_args))) diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py index 7491fda..576ece6 100644 --- a/youtube_dl/downloader/dash.py +++ b/youtube_dl/downloader/dash.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from .fragment import FragmentFD from ..compat import compat_urllib_error +from ..utils import urljoin class DashSegmentsFD(FragmentFD): @@ -12,12 +13,13 @@ class DashSegmentsFD(FragmentFD): FD_NAME = 'dashsegments' def real_download(self, filename, info_dict): - segments = info_dict['fragments'][:1] if self.params.get( + fragment_base_url = info_dict.get('fragment_base_url') + fragments = info_dict['fragments'][:1] if self.params.get( 'test', False) else info_dict['fragments'] ctx = { 'filename': filename, - 'total_frags': len(segments), + 'total_frags': len(fragments), } self._prepare_and_start_frag_download(ctx) @@ -26,7 +28,7 @@ 
class DashSegmentsFD(FragmentFD): skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) frag_index = 0 - for i, segment in enumerate(segments): + for i, fragment in enumerate(fragments): frag_index += 1 if frag_index <= ctx['fragment_index']: continue @@ -36,7 +38,11 @@ class DashSegmentsFD(FragmentFD): count = 0 while count <= fragment_retries: try: - success, frag_content = self._download_fragment(ctx, segment['url'], info_dict) + fragment_url = fragment.get('url') + if not fragment_url: + assert fragment_base_url + fragment_url = urljoin(fragment_base_url, fragment['path']) + success, frag_content = self._download_fragment(ctx, fragment_url, info_dict) if not success: return False self._append_fragment(ctx, frag_content) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index e78169a..db018fa 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -212,6 +212,11 @@ class FFmpegFD(ExternalFD): args = [ffpp.executable, '-y'] + for log_level in ('quiet', 'verbose'): + if self.params.get(log_level, False): + args += ['-loglevel', log_level] + break + seekable = info_dict.get('_seekable') if seekable is not None: # setting -seekable prevents ffmpeg from guessing if the server diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index bccc8ec..6f6fb4a 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -151,10 +151,15 @@ class FragmentFD(FileDownloader): if self.__do_ytdl_file(ctx): if os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))): self._read_ytdl_file(ctx) + if ctx['fragment_index'] > 0 and resume_len == 0: + self.report_error( + 'Inconsistent state of incomplete fragment download. 
' + 'Restarting from the beginning...') + ctx['fragment_index'] = resume_len = 0 + self._write_ytdl_file(ctx) else: self._write_ytdl_file(ctx) - if ctx['fragment_index'] > 0: - assert resume_len > 0 + assert ctx['fragment_index'] == 0 dest_stream, tmpfilename = sanitize_open(tmpfilename, open_mode) diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 0e29c8a..46308cf 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -59,9 +59,9 @@ class HlsFD(FragmentFD): man_url = info_dict['url'] self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME) - manifest = self.ydl.urlopen(self._prepare_url(info_dict, man_url)).read() - - s = manifest.decode('utf-8', 'ignore') + urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url)) + man_url = urlh.geturl() + s = urlh.read().decode('utf-8', 'ignore') if not self.can_download(s, info_dict): if info_dict.get('extra_param_to_segment_url'): diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index af405b9..8a6638c 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -22,8 +22,16 @@ from ..utils import ( class HttpFD(FileDownloader): def real_download(self, filename, info_dict): url = info_dict['url'] - tmpfilename = self.temp_name(filename) - stream = None + + class DownloadContext(dict): + __getattr__ = dict.get + __setattr__ = dict.__setitem__ + __delattr__ = dict.__delitem__ + + ctx = DownloadContext() + ctx.filename = filename + ctx.tmpfilename = self.temp_name(filename) + ctx.stream = None # Do not include the Accept-Encoding header headers = {'Youtubedl-no-compression': 'True'} @@ -38,46 +46,51 @@ class HttpFD(FileDownloader): if is_test: request.add_header('Range', 'bytes=0-%s' % str(self._TEST_FILE_SIZE - 1)) - # Establish possible resume length - if os.path.isfile(encodeFilename(tmpfilename)): - resume_len = os.path.getsize(encodeFilename(tmpfilename)) - else: - resume_len = 0 - - open_mode = 'wb' - if 
resume_len != 0: - if self.params.get('continuedl', True): - self.report_resuming_byte(resume_len) - request.add_header('Range', 'bytes=%d-' % resume_len) - open_mode = 'ab' - else: - resume_len = 0 + ctx.open_mode = 'wb' + ctx.resume_len = 0 + + if self.params.get('continuedl', True): + # Establish possible resume length + if os.path.isfile(encodeFilename(ctx.tmpfilename)): + ctx.resume_len = os.path.getsize(encodeFilename(ctx.tmpfilename)) count = 0 retries = self.params.get('retries', 0) - while count <= retries: + + class SucceedDownload(Exception): + pass + + class RetryDownload(Exception): + def __init__(self, source_error): + self.source_error = source_error + + def establish_connection(): + if ctx.resume_len != 0: + self.report_resuming_byte(ctx.resume_len) + request.add_header('Range', 'bytes=%d-' % ctx.resume_len) + ctx.open_mode = 'ab' # Establish connection try: - data = self.ydl.urlopen(request) + ctx.data = self.ydl.urlopen(request) # When trying to resume, Content-Range HTTP header of response has to be checked # to match the value of requested Range HTTP header. This is due to a webservers # that don't support resuming and serve a whole file with no Content-Range # set in response despite of requested Range (see # https://github.com/rg3/youtube-dl/issues/6057#issuecomment-126129799) - if resume_len > 0: - content_range = data.headers.get('Content-Range') + if ctx.resume_len > 0: + content_range = ctx.data.headers.get('Content-Range') if content_range: content_range_m = re.search(r'bytes (\d+)-', content_range) # Content-Range is present and matches requested Range, resume is possible - if content_range_m and resume_len == int(content_range_m.group(1)): - break + if content_range_m and ctx.resume_len == int(content_range_m.group(1)): + return # Content-Range is either not present or invalid. 
Assuming remote webserver is # trying to send the whole file, resume is not possible, so wiping the local file # and performing entire redownload self.report_unable_to_resume() - resume_len = 0 - open_mode = 'wb' - break + ctx.resume_len = 0 + ctx.open_mode = 'wb' + return except (compat_urllib_error.HTTPError, ) as err: if (err.code < 500 or err.code >= 600) and err.code != 416: # Unexpected HTTP error @@ -86,15 +99,15 @@ class HttpFD(FileDownloader): # Unable to resume (requested range not satisfiable) try: # Open the connection again without the range header - data = self.ydl.urlopen(basic_request) - content_length = data.info()['Content-Length'] + ctx.data = self.ydl.urlopen(basic_request) + content_length = ctx.data.info()['Content-Length'] except (compat_urllib_error.HTTPError, ) as err: if err.code < 500 or err.code >= 600: raise else: # Examine the reported length if (content_length is not None and - (resume_len - 100 < int(content_length) < resume_len + 100)): + (ctx.resume_len - 100 < int(content_length) < ctx.resume_len + 100)): # The file had already been fully downloaded. # Explanation to the above condition: in issue #175 it was revealed that # YouTube sometimes adds or removes a few bytes from the end of the file, @@ -102,152 +115,184 @@ class HttpFD(FileDownloader): # I decided to implement a suggested change and consider the file # completely downloaded if the file size differs less than 100 bytes from # the one in the hard drive. 
- self.report_file_already_downloaded(filename) - self.try_rename(tmpfilename, filename) + self.report_file_already_downloaded(ctx.filename) + self.try_rename(ctx.tmpfilename, ctx.filename) self._hook_progress({ - 'filename': filename, + 'filename': ctx.filename, 'status': 'finished', - 'downloaded_bytes': resume_len, - 'total_bytes': resume_len, + 'downloaded_bytes': ctx.resume_len, + 'total_bytes': ctx.resume_len, }) - return True + raise SucceedDownload() else: # The length does not match, we start the download over self.report_unable_to_resume() - resume_len = 0 - open_mode = 'wb' - break - except socket.error as e: - if e.errno != errno.ECONNRESET: + ctx.resume_len = 0 + ctx.open_mode = 'wb' + return + raise RetryDownload(err) + except socket.error as err: + if err.errno != errno.ECONNRESET: # Connection reset is no problem, just retry raise + raise RetryDownload(err) + + def download(): + data_len = ctx.data.info().get('Content-length', None) + + # Range HTTP header may be ignored/unsupported by a webserver + # (e.g. extractor/scivee.py, extractor/bambuser.py). + # However, for a test we still would like to download just a piece of a file. + # To achieve this we limit data_len to _TEST_FILE_SIZE and manually control + # block size when downloading a file. + if is_test and (data_len is None or int(data_len) > self._TEST_FILE_SIZE): + data_len = self._TEST_FILE_SIZE + + if data_len is not None: + data_len = int(data_len) + ctx.resume_len + min_data_len = self.params.get('min_filesize') + max_data_len = self.params.get('max_filesize') + if min_data_len is not None and data_len < min_data_len: + self.to_screen('\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len)) + return False + if max_data_len is not None and data_len > max_data_len: + self.to_screen('\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' 
% (data_len, max_data_len)) + return False - # Retry - count += 1 - if count <= retries: - self.report_retry(count, retries) - - if count > retries: - self.report_error('giving up after %s retries' % retries) - return False - - data_len = data.info().get('Content-length', None) - - # Range HTTP header may be ignored/unsupported by a webserver - # (e.g. extractor/scivee.py, extractor/bambuser.py). - # However, for a test we still would like to download just a piece of a file. - # To achieve this we limit data_len to _TEST_FILE_SIZE and manually control - # block size when downloading a file. - if is_test and (data_len is None or int(data_len) > self._TEST_FILE_SIZE): - data_len = self._TEST_FILE_SIZE - - if data_len is not None: - data_len = int(data_len) + resume_len - min_data_len = self.params.get('min_filesize') - max_data_len = self.params.get('max_filesize') - if min_data_len is not None and data_len < min_data_len: - self.to_screen('\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len)) - return False - if max_data_len is not None and data_len > max_data_len: - self.to_screen('\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' 
% (data_len, max_data_len)) - return False - - byte_counter = 0 + resume_len - block_size = self.params.get('buffersize', 1024) - start = time.time() + byte_counter = 0 + ctx.resume_len + block_size = self.params.get('buffersize', 1024) + start = time.time() - # measure time over whole while-loop, so slow_down() and best_block_size() work together properly - now = None # needed for slow_down() in the first loop run - before = start # start measuring - while True: + # measure time over whole while-loop, so slow_down() and best_block_size() work together properly + now = None # needed for slow_down() in the first loop run + before = start # start measuring - # Download and write - data_block = data.read(block_size if not is_test else min(block_size, data_len - byte_counter)) - byte_counter += len(data_block) + def retry(e): + if ctx.tmpfilename != '-': + ctx.stream.close() + ctx.stream = None + ctx.resume_len = os.path.getsize(encodeFilename(ctx.tmpfilename)) + raise RetryDownload(e) - # exit loop when download is finished - if len(data_block) == 0: - break + while True: + try: + # Download and write + data_block = ctx.data.read(block_size if not is_test else min(block_size, data_len - byte_counter)) + # socket.timeout is a subclass of socket.error but may not have + # errno set + except socket.timeout as e: + retry(e) + except socket.error as e: + if e.errno not in (errno.ECONNRESET, errno.ETIMEDOUT): + raise + retry(e) + + byte_counter += len(data_block) + + # exit loop when download is finished + if len(data_block) == 0: + break + + # Open destination file just in time + if ctx.stream is None: + try: + ctx.stream, ctx.tmpfilename = sanitize_open( + ctx.tmpfilename, ctx.open_mode) + assert ctx.stream is not None + ctx.filename = self.undo_temp_name(ctx.tmpfilename) + self.report_destination(ctx.filename) + except (OSError, IOError) as err: + self.report_error('unable to open for writing: %s' % str(err)) + return False + + if self.params.get('xattr_set_filesize', 
False) and data_len is not None: + try: + write_xattr(ctx.tmpfilename, 'user.ytdl.filesize', str(data_len).encode('utf-8')) + except (XAttrUnavailableError, XAttrMetadataError) as err: + self.report_error('unable to set filesize xattr: %s' % str(err)) - # Open destination file just in time - if stream is None: try: - (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode) - assert stream is not None - filename = self.undo_temp_name(tmpfilename) - self.report_destination(filename) - except (OSError, IOError) as err: - self.report_error('unable to open for writing: %s' % str(err)) + ctx.stream.write(data_block) + except (IOError, OSError) as err: + self.to_stderr('\n') + self.report_error('unable to write data: %s' % str(err)) return False - if self.params.get('xattr_set_filesize', False) and data_len is not None: - try: - write_xattr(tmpfilename, 'user.ytdl.filesize', str(data_len).encode('utf-8')) - except (XAttrUnavailableError, XAttrMetadataError) as err: - self.report_error('unable to set filesize xattr: %s' % str(err)) - - try: - stream.write(data_block) - except (IOError, OSError) as err: + # Apply rate limit + self.slow_down(start, now, byte_counter - ctx.resume_len) + + # end measuring of one loop run + now = time.time() + after = now + + # Adjust block size + if not self.params.get('noresizebuffer', False): + block_size = self.best_block_size(after - before, len(data_block)) + + before = after + + # Progress message + speed = self.calc_speed(start, now, byte_counter - ctx.resume_len) + if data_len is None: + eta = None + else: + eta = self.calc_eta(start, time.time(), data_len - ctx.resume_len, byte_counter - ctx.resume_len) + + self._hook_progress({ + 'status': 'downloading', + 'downloaded_bytes': byte_counter, + 'total_bytes': data_len, + 'tmpfilename': ctx.tmpfilename, + 'filename': ctx.filename, + 'eta': eta, + 'speed': speed, + 'elapsed': now - start, + }) + + if is_test and byte_counter == data_len: + break + + if ctx.stream is None: 
self.to_stderr('\n') - self.report_error('unable to write data: %s' % str(err)) + self.report_error('Did not get any data blocks') return False + if ctx.tmpfilename != '-': + ctx.stream.close() - # Apply rate limit - self.slow_down(start, now, byte_counter - resume_len) + if data_len is not None and byte_counter != data_len: + err = ContentTooShortError(byte_counter, int(data_len)) + if count <= retries: + retry(err) + raise err - # end measuring of one loop run - now = time.time() - after = now + self.try_rename(ctx.tmpfilename, ctx.filename) - # Adjust block size - if not self.params.get('noresizebuffer', False): - block_size = self.best_block_size(after - before, len(data_block)) - - before = after - - # Progress message - speed = self.calc_speed(start, now, byte_counter - resume_len) - if data_len is None: - eta = None - else: - eta = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len) + # Update file modification time + if self.params.get('updatetime', True): + info_dict['filetime'] = self.try_utime(ctx.filename, ctx.data.info().get('last-modified', None)) self._hook_progress({ - 'status': 'downloading', 'downloaded_bytes': byte_counter, - 'total_bytes': data_len, - 'tmpfilename': tmpfilename, - 'filename': filename, - 'eta': eta, - 'speed': speed, - 'elapsed': now - start, + 'total_bytes': byte_counter, + 'filename': ctx.filename, + 'status': 'finished', + 'elapsed': time.time() - start, }) - if is_test and byte_counter == data_len: - break - - if stream is None: - self.to_stderr('\n') - self.report_error('Did not get any data blocks') - return False - if tmpfilename != '-': - stream.close() - - if data_len is not None and byte_counter != data_len: - raise ContentTooShortError(byte_counter, int(data_len)) - self.try_rename(tmpfilename, filename) - - # Update file modification time - if self.params.get('updatetime', True): - info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None)) - - 
self._hook_progress({ - 'downloaded_bytes': byte_counter, - 'total_bytes': byte_counter, - 'filename': filename, - 'status': 'finished', - 'elapsed': time.time() - start, - }) - - return True + return True + + while count <= retries: + try: + establish_connection() + download() + return True + except RetryDownload as e: + count += 1 + if count <= retries: + self.report_retry(e.source_error, count, retries) + continue + except SucceedDownload: + return True + + self.report_error('giving up after %s retries' % retries) + return False diff --git a/youtube_dl/downloader/ism.py b/youtube_dl/downloader/ism.py index 5f6f9fa..9b001ec 100644 --- a/youtube_dl/downloader/ism.py +++ b/youtube_dl/downloader/ism.py @@ -98,7 +98,7 @@ def write_piff_header(stream, params): if is_audio: smhd_payload = s88.pack(0) # balance - smhd_payload = u16.pack(0) # reserved + smhd_payload += u16.pack(0) # reserved media_header_box = full_box(b'smhd', 0, 0, smhd_payload) # Sound Media Header else: vmhd_payload = u16.pack(0) # graphics mode @@ -126,7 +126,6 @@ def write_piff_header(stream, params): if fourcc == 'AACL': sample_entry_box = box(b'mp4a', sample_entry_payload) else: - sample_entry_payload = sample_entry_payload sample_entry_payload += u16.pack(0) # pre defined sample_entry_payload += u16.pack(0) # reserved sample_entry_payload += u32.pack(0) * 3 # pre defined diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py index 0247cab..60f753b 100644 --- a/youtube_dl/extractor/abc.py +++ b/youtube_dl/extractor/abc.py @@ -3,11 +3,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( ExtractorError, js_to_json, int_or_none, parse_iso8601, + try_get, ) @@ -124,7 +126,20 @@ class ABCIViewIE(InfoExtractor): title = video_params.get('title') or video_params['seriesTitle'] stream = next(s for s in video_params['playlist'] if s.get('type') == 'program') - formats = 
self._extract_akamai_formats(stream['hds-unmetered'], video_id) + format_urls = [ + try_get(stream, lambda x: x['hds-unmetered'], compat_str)] + + # May have higher quality video + sd_url = try_get( + stream, lambda x: x['streams']['hds']['sd'], compat_str) + if sd_url: + format_urls.append(sd_url.replace('metered', 'um')) + + formats = [] + for format_url in format_urls: + if format_url: + formats.extend( + self._extract_akamai_formats(format_url, video_id)) self._sort_formats(formats) subtitles = {} diff --git a/youtube_dl/extractor/abcnews.py b/youtube_dl/extractor/abcnews.py index 4f56c4c..f770fe9 100644 --- a/youtube_dl/extractor/abcnews.py +++ b/youtube_dl/extractor/abcnews.py @@ -7,12 +7,21 @@ import time from .amp import AMPIE from .common import InfoExtractor +from .youtube import YoutubeIE from ..compat import compat_urlparse class AbcNewsVideoIE(AMPIE): IE_NAME = 'abcnews:video' - _VALID_URL = r'https?://abcnews\.go\.com/[^/]+/video/(?P<display_id>[0-9a-z-]+)-(?P<id>\d+)' + _VALID_URL = r'''(?x) + https?:// + abcnews\.go\.com/ + (?: + [^/]+/video/(?P<display_id>[0-9a-z-]+)-| + video/embed\?.*?\bid= + ) + (?P<id>\d+) + ''' _TESTS = [{ 'url': 'http://abcnews.go.com/ThisWeek/video/week-exclusive-irans-foreign-minister-zarif-20411932', @@ -29,6 +38,9 @@ class AbcNewsVideoIE(AMPIE): # m3u8 download 'skip_download': True, }, + }, { + 'url': 'http://abcnews.go.com/video/embed?id=46979033', + 'only_matching': True, }, { 'url': 'http://abcnews.go.com/2020/video/2020-husband-stands-teacher-jail-student-affairs-26119478', 'only_matching': True, @@ -97,9 +109,7 @@ class AbcNewsIE(InfoExtractor): r'window\.abcnvideo\.url\s*=\s*"([^"]+)"', webpage, 'video URL') full_video_url = compat_urlparse.urljoin(url, video_url) - youtube_url = self._html_search_regex( - r'<iframe[^>]+src="(https://www\.youtube\.com/embed/[^"]+)"', - webpage, 'YouTube URL', default=None) + youtube_url = YoutubeIE._extract_url(webpage) timestamp = None date_str = self._html_search_regex( @@ -129,7 +139,7 @@ class AbcNewsIE(InfoExtractor):
} if youtube_url: - entries = [entry, self.url_result(youtube_url, 'Youtube')] + entries = [entry, self.url_result(youtube_url, ie=YoutubeIE.ie_key())] return self.playlist_result(entries) return entry diff --git a/youtube_dl/extractor/abcotvs.py b/youtube_dl/extractor/abcotvs.py index 76e9813..03b92a3 100644 --- a/youtube_dl/extractor/abcotvs.py +++ b/youtube_dl/extractor/abcotvs.py @@ -22,7 +22,7 @@ class ABCOTVSIE(InfoExtractor): 'display_id': 'east-bay-museum-celebrates-vintage-synthesizers', 'ext': 'mp4', 'title': 'East Bay museum celebrates vintage synthesizers', - 'description': 'md5:a4f10fb2f2a02565c1749d4adbab4b10', + 'description': 'md5:24ed2bd527096ec2a5c67b9d5a9005f3', 'thumbnail': r're:^https?://.*\.jpg$', 'timestamp': 1421123075, 'upload_date': '20150113', diff --git a/youtube_dl/extractor/adn.py b/youtube_dl/extractor/adn.py index 66caf6a..cffdab6 100644 --- a/youtube_dl/extractor/adn.py +++ b/youtube_dl/extractor/adn.py @@ -15,6 +15,7 @@ from ..utils import ( intlist_to_bytes, srt_subtitles_timecode, strip_or_none, + urljoin, ) @@ -31,25 +32,28 @@ class ADNIE(InfoExtractor): 'description': 'md5:2f7b5aa76edbc1a7a92cedcda8a528d5', } } + _BASE_URL = 'http://animedigitalnetwork.fr' def _get_subtitles(self, sub_path, video_id): if not sub_path: return None enc_subtitles = self._download_webpage( - 'http://animedigitalnetwork.fr/' + sub_path, - video_id, fatal=False) + urljoin(self._BASE_URL, sub_path), + video_id, fatal=False, headers={ + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:53.0) Gecko/20100101 Firefox/53.0', + }) if not enc_subtitles: return None # http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js dec_subtitles = intlist_to_bytes(aes_cbc_decrypt( bytes_to_intlist(base64.b64decode(enc_subtitles[24:])), - bytes_to_intlist(b'\nd\xaf\xd2J\xd0\xfc\xe1\xfc\xdf\xb61\xe8\xe1\xf0\xcc'), + bytes_to_intlist(b'\x1b\xe0\x29\x61\x38\x94\x24\x00\x12\xbd\xc5\x80\xac\xce\xbe\xb0'), 
bytes_to_intlist(base64.b64decode(enc_subtitles[:24])) )) subtitles_json = self._parse_json( - dec_subtitles[:-compat_ord(dec_subtitles[-1])], + dec_subtitles[:-compat_ord(dec_subtitles[-1])].decode(), None, fatal=False) if not subtitles_json: return None @@ -103,9 +107,18 @@ class ADNIE(InfoExtractor): metas = options.get('metas') or {} title = metas.get('title') or video_info['title'] links = player_config.get('links') or {} + error = None + if not links: + links_url = player_config['linksurl'] + links_data = self._download_json(urljoin( + self._BASE_URL, links_url), video_id) + links = links_data.get('links') or {} + error = links_data.get('error') formats = [] for format_id, qualities in links.items(): + if not isinstance(qualities, dict): + continue for load_balancer_url in qualities.values(): load_balancer_data = self._download_json( load_balancer_url, video_id, fatal=False) or {} @@ -119,7 +132,8 @@ class ADNIE(InfoExtractor): for f in m3u8_formats: f['language'] = 'fr' formats.extend(m3u8_formats) - error = options.get('error') + if not error: + error = options.get('error') if not formats and error: raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) self._sort_formats(formats) diff --git a/youtube_dl/extractor/adobepass.py b/youtube_dl/extractor/adobepass.py index 7da96c6..b83b51e 100644 --- a/youtube_dl/extractor/adobepass.py +++ b/youtube_dl/extractor/adobepass.py @@ -6,12 +6,16 @@ import time import xml.etree.ElementTree as etree from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( + compat_kwargs, + compat_urlparse, +) from ..utils import ( unescapeHTML, urlencode_postdata, unified_timestamp, ExtractorError, + NO_DEFAULT, ) @@ -21,6 +25,11 @@ MSO_INFO = { 'username_field': 'username', 'password_field': 'password', }, + 'ATTOTT': { + 'name': 'DIRECTV NOW', + 'username_field': 'email', + 'password_field': 'loginpassword', + }, 'Rogers': { 'name': 'Rogers', 'username_field': 'UserName', @@ 
-36,6 +45,11 @@ MSO_INFO = { 'username_field': 'Ecom_User_ID', 'password_field': 'Ecom_Password', }, + 'Brighthouse': { + 'name': 'Bright House Networks | Spectrum', + 'username_field': 'j_username', + 'password_field': 'j_password', + }, 'Charter_Direct': { 'name': 'Charter Spectrum', 'username_field': 'IDToken1', @@ -1308,11 +1322,14 @@ class AdobePassIE(InfoExtractor): _USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0' _MVPD_CACHE = 'ap-mvpd' + _DOWNLOADING_LOGIN_PAGE = 'Downloading Provider Login Page' + def _download_webpage_handle(self, *args, **kwargs): headers = kwargs.get('headers', {}) headers.update(self.geo_verification_headers()) kwargs['headers'] = headers - return super(AdobePassIE, self)._download_webpage_handle(*args, **kwargs) + return super(AdobePassIE, self)._download_webpage_handle( + *args, **compat_kwargs(kwargs)) @staticmethod def _get_mvpd_resource(provider_id, title, guid, rating): @@ -1356,6 +1373,21 @@ class AdobePassIE(InfoExtractor): 'Use --ap-mso to specify Adobe Pass Multiple-system operator Identifier ' 'and --ap-username and --ap-password or --netrc to provide account credentials.', expected=True) + def extract_redirect_url(html, url=None, fatal=False): + # TODO: eliminate code duplication with generic extractor and move + # redirection code into _download_webpage_handle + REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)' + redirect_url = self._search_regex( + r'(?i)\d+)' + _TEST = { + 'url': 'https://live.aliexpress.com/live/2800002704436634', + 'md5': 'e729e25d47c5e557f2630eaf99b740a5', + 'info_dict': { + 'id': '2800002704436634', + 'ext': 'mp4', + 'title': 'CASIMA7.22', + 'thumbnail': r're:http://.*\.jpg', + 'uploader': 'CASIMA Official Store', + 'timestamp': 1500717600, + 'upload_date': '20170722', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + data = self._parse_json( + self._search_regex( + 
r'(?s)runParams\s*=\s*({.+?})\s*;?\s*var', + webpage, 'runParams'), + video_id) + + title = data['title'] + + formats = self._extract_m3u8_formats( + data['replyStreamUrl'], video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + + return { + 'id': video_id, + 'title': title, + 'thumbnail': data.get('coverUrl'), + 'uploader': try_get( + data, lambda x: x['followBar']['name'], compat_str), + 'timestamp': float_or_none(data.get('startTimeLong'), scale=1000), + 'formats': formats, + } diff --git a/youtube_dl/extractor/amcnetworks.py b/youtube_dl/extractor/amcnetworks.py index 3a0ec67..dd3b18d 100644 --- a/youtube_dl/extractor/amcnetworks.py +++ b/youtube_dl/extractor/amcnetworks.py @@ -3,9 +3,10 @@ from __future__ import unicode_literals from .theplatform import ThePlatformIE from ..utils import ( - update_url_query, - parse_age_limit, int_or_none, + parse_age_limit, + try_get, + update_url_query, ) @@ -68,7 +69,8 @@ class AMCNetworksIE(ThePlatformIE): info = self._parse_theplatform_metadata(theplatform_metadata) video_id = theplatform_metadata['pid'] title = theplatform_metadata['title'] - rating = theplatform_metadata['ratings'][0]['rating'] + rating = try_get( + theplatform_metadata, lambda x: x['ratings'][0]['rating']) auth_required = self._search_regex( r'window\.authRequired\s*=\s*(true|false);', webpage, 'auth required') diff --git a/youtube_dl/extractor/americastestkitchen.py b/youtube_dl/extractor/americastestkitchen.py new file mode 100755 index 0000000..0173687 --- /dev/null +++ b/youtube_dl/extractor/americastestkitchen.py @@ -0,0 +1,85 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_html, + int_or_none, + try_get, + unified_strdate, +) + + +class AmericasTestKitchenIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com/(?:episode|videos)/(?P\d+)' + _TESTS = [{ + 'url': 'https://www.americastestkitchen.com/episode/548-summer-dinner-party', + 
'md5': 'b861c3e365ac38ad319cfd509c30577f', + 'info_dict': { + 'id': '1_5g5zua6e', + 'title': 'Summer Dinner Party', + 'ext': 'mp4', + 'description': 'md5:858d986e73a4826979b6a5d9f8f6a1ec', + 'thumbnail': r're:^https?://.*\.jpg', + 'timestamp': 1497285541, + 'upload_date': '20170612', + 'uploader_id': 'roger.metcalf@americastestkitchen.com', + 'release_date': '20170617', + 'series': "America's Test Kitchen", + 'season_number': 17, + 'episode': 'Summer Dinner Party', + 'episode_number': 24, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.americastestkitchen.com/videos/3420-pan-seared-salmon', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + partner_id = self._search_regex( + r'src=["\'](?:https?:)?//(?:[^/]+\.)kaltura\.com/(?:[^/]+/)*(?:p|partner_id)/(\d+)', + webpage, 'kaltura partner id') + + video_data = self._parse_json( + self._search_regex( + r'window\.__INITIAL_STATE__\s*=\s*({.+?})\s*;\s*', + webpage, 'initial context'), + video_id) + + ep_data = try_get( + video_data, + (lambda x: x['episodeDetail']['content']['data'], + lambda x: x['videoDetail']['content']['data']), dict) + ep_meta = ep_data.get('full_video', {}) + external_id = ep_data.get('external_id') or ep_meta['external_id'] + + title = ep_data.get('title') or ep_meta.get('title') + description = clean_html(ep_meta.get('episode_description') or ep_data.get( + 'description') or ep_meta.get('description')) + thumbnail = try_get(ep_meta, lambda x: x['photo']['image_url']) + release_date = unified_strdate(ep_data.get('aired_at')) + + season_number = int_or_none(ep_meta.get('season_number')) + episode = ep_meta.get('title') + episode_number = int_or_none(ep_meta.get('episode_number')) + + return { + '_type': 'url_transparent', + 'url': 'kaltura:%s:%s' % (partner_id, external_id), + 'ie_key': 'Kaltura', + 'title': title, + 'description': description, + 'thumbnail': 
thumbnail, + 'release_date': release_date, + 'series': "America's Test Kitchen", + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, + } diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index 9e28f25..69d3633 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -3,16 +3,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_urlparse, - compat_str, -) +from ..compat import compat_str from ..utils import ( determine_ext, extract_attributes, ExtractorError, - sanitized_Request, urlencode_postdata, + urljoin, ) @@ -21,6 +18,8 @@ class AnimeOnDemandIE(InfoExtractor): _LOGIN_URL = 'https://www.anime-on-demand.de/users/sign_in' _APPLY_HTML5_URL = 'https://www.anime-on-demand.de/html5apply' _NETRC_MACHINE = 'animeondemand' + # German-speaking countries of Europe + _GEO_COUNTRIES = ['AT', 'CH', 'DE', 'LI', 'LU'] _TESTS = [{ # jap, OmU 'url': 'https://www.anime-on-demand.de/anime/161', @@ -46,6 +45,10 @@ class AnimeOnDemandIE(InfoExtractor): # Full length film, non-series, ger/jap, Dub/OmU, account required 'url': 'https://www.anime-on-demand.de/anime/185', 'only_matching': True, + }, { + # Flash videos + 'url': 'https://www.anime-on-demand.de/anime/12', + 'only_matching': True, }] def _login(self): @@ -72,14 +75,13 @@ class AnimeOnDemandIE(InfoExtractor): 'post url', default=self._LOGIN_URL, group='url') if not post_url.startswith('http'): - post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) - - request = sanitized_Request( - post_url, urlencode_postdata(login_form)) - request.add_header('Referer', self._LOGIN_URL) + post_url = urljoin(self._LOGIN_URL, post_url) response = self._download_webpage( - request, None, 'Logging in as %s' % username) + post_url, None, 'Logging in as %s' % username, + data=urlencode_postdata(login_form), headers={ + 'Referer': self._LOGIN_URL, + 
}) if all(p not in response for p in ('>Logout<', 'href="/users/sign_out"')): error = self._search_regex( @@ -120,10 +122,11 @@ class AnimeOnDemandIE(InfoExtractor): formats = [] for input_ in re.findall( - r']+class=["\'].*?streamstarter_html5[^>]+>', html): + r']+class=["\'].*?streamstarter[^>]+>', html): attributes = extract_attributes(input_) + title = attributes.get('data-dialog-header') playlist_urls = [] - for playlist_key in ('data-playlist', 'data-otherplaylist'): + for playlist_key in ('data-playlist', 'data-otherplaylist', 'data-stream'): playlist_url = attributes.get(playlist_key) if isinstance(playlist_url, compat_str) and re.match( r'/?[\da-zA-Z]+', playlist_url): @@ -147,19 +150,38 @@ class AnimeOnDemandIE(InfoExtractor): format_id_list.append(compat_str(num)) format_id = '-'.join(format_id_list) format_note = ', '.join(filter(None, (kind, lang_note))) - request = sanitized_Request( - compat_urlparse.urljoin(url, playlist_url), + item_id_list = [] + if format_id: + item_id_list.append(format_id) + item_id_list.append('videomaterial') + playlist = self._download_json( + urljoin(url, playlist_url), video_id, + 'Downloading %s JSON' % ' '.join(item_id_list), headers={ 'X-Requested-With': 'XMLHttpRequest', 'X-CSRF-Token': csrf_token, 'Referer': url, 'Accept': 'application/json, text/javascript, */*; q=0.01', - }) - playlist = self._download_json( - request, video_id, 'Downloading %s playlist JSON' % format_id, - fatal=False) + }, fatal=False) if not playlist: continue + stream_url = playlist.get('streamurl') + if stream_url: + rtmp = re.search( + r'^(?Prtmpe?://(?P[^/]+)/(?P.+/))(?Pmp[34]:.+)', + stream_url) + if rtmp: + formats.append({ + 'url': rtmp.group('url'), + 'app': rtmp.group('app'), + 'play_path': rtmp.group('playpath'), + 'page_url': url, + 'player_url': 'https://www.anime-on-demand.de/assets/jwplayer.flash-55abfb34080700304d49125ce9ffb4a6.swf', + 'rtmp_real_time': True, + 'format_id': 'rtmp', + 'ext': 'flv', + }) + continue start_video = 
playlist.get('startvideo', 0) playlist = playlist.get('playlist') if not playlist or not isinstance(playlist, list): @@ -222,7 +244,7 @@ class AnimeOnDemandIE(InfoExtractor): f.update({ 'id': '%s-%s' % (f['id'], m.group('kind').lower()), 'title': m.group('title'), - 'url': compat_urlparse.urljoin(url, m.group('href')), + 'url': urljoin(url, m.group('href')), }) entries.append(f) diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py index 025e29a..e394cb6 100644 --- a/youtube_dl/extractor/aparat.py +++ b/youtube_dl/extractor/aparat.py @@ -3,13 +3,13 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - ExtractorError, - HEADRequest, + int_or_none, + mimetype2ext, ) class AparatIE(InfoExtractor): - _VALID_URL = r'^https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P[a-zA-Z0-9]+)' + _VALID_URL = r'https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P[a-zA-Z0-9]+)' _TEST = { 'url': 'http://www.aparat.com/v/wP8On', @@ -29,30 +29,41 @@ class AparatIE(InfoExtractor): # Note: There is an easier-to-parse configuration at # http://www.aparat.com/video/video/config/videohash/%video_id # but the URL in there does not work - embed_url = 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id - webpage = self._download_webpage(embed_url, video_id) - - file_list = self._parse_json(self._search_regex( - r'fileList\s*=\s*JSON\.parse\(\'([^\']+)\'\)', webpage, 'file list'), video_id) - for i, item in enumerate(file_list[0]): - video_url = item['file'] - req = HEADRequest(video_url) - res = self._request_webpage( - req, video_id, note='Testing video URL %d' % i, errnote=False) - if res: - break - else: - raise ExtractorError('No working video URLs found') + webpage = self._download_webpage( + 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id, + video_id) title = self._search_regex(r'\s+title:\s*"([^"]+)"', webpage, 
'title') + + file_list = self._parse_json( + self._search_regex( + r'fileList\s*=\s*JSON\.parse\(\'([^\']+)\'\)', webpage, + 'file list'), + video_id) + + formats = [] + for item in file_list[0]: + file_url = item.get('file') + if not file_url: + continue + ext = mimetype2ext(item.get('type')) + label = item.get('label') + formats.append({ + 'url': file_url, + 'ext': ext, + 'format_id': label or ext, + 'height': int_or_none(self._search_regex( + r'(\d+)[pP]', label or '', 'height', default=None)), + }) + self._sort_formats(formats) + thumbnail = self._search_regex( r'image:\s*"([^"]+)"', webpage, 'thumbnail', fatal=False) return { 'id': video_id, 'title': title, - 'url': video_url, - 'ext': 'mp4', 'thumbnail': thumbnail, 'age_limit': self._family_friendly_search(webpage), + 'formats': formats, } diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 2d55994..3f248b1 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -93,6 +93,7 @@ class ARDMediathekIE(InfoExtractor): duration = int_or_none(media_info.get('_duration')) thumbnail = media_info.get('_previewImage') + is_live = media_info.get('_isLive') is True subtitles = {} subtitle_url = media_info.get('_subtitleUrl') @@ -106,6 +107,7 @@ class ARDMediathekIE(InfoExtractor): 'id': video_id, 'duration': duration, 'thumbnail': thumbnail, + 'is_live': is_live, 'formats': formats, 'subtitles': subtitles, } @@ -166,9 +168,11 @@ class ARDMediathekIE(InfoExtractor): # determine video id from url m = re.match(self._VALID_URL, url) + document_id = None + numid = re.search(r'documentId=([0-9]+)', url) if numid: - video_id = numid.group(1) + document_id = video_id = numid.group(1) else: video_id = m.group('video_id') @@ -228,12 +232,16 @@ class ARDMediathekIE(InfoExtractor): 'formats': formats, } else: # request JSON file + if not document_id: + video_id = self._search_regex( + r'/play/(?:config|media)/(\d+)', webpage, 'media id') info = self._extract_media_info( - 
'http://www.ardmediathek.de/play/media/%s' % video_id, webpage, video_id) + 'http://www.ardmediathek.de/play/media/%s' % video_id, + webpage, video_id) info.update({ 'id': video_id, - 'title': title, + 'title': self._live_title(title) if info.get('is_live') else title, 'description': description, 'thumbnail': thumbnail, }) diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 56baef2..5cde90c 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -9,12 +9,13 @@ from ..compat import ( compat_urllib_parse_urlparse, ) from ..utils import ( + ExtractorError, find_xpath_attr, - unified_strdate, get_element_by_attribute, int_or_none, NO_DEFAULT, qualities, + unified_strdate, ) # There are different sources of video in arte.tv, the extraction process @@ -79,6 +80,13 @@ class ArteTVBaseIE(InfoExtractor): info = self._download_json(json_url, video_id) player_info = info['videoJsonPlayer'] + vsr = player_info['VSR'] + + if not vsr: + raise ExtractorError( + 'Video %s is not available' % player_info.get('VID') or video_id, + expected=True) + upload_date_str = player_info.get('shootingDate') if not upload_date_str: upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0] @@ -107,7 +115,7 @@ class ArteTVBaseIE(InfoExtractor): langcode = LANGS.get(lang, lang) formats = [] - for format_id, format_dict in player_info['VSR'].items(): + for format_id, format_dict in vsr.items(): f = dict(format_dict) versionCode = f.get('versionCode') l = re.escape(langcode) diff --git a/youtube_dl/extractor/asiancrush.py b/youtube_dl/extractor/asiancrush.py new file mode 100644 index 0000000..594c88c --- /dev/null +++ b/youtube_dl/extractor/asiancrush.py @@ -0,0 +1,93 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .kaltura import KalturaIE +from ..utils import ( + extract_attributes, + remove_end, + urlencode_postdata, +) + + +class 
AsianCrushIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?asiancrush\.com/video/(?:[^/]+/)?0+(?P\d+)v\b' + _TESTS = [{ + 'url': 'https://www.asiancrush.com/video/012869v/women-who-flirt/', + 'md5': 'c3b740e48d0ba002a42c0b72857beae6', + 'info_dict': { + 'id': '1_y4tmjm5r', + 'ext': 'mp4', + 'title': 'Women Who Flirt', + 'description': 'md5:3db14e9186197857e7063522cb89a805', + 'timestamp': 1496936429, + 'upload_date': '20170608', + 'uploader_id': 'craig@crifkin.com', + }, + }, { + 'url': 'https://www.asiancrush.com/video/she-was-pretty/011886v-pretty-episode-3/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + data = self._download_json( + 'https://www.asiancrush.com/wp-admin/admin-ajax.php', video_id, + data=urlencode_postdata({ + 'postid': video_id, + 'action': 'get_channel_kaltura_vars', + })) + + entry_id = data['entry_id'] + + return self.url_result( + 'kaltura:%s:%s' % (data['partner_id'], entry_id), + ie=KalturaIE.ie_key(), video_id=entry_id, + video_title=data.get('vid_label')) + + +class AsianCrushPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?asiancrush\.com/series/0+(?P\d+)s\b' + _TEST = { + 'url': 'https://www.asiancrush.com/series/012481s/scholar-walks-night/', + 'info_dict': { + 'id': '12481', + 'title': 'Scholar Who Walks the Night', + 'description': 'md5:7addd7c5132a09fd4741152d96cce886', + }, + 'playlist_count': 20, + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = [] + + for mobj in re.finditer( + r']+href=(["\'])(?P%s.*?)\1[^>]*>' % AsianCrushIE._VALID_URL, + webpage): + attrs = extract_attributes(mobj.group(0)) + if attrs.get('class') == 'clearfix': + entries.append(self.url_result( + mobj.group('url'), ie=AsianCrushIE.ie_key())) + + title = remove_end( + self._html_search_regex( + r'(?s)]\bid=["\']movieTitle[^>]+>(.+?)', webpage, + 'title', default=None) or self._og_search_title( 
+ webpage, default=None) or self._html_search_meta( + 'twitter:title', webpage, 'title', + default=None) or self._search_regex( + r'([^<]+)', webpage, 'title', fatal=False), + ' | AsianCrush') + + description = self._og_search_description( + webpage, default=None) or self._html_search_meta( + 'twitter:description', webpage, 'description', fatal=False) + + return self.playlist_result(entries, playlist_id, title, description) diff --git a/youtube_dl/extractor/audioboom.py b/youtube_dl/extractor/audioboom.py index e48bb89..393f381 100644 --- a/youtube_dl/extractor/audioboom.py +++ b/youtube_dl/extractor/audioboom.py @@ -43,7 +43,7 @@ class AudioBoomIE(InfoExtractor): def from_clip(field): if clip: - clip.get(field) + return clip.get(field) audio_url = from_clip('clipURLPriorToLoading') or self._og_search_property( 'audio', webpage, 'audio url') diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 489d0ba..be41bd5 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -14,14 +14,16 @@ from ..utils import ( ExtractorError, float_or_none, int_or_none, + KNOWN_EXTENSIONS, parse_filesize, unescapeHTML, update_url_query, + unified_strdate, ) class BandcampIE(InfoExtractor): - _VALID_URL = r'https?://.*?\.bandcamp\.com/track/(?P.*)' + _VALID_URL = r'https?://.*?\.bandcamp\.com/track/(?P<title>[^/?#&]+)' _TESTS = [{ 'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song', 'md5': 'c557841d5e50261777a6585648adf439', @@ -155,7 +157,7 @@ class BandcampIE(InfoExtractor): class BandcampAlbumIE(InfoExtractor): IE_NAME = 'Bandcamp:album' - _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^?#]+)|/?(?:$|[?#]))' + _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?' 
_TESTS = [{ 'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1', @@ -222,6 +224,12 @@ class BandcampAlbumIE(InfoExtractor): 'playlist_count': 2, }] + @classmethod + def suitable(cls, url): + return (False + if BandcampWeeklyIE.suitable(url) or BandcampIE.suitable(url) + else super(BandcampAlbumIE, cls).suitable(url)) + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) uploader_id = mobj.group('subdomain') @@ -234,7 +242,12 @@ class BandcampAlbumIE(InfoExtractor): raise ExtractorError('The page doesn\'t contain any tracks') # Only tracks with duration info have songs entries = [ - self.url_result(compat_urlparse.urljoin(url, t_path), ie=BandcampIE.ie_key()) + self.url_result( + compat_urlparse.urljoin(url, t_path), + ie=BandcampIE.ie_key(), + video_title=self._search_regex( + r'<span\b[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)', + elem_content, 'track title', fatal=False)) for elem_content, t_path in track_elements if self._html_search_meta('duration', elem_content, default=None)] @@ -250,3 +263,92 @@ class BandcampAlbumIE(InfoExtractor): 'title': title, 'entries': entries, } + + +class BandcampWeeklyIE(InfoExtractor): + IE_NAME = 'Bandcamp:weekly' + _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://bandcamp.com/?show=224', + 'md5': 'b00df799c733cf7e0c567ed187dea0fd', + 'info_dict': { + 'id': '224', + 'ext': 'opus', + 'title': 'BC Weekly April 4th 2017 - Magic Moments', + 'description': 'md5:5d48150916e8e02d030623a48512c874', + 'duration': 5829.77, + 'release_date': '20170404', + 'series': 'Bandcamp Weekly', + 'episode': 'Magic Moments', + 'episode_number': 208, + 'episode_id': '224', + } + }, { + 'url': 'https://bandcamp.com/?blah/blah@&show=228', + 'only_matching': True + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + blob = self._parse_json( + self._search_regex( + 
r'data-blob=(["\'])(?P<blob>{.+?})\1', webpage, + 'blob', group='blob'), + video_id, transform_source=unescapeHTML) + + show = blob['bcw_show'] + + # This is desired because any invalid show id redirects to `bandcamp.com` + # which happens to expose the latest Bandcamp Weekly episode. + show_id = int_or_none(show.get('show_id')) or int_or_none(video_id) + + formats = [] + for format_id, format_url in show['audio_stream'].items(): + if not isinstance(format_url, compat_str): + continue + for known_ext in KNOWN_EXTENSIONS: + if known_ext in format_id: + ext = known_ext + break + else: + ext = None + formats.append({ + 'format_id': format_id, + 'url': format_url, + 'ext': ext, + 'vcodec': 'none', + }) + self._sort_formats(formats) + + title = show.get('audio_title') or 'Bandcamp Weekly' + subtitle = show.get('subtitle') + if subtitle: + title += ' - %s' % subtitle + + episode_number = None + seq = blob.get('bcw_seq') + + if seq and isinstance(seq, list): + try: + episode_number = next( + int_or_none(e.get('episode_number')) + for e in seq + if isinstance(e, dict) and int_or_none(e.get('id')) == show_id) + except StopIteration: + pass + + return { + 'id': video_id, + 'title': title, + 'description': show.get('desc') or show.get('short_desc'), + 'duration': float_or_none(show.get('audio_duration')), + 'is_live': False, + 'release_date': unified_strdate(show.get('published_date')), + 'series': 'Bandcamp Weekly', + 'episode': show.get('subtitle'), + 'episode_number': episode_number, + 'episode_id': compat_str(video_id), + 'formats': formats + } diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index dd65b8d..8b20c03 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -6,14 +6,18 @@ import itertools from .common import InfoExtractor from ..utils import ( + clean_html, dict_get, ExtractorError, float_or_none, + get_element_by_class, int_or_none, parse_duration, parse_iso8601, try_get, unescapeHTML, + urlencode_postdata, + 
urljoin, ) from ..compat import ( compat_etree_fromstring, @@ -25,19 +29,23 @@ from ..compat import ( class BBCCoUkIE(InfoExtractor): IE_NAME = 'bbc.co.uk' IE_DESC = 'BBC iPlayer' - _ID_REGEX = r'[pb][\da-z]{7}' + _ID_REGEX = r'[pbw][\da-z]{7}' _VALID_URL = r'''(?x) https?:// (?:www\.)?bbc\.co\.uk/ (?: programmes/(?!articles/)| iplayer(?:/[^/]+)?/(?:episode/|playlist/)| - music/clips[/#]| - radio/player/ + music/(?:clips|audiovideo/popular)[/#]| + radio/player/| + events/[^/]+/play/[^/]+/ ) (?P<id>%s)(?!/(?:episodes|broadcasts|clips)) ''' % _ID_REGEX + _LOGIN_URL = 'https://account.bbc.com/signin' + _NETRC_MACHINE = 'bbc' + _MEDIASELECTOR_URLS = [ # Provides HQ HLS streams with even better quality that pc mediaset but fails # with geolocation in some cases when it's even not geo restricted at all (e.g. @@ -222,11 +230,49 @@ class BBCCoUkIE(InfoExtractor): }, { 'url': 'http://www.bbc.co.uk/radio/player/p03cchwf', 'only_matching': True, - } - ] + }, { + 'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55', + 'only_matching': True, + }, { + 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9', + 'only_matching': True, + }] _USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8' + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading signin page') + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'username': username, + 'password': password, + }) + + post_url = urljoin(self._LOGIN_URL, self._search_regex( + r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, + 'post url', default=self._LOGIN_URL, group='url')) + + response, urlh = self._download_webpage_handle( + post_url, None, 'Logging in', data=urlencode_postdata(login_form), + headers={'Referer': self._LOGIN_URL}) + + if self._LOGIN_URL in urlh.geturl(): + error = clean_html(get_element_by_class('form-message', response)) + if error: + raise ExtractorError( + 
'Unable to login: %s' % error, expected=True) + raise ExtractorError('Unable to log in') + + def _real_initialize(self): + self._login() + class MediaSelectionError(Exception): def __init__(self, id): self.id = id @@ -483,6 +529,12 @@ class BBCCoUkIE(InfoExtractor): webpage = self._download_webpage(url, group_id, 'Downloading video page') + error = self._search_regex( + r'<div\b[^>]+\bclass=["\']smp__message delta["\'][^>]*>([^<]+)<', + webpage, 'error', default=None) + if error: + raise ExtractorError(error, expected=True) + programme_id = None duration = None diff --git a/youtube_dl/extractor/beampro.py b/youtube_dl/extractor/beampro.py index f3a9e32..2eaec1a 100644 --- a/youtube_dl/extractor/beampro.py +++ b/youtube_dl/extractor/beampro.py @@ -6,18 +6,33 @@ from ..utils import ( ExtractorError, clean_html, compat_str, + float_or_none, int_or_none, parse_iso8601, try_get, + urljoin, ) -class BeamProLiveIE(InfoExtractor): - IE_NAME = 'Beam:live' - _VALID_URL = r'https?://(?:\w+\.)?beam\.pro/(?P<id>[^/?#&]+)' +class BeamProBaseIE(InfoExtractor): + _API_BASE = 'https://mixer.com/api/v1' _RATINGS = {'family': 0, 'teen': 13, '18+': 18} + + def _extract_channel_info(self, chan): + user_id = chan.get('userId') or try_get(chan, lambda x: x['user']['id']) + return { + 'uploader': chan.get('token') or try_get( + chan, lambda x: x['user']['username'], compat_str), + 'uploader_id': compat_str(user_id) if user_id else None, + 'age_limit': self._RATINGS.get(chan.get('audience')), + } + + +class BeamProLiveIE(BeamProBaseIE): + IE_NAME = 'Mixer:live' + _VALID_URL = r'https?://(?:\w+\.)?(?:beam\.pro|mixer\.com)/(?P<id>[^/?#&]+)' _TEST = { - 'url': 'http://www.beam.pro/niterhayven', + 'url': 'http://mixer.com/niterhayven', 'info_dict': { 'id': '261562', 'ext': 'mp4', @@ -38,11 +53,17 @@ class BeamProLiveIE(InfoExtractor): }, } + _MANIFEST_URL_TEMPLATE = '%s/channels/%%s/manifest.%%s' % BeamProBaseIE._API_BASE + + @classmethod + def suitable(cls, url): + return False if 
BeamProVodIE.suitable(url) else super(BeamProLiveIE, cls).suitable(url) + def _real_extract(self, url): channel_name = self._match_id(url) chan = self._download_json( - 'https://beam.pro/api/v1/channels/%s' % channel_name, channel_name) + '%s/channels/%s' % (self._API_BASE, channel_name), channel_name) if chan.get('online') is False: raise ExtractorError( @@ -50,24 +71,118 @@ class BeamProLiveIE(InfoExtractor): channel_id = chan['id'] + def manifest_url(kind): + return self._MANIFEST_URL_TEMPLATE % (channel_id, kind) + formats = self._extract_m3u8_formats( - 'https://beam.pro/api/v1/channels/%s/manifest.m3u8' % channel_id, - channel_name, ext='mp4', m3u8_id='hls', fatal=False) + manifest_url('m3u8'), channel_name, ext='mp4', m3u8_id='hls', + fatal=False) + formats.extend(self._extract_smil_formats( + manifest_url('smil'), channel_name, fatal=False)) self._sort_formats(formats) - user_id = chan.get('userId') or try_get(chan, lambda x: x['user']['id']) - - return { + info = { 'id': compat_str(chan.get('id') or channel_name), 'title': self._live_title(chan.get('name') or channel_name), 'description': clean_html(chan.get('description')), - 'thumbnail': try_get(chan, lambda x: x['thumbnail']['url'], compat_str), + 'thumbnail': try_get( + chan, lambda x: x['thumbnail']['url'], compat_str), 'timestamp': parse_iso8601(chan.get('updatedAt')), - 'uploader': chan.get('token') or try_get( - chan, lambda x: x['user']['username'], compat_str), - 'uploader_id': compat_str(user_id) if user_id else None, - 'age_limit': self._RATINGS.get(chan.get('audience')), 'is_live': True, 'view_count': int_or_none(chan.get('viewersTotal')), 'formats': formats, } + info.update(self._extract_channel_info(chan)) + + return info + + +class BeamProVodIE(BeamProBaseIE): + IE_NAME = 'Mixer:vod' + _VALID_URL = r'https?://(?:\w+\.)?(?:beam\.pro|mixer\.com)/[^/?#&]+\?.*?\bvod=(?P<id>\d+)' + _TEST = { + 'url': 'https://mixer.com/willow8714?vod=2259830', + 'md5': 'b2431e6e8347dc92ebafb565d368b76b', + 
'info_dict': { + 'id': '2259830', + 'ext': 'mp4', + 'title': 'willow8714\'s Channel', + 'duration': 6828.15, + 'thumbnail': r're:https://.*source\.png$', + 'timestamp': 1494046474, + 'upload_date': '20170506', + 'uploader': 'willow8714', + 'uploader_id': '6085379', + 'age_limit': 13, + 'view_count': int, + }, + 'params': { + 'skip_download': True, + }, + } + + @staticmethod + def _extract_format(vod, vod_type): + if not vod.get('baseUrl'): + return [] + + if vod_type == 'hls': + filename, protocol = 'manifest.m3u8', 'm3u8_native' + elif vod_type == 'raw': + filename, protocol = 'source.mp4', 'https' + else: + assert False + + data = vod.get('data') if isinstance(vod.get('data'), dict) else {} + + format_id = [vod_type] + if isinstance(data.get('Height'), compat_str): + format_id.append('%sp' % data['Height']) + + return [{ + 'url': urljoin(vod['baseUrl'], filename), + 'format_id': '-'.join(format_id), + 'ext': 'mp4', + 'protocol': protocol, + 'width': int_or_none(data.get('Width')), + 'height': int_or_none(data.get('Height')), + 'fps': int_or_none(data.get('Fps')), + 'tbr': int_or_none(data.get('Bitrate'), 1000), + }] + + def _real_extract(self, url): + vod_id = self._match_id(url) + + vod_info = self._download_json( + '%s/recordings/%s' % (self._API_BASE, vod_id), vod_id) + + state = vod_info.get('state') + if state != 'AVAILABLE': + raise ExtractorError( + 'VOD %s is not available (state: %s)' % (vod_id, state), + expected=True) + + formats = [] + thumbnail_url = None + + for vod in vod_info['vods']: + vod_type = vod.get('format') + if vod_type in ('hls', 'raw'): + formats.extend(self._extract_format(vod, vod_type)) + elif vod_type == 'thumbnail': + thumbnail_url = urljoin(vod.get('baseUrl'), 'source.png') + + self._sort_formats(formats) + + info = { + 'id': vod_id, + 'title': vod_info.get('name') or vod_id, + 'duration': float_or_none(vod_info.get('duration')), + 'thumbnail': thumbnail_url, + 'timestamp': parse_iso8601(vod_info.get('createdAt')), + 'view_count': 
int_or_none(vod_info.get('viewsTotal')), + 'formats': formats, + } + info.update(self._extract_channel_info(vod_info.get('channel') or {})) + + return info diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py index d5c5822..bbeae4b 100644 --- a/youtube_dl/extractor/beeg.py +++ b/youtube_dl/extractor/beeg.py @@ -9,6 +9,7 @@ from ..compat import ( from ..utils import ( int_or_none, parse_iso8601, + urljoin, ) @@ -36,9 +37,11 @@ class BeegIE(InfoExtractor): webpage = self._download_webpage(url, video_id) cpl_url = self._search_regex( - r'<script[^>]+src=(["\'])(?P<url>(?:https?:)?//static\.beeg\.com/cpl/\d+\.js.*?)\1', + r'<script[^>]+src=(["\'])(?P<url>(?:/static|(?:https?:)?//static\.beeg\.com)/cpl/\d+\.js.*?)\1', webpage, 'cpl', default=None, group='url') + cpl_url = urljoin(url, cpl_url) + beeg_version, beeg_salt = [None] * 2 if cpl_url: @@ -54,7 +57,7 @@ class BeegIE(InfoExtractor): r'beeg_salt\s*=\s*(["\'])(?P<beeg_salt>.+?)\1', cpl, 'beeg salt', default=None, group='beeg_salt') - beeg_version = beeg_version or '2000' + beeg_version = beeg_version or '2185' beeg_salt = beeg_salt or 'pmweAkq8lAYKdfWcFCUj0yoVgoPlinamH5UE1CB3H' video = self._download_json( diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 1e3f255..1e57310 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -54,6 +54,22 @@ class BiliBiliIE(InfoExtractor): 'description': '如果你是神明,并且能够让妄想成为现实。那你会进行怎么样的妄想?是淫靡的世界?独裁社会?毁灭性的制裁?还是……2015年,涩谷。从6年前发生的大灾害“涩谷地震”之后复兴了的这个街区里新设立的私立高中...', }, 'skip': 'Geo-restricted to China', + }, { + # Title with double quotes + 'url': 'http://www.bilibili.com/video/av8903802/', + 'info_dict': { + 'id': '8903802', + 'ext': 'mp4', + 'title': '阿滴英文|英文歌分享#6 "Closer', + 'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 
微博@阿滴英文', + 'uploader': '阿滴英文', + 'uploader_id': '65880958', + 'timestamp': 1488382620, + 'upload_date': '20170301', + }, + 'params': { + 'skip_download': True, # Test metadata only + }, }] _APP_KEY = '84956560bc028eb7' @@ -135,7 +151,7 @@ class BiliBiliIE(InfoExtractor): 'formats': formats, }) - title = self._html_search_regex('<h1[^>]+title="([^"]+)">', webpage, 'title') + title = self._html_search_regex('<h1[^>]*>([^<]+)</h1>', webpage, 'title') description = self._html_search_meta('description', webpage) timestamp = unified_timestamp(self._html_search_regex( r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time', default=None)) diff --git a/youtube_dl/extractor/bpb.py b/youtube_dl/extractor/bpb.py index 9661ade..0783353 100644 --- a/youtube_dl/extractor/bpb.py +++ b/youtube_dl/extractor/bpb.py @@ -33,13 +33,18 @@ class BpbIE(InfoExtractor): title = self._html_search_regex( r'<h2 class="white">(.*?)</h2>', webpage, 'title') video_info_dicts = re.findall( - r"({\s*src:\s*'http://film\.bpb\.de/[^}]+})", webpage) + r"({\s*src\s*:\s*'https?://film\.bpb\.de/[^}]+})", webpage) formats = [] for video_info in video_info_dicts: - video_info = self._parse_json(video_info, video_id, transform_source=js_to_json) - quality = video_info['quality'] - video_url = video_info['src'] + video_info = self._parse_json( + video_info, video_id, transform_source=js_to_json, fatal=False) + if not video_info: + continue + video_url = video_info.get('src') + if not video_url: + continue + quality = 'high' if '_high' in video_url else 'low' formats.append({ 'url': video_url, 'preference': 10 if quality == 'high' else 0, diff --git a/youtube_dl/extractor/buzzfeed.py b/youtube_dl/extractor/buzzfeed.py index 75fa92d..ec41109 100644 --- a/youtube_dl/extractor/buzzfeed.py +++ b/youtube_dl/extractor/buzzfeed.py @@ -84,9 +84,10 @@ class BuzzFeedIE(InfoExtractor): continue entries.append(self.url_result(video['url'])) - facebook_url = FacebookIE._extract_url(webpage) - if facebook_url: - 
entries.append(self.url_result(facebook_url)) + facebook_urls = FacebookIE._extract_urls(webpage) + entries.extend([ + self.url_result(facebook_url) + for facebook_url in facebook_urls]) return { '_type': 'playlist', diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index 87ad14e..9faf402 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -200,6 +200,7 @@ class CBCWatchBaseIE(InfoExtractor): 'media': 'http://search.yahoo.com/mrss/', 'clearleap': 'http://www.clearleap.com/namespace/clearleap/1.0/', } + _GEO_COUNTRIES = ['CA'] def _call_api(self, path, video_id): url = path if path.startswith('http') else self._API_BASE_URL + path @@ -287,6 +288,11 @@ class CBCWatchBaseIE(InfoExtractor): class CBCWatchVideoIE(CBCWatchBaseIE): IE_NAME = 'cbc.ca:watch:video' _VALID_URL = r'https?://api-cbc\.cloud\.clearleap\.com/cloffice/client/web/play/?\?.*?\bcontentId=(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _TEST = { + # geo-restricted to Canada, bypassable + 'url': 'https://api-cbc.cloud.clearleap.com/cloffice/client/web/play/?contentId=3c84472a-1eea-4dee-9267-2655d5055dcf&categoryId=ebc258f5-ee40-4cca-b66b-ba6bd55b7235', + 'only_matching': True, + } def _real_extract(self, url): video_id = self._match_id(url) @@ -323,9 +329,10 @@ class CBCWatchIE(CBCWatchBaseIE): IE_NAME = 'cbc.ca:watch' _VALID_URL = r'https?://watch\.cbc\.ca/(?:[^/]+/)+(?P<id>[0-9a-f-]+)' _TESTS = [{ + # geo-restricted to Canada, bypassable 'url': 'http://watch.cbc.ca/doc-zone/season-6/customer-disservice/38e815a-009e3ab12e4', 'info_dict': { - 'id': '38e815a-009e3ab12e4', + 'id': '9673749a-5e77-484c-8b62-a1092a6b5168', 'ext': 'mp4', 'title': 'Customer (Dis)Service', 'description': 'md5:8bdd6913a0fe03d4b2a17ebe169c7c87', @@ -337,8 +344,8 @@ class CBCWatchIE(CBCWatchBaseIE): 'skip_download': True, 'format': 'bestvideo', }, - 'skip': 'Geo-restricted to Canada', }, { + # geo-restricted to Canada, bypassable 'url': 
'http://watch.cbc.ca/arthur/all/1ed4b385-cd84-49cf-95f0-80f004680057', 'info_dict': { 'id': '1ed4b385-cd84-49cf-95f0-80f004680057', @@ -346,7 +353,6 @@ class CBCWatchIE(CBCWatchBaseIE): 'description': 'Arthur, the sweetest 8-year-old aardvark, and his pals solve all kinds of problems with humour, kindness and teamwork.', }, 'playlist_mincount': 30, - 'skip': 'Geo-restricted to Canada', }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index 58f258c..1268e38 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -49,13 +49,13 @@ class CBSIE(CBSBaseIE): 'only_matching': True, }] - def _extract_video_info(self, content_id): + def _extract_video_info(self, content_id, site='cbs', mpx_acc=2198311517): items_data = self._download_xml( 'http://can.cbs.com/thunder/player/videoPlayerService.php', - content_id, query={'partner': 'cbs', 'contentId': content_id}) + content_id, query={'partner': site, 'contentId': content_id}) video_data = xpath_element(items_data, './/item') title = xpath_text(video_data, 'videoTitle', 'title', True) - tp_path = 'dJ5BDC/media/guid/2198311517/%s' % content_id + tp_path = 'dJ5BDC/media/guid/%d/%s' % (mpx_acc, content_id) tp_release_url = 'http://link.theplatform.com/s/' + tp_path asset_types = [] diff --git a/youtube_dl/extractor/cbsinteractive.py b/youtube_dl/extractor/cbsinteractive.py index 57b18e8..681d63e 100644 --- a/youtube_dl/extractor/cbsinteractive.py +++ b/youtube_dl/extractor/cbsinteractive.py @@ -3,17 +3,18 @@ from __future__ import unicode_literals import re -from .theplatform import ThePlatformIE +from .cbs import CBSIE from ..utils import int_or_none -class CBSInteractiveIE(ThePlatformIE): - _VALID_URL = r'https?://(?:www\.)?(?P<site>cnet|zdnet)\.com/(?:videos|video/share)/(?P<id>[^/?]+)' +class CBSInteractiveIE(CBSIE): + _VALID_URL = r'https?://(?:www\.)?(?P<site>cnet|zdnet)\.com/(?:videos|video(?:/share)?)/(?P<id>[^/?]+)' _TESTS = [{ 'url': 
'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/', 'info_dict': { - 'id': '56f4ea68-bd21-4852-b08c-4de5b8354c60', - 'ext': 'flv', + 'id': 'R49SYt__yAfmlXR85z4f7gNmCBDcN_00', + 'display_id': 'hands-on-with-microsofts-windows-8-1-update', + 'ext': 'mp4', 'title': 'Hands-on with Microsoft Windows 8.1 Update', 'description': 'The new update to the Windows 8 OS brings improved performance for mouse and keyboard users.', 'uploader_id': '6085384d-619e-11e3-b231-14feb5ca9861', @@ -22,13 +23,19 @@ class CBSInteractiveIE(ThePlatformIE): 'timestamp': 1396479627, 'upload_date': '20140402', }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://www.cnet.com/videos/whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187/', + 'md5': 'f11d27b2fa18597fbf92444d2a9ed386', 'info_dict': { - 'id': '56527b93-d25d-44e3-b738-f989ce2e49ba', - 'ext': 'flv', + 'id': 'kjOJd_OoVJqbg_ZD8MZCOk8Wekb9QccK', + 'display_id': 'whiny-pothole-tweets-at-local-government-when-hit-by-cars-tomorrow-daily-187', + 'ext': 'mp4', 'title': 'Whiny potholes tweet at local government when hit by cars (Tomorrow Daily 187)', - 'description': 'Khail and Ashley wonder what other civic woes can be solved by self-tweeting objects, investigate a new kind of VR camera and watch an origami robot self-assemble, walk, climb, dig and dissolve. 
#TDPothole', + 'description': 'md5:d2b9a95a5ffe978ae6fbd4cf944d618f', 'uploader_id': 'b163284d-6b73-44fc-b3e6-3da66c392d40', 'uploader': 'Ashley Esqueda', 'duration': 1482, @@ -38,23 +45,28 @@ class CBSInteractiveIE(ThePlatformIE): }, { 'url': 'http://www.zdnet.com/video/share/video-keeping-android-smartphones-and-tablets-secure/', 'info_dict': { - 'id': 'bc1af9f0-a2b5-4e54-880d-0d95525781c0', + 'id': 'k0r4T_ehht4xW_hAOqiVQPuBDPZ8SRjt', + 'display_id': 'video-keeping-android-smartphones-and-tablets-secure', 'ext': 'mp4', 'title': 'Video: Keeping Android smartphones and tablets secure', 'description': 'Here\'s the best way to keep Android devices secure, and what you do when they\'ve come to the end of their lives.', 'uploader_id': 'f2d97ea2-8175-11e2-9d12-0018fe8a00b0', 'uploader': 'Adrian Kingsley-Hughes', - 'timestamp': 1448961720, - 'upload_date': '20151201', + 'duration': 731, + 'timestamp': 1449129925, + 'upload_date': '20151203', }, 'params': { # m3u8 download 'skip_download': True, - } + }, + }, { + 'url': 'http://www.zdnet.com/video/huawei-matebook-x-video/', + 'only_matching': True, }] - TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/kYEXFC/%s?mbr=true' + MPX_ACCOUNTS = { - 'cnet': 2288573011, + 'cnet': 2198311517, 'zdnet': 2387448114, } @@ -68,7 +80,8 @@ class CBSInteractiveIE(ThePlatformIE): data = self._parse_json(data_json, display_id) vdata = data.get('video') or data['videos'][0] - video_id = vdata['id'] + video_id = vdata['mpxRefId'] + title = vdata['title'] author = vdata.get('author') if author: @@ -78,20 +91,7 @@ class CBSInteractiveIE(ThePlatformIE): uploader = None uploader_id = None - media_guid_path = 'media/guid/%d/%s' % (self.MPX_ACCOUNTS[site], vdata['mpxRefId']) - formats, subtitles = [], {} - for (fkey, vid) in vdata['files'].items(): - if fkey == 'hls_phone' and 'hls_tablet' in vdata['files']: - continue - release_url = self.TP_RELEASE_URL_TEMPLATE % vid - if fkey == 'hds': - release_url += '&manifest=f4m' - tp_formats, 
tp_subtitles = self._extract_theplatform_smil(release_url, video_id, 'Downloading %s SMIL data' % fkey) - formats.extend(tp_formats) - subtitles = self._merge_subtitles(subtitles, tp_subtitles) - self._sort_formats(formats) - - info = self._extract_theplatform_metadata('kYEXFC/%s' % media_guid_path, video_id) + info = self._extract_video_info(video_id, site, self.MPX_ACCOUNTS[site]) info.update({ 'id': video_id, 'display_id': display_id, @@ -99,7 +99,5 @@ class CBSInteractiveIE(ThePlatformIE): 'duration': int_or_none(vdata.get('duration')), 'uploader': uploader, 'uploader_id': uploader_id, - 'subtitles': subtitles, - 'formats': formats, }) return info diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index 17bb9af..51df15f 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -15,19 +15,23 @@ class CBSNewsIE(CBSIE): _TESTS = [ { - 'url': 'http://www.cbsnews.com/news/tesla-and-spacex-elon-musks-industrial-empire/', + # 60 minutes + 'url': 'http://www.cbsnews.com/news/artificial-intelligence-positioned-to-be-a-game-changer/', 'info_dict': { - 'id': 'tesla-and-spacex-elon-musks-industrial-empire', - 'ext': 'flv', - 'title': 'Tesla and SpaceX: Elon Musk\'s industrial empire', - 'thumbnail': 'http://beta.img.cbsnews.com/i/2014/03/30/60147937-2f53-4565-ad64-1bdd6eb64679/60-0330-pelley-640x360.jpg', - 'duration': 791, + 'id': '_B6Ga3VJrI4iQNKsir_cdFo9Re_YJHE_', + 'ext': 'mp4', + 'title': 'Artificial Intelligence', + 'description': 'md5:8818145f9974431e0fb58a1b8d69613c', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 1606, + 'uploader': 'CBSI-NEW', + 'timestamp': 1498431900, + 'upload_date': '20170625', }, 'params': { - # rtmp download + # m3u8 download 'skip_download': True, }, - 'skip': 'Subscribers only', }, { 'url': 'http://www.cbsnews.com/videos/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/', @@ -52,6 +56,22 @@ class CBSNewsIE(CBSIE): 'skip_download': True, }, }, + { + # 48 
hours + 'url': 'http://www.cbsnews.com/news/maria-ridulph-murder-will-the-nations-oldest-cold-case-to-go-to-trial-ever-get-solved/', + 'info_dict': { + 'id': 'QpM5BJjBVEAUFi7ydR9LusS69DPLqPJ1', + 'ext': 'mp4', + 'title': 'Cold as Ice', + 'description': 'Can a childhood memory of a friend\'s murder solve a 1957 cold case? "48 Hours" correspondent Erin Moriarty has the latest.', + 'upload_date': '20170604', + 'timestamp': 1496538000, + 'uploader': 'CBSI-NEW', + }, + 'params': { + 'skip_download': True, + }, + }, ] def _real_extract(self, url): @@ -60,12 +80,18 @@ class CBSNewsIE(CBSIE): webpage = self._download_webpage(url, video_id) video_info = self._parse_json(self._html_search_regex( - r'(?:<ul class="media-list items" id="media-related-items"><li data-video-info|<div id="cbsNewsVideoPlayer" data-video-player-options)=\'({.+?})\'', - webpage, 'video JSON info'), video_id) + r'(?:<ul class="media-list items" id="media-related-items"[^>]*><li data-video-info|<div id="cbsNewsVideoPlayer" data-video-player-options)=\'({.+?})\'', + webpage, 'video JSON info', default='{}'), video_id, fatal=False) + + if video_info: + item = video_info['item'] if 'item' in video_info else video_info + else: + state = self._parse_json(self._search_regex( + r'data-cbsvideoui-options=(["\'])(?P<json>{.+?})\1', webpage, + 'playlist JSON info', group='json'), video_id)['state'] + item = state['playlist'][state['pid']] - item = video_info['item'] if 'item' in video_info else video_info - guid = item['mpxRefId'] - return self._extract_video_info(guid) + return self._extract_video_info(item['mpxRefId'], 'cbsnews') class CBSNewsLiveVideoIE(InfoExtractor): diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py index 78b7a92..0c3af23 100755 --- a/youtube_dl/extractor/cda.py +++ b/youtube_dl/extractor/cda.py @@ -124,7 +124,7 @@ class CDAIE(InfoExtractor): } def extract_format(page, version): - json_str = self._search_regex( + json_str = self._html_search_regex( 
r'player_data=(\\?["\'])(?P<player_data>.+?)\1', page, '%s player_json' % version, fatal=False, group='player_data') if not json_str: diff --git a/youtube_dl/extractor/charlierose.py b/youtube_dl/extractor/charlierose.py index 2d517f2..42c9af2 100644 --- a/youtube_dl/extractor/charlierose.py +++ b/youtube_dl/extractor/charlierose.py @@ -5,7 +5,7 @@ from ..utils import remove_end class CharlieRoseIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?charlierose\.com/video(?:s|/player)/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?charlierose\.com/(?:video|episode)(?:s|/player)/(?P<id>\d+)' _TESTS = [{ 'url': 'https://charlierose.com/videos/27996', 'md5': 'fda41d49e67d4ce7c2411fd2c4702e09', @@ -24,6 +24,9 @@ class CharlieRoseIE(InfoExtractor): }, { 'url': 'https://charlierose.com/videos/27996', 'only_matching': True, + }, { + 'url': 'https://charlierose.com/episodes/30887?autoplay=true', + 'only_matching': True, }] _PLAYER_BASE = 'https://charlierose.com/video/player/%s' diff --git a/youtube_dl/extractor/chilloutzone.py b/youtube_dl/extractor/chilloutzone.py index 0206d96..d4769da 100644 --- a/youtube_dl/extractor/chilloutzone.py +++ b/youtube_dl/extractor/chilloutzone.py @@ -5,6 +5,7 @@ import base64 import json from .common import InfoExtractor +from .youtube import YoutubeIE from ..utils import ( clean_html, ExtractorError @@ -70,11 +71,9 @@ class ChilloutzoneIE(InfoExtractor): # If nativePlatform is None a fallback mechanism is used (i.e. youtube embed) if native_platform is None: - youtube_url = self._html_search_regex( - r'<iframe.* src="((?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"', - webpage, 'fallback video URL', default=None) - if youtube_url is not None: - return self.url_result(youtube_url, ie='Youtube') + youtube_url = YoutubeIE._extract_url(webpage) + if youtube_url: + return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) # Non Fallback: Decide to use native source (e.g. 
youtube or vimeo) or # the own CDN diff --git a/youtube_dl/extractor/cinchcast.py b/youtube_dl/extractor/cinchcast.py index 562c9bb..b861d54 100644 --- a/youtube_dl/extractor/cinchcast.py +++ b/youtube_dl/extractor/cinchcast.py @@ -9,12 +9,20 @@ from ..utils import ( class CinchcastIE(InfoExtractor): - _VALID_URL = r'https?://player\.cinchcast\.com/.*?assetId=(?P<id>[0-9]+)' - _TEST = { + _VALID_URL = r'https?://player\.cinchcast\.com/.*?(?:assetId|show_id)=(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://player.cinchcast.com/?show_id=5258197&platformId=1&assetType=single', + 'info_dict': { + 'id': '5258197', + 'ext': 'mp3', + 'title': 'Train Your Brain to Up Your Game with Coach Mandy', + 'upload_date': '20130816', + }, + }, { # Actual test is run in generic, look for undergroundwellness 'url': 'http://player.cinchcast.com/?platformId=1&assetType=single&assetId=7141703', 'only_matching': True, - } + }] def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/cjsw.py b/youtube_dl/extractor/cjsw.py new file mode 100644 index 0000000..505bdbe --- /dev/null +++ b/youtube_dl/extractor/cjsw.py @@ -0,0 +1,72 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + unescapeHTML, +) + + +class CJSWIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cjsw\.com/program/(?P<program>[^/]+)/episode/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://cjsw.com/program/freshly-squeezed/episode/20170620', + 'md5': 'cee14d40f1e9433632c56e3d14977120', + 'info_dict': { + 'id': '91d9f016-a2e7-46c5-8dcb-7cbcd7437c41', + 'ext': 'mp3', + 'title': 'Freshly Squeezed – Episode June 20, 2017', + 'description': 'md5:c967d63366c3898a80d0c7b0ff337202', + 'series': 'Freshly Squeezed', + 'episode_id': '20170620', + }, + }, { + # no description + 'url': 'http://cjsw.com/program/road-pops/episode/20170707/', + 'only_matching': True, + }] + + def _real_extract(self, url): + 
mobj = re.match(self._VALID_URL, url) + program, episode_id = mobj.group('program', 'id') + audio_id = '%s/%s' % (program, episode_id) + + webpage = self._download_webpage(url, episode_id) + + title = unescapeHTML(self._search_regex( + (r'<h1[^>]+class=["\']episode-header__title["\'][^>]*>(?P<title>[^<]+)', + r'data-audio-title=(["\'])(?P<title>(?:(?!\1).)+)\1'), + webpage, 'title', group='title')) + + audio_url = self._search_regex( + r'<button[^>]+data-audio-src=(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage, 'audio url', group='url') + + audio_id = self._search_regex( + r'/([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})\.mp3', + audio_url, 'audio id', default=audio_id) + + formats = [{ + 'url': audio_url, + 'ext': determine_ext(audio_url, 'mp3'), + 'vcodec': 'none', + }] + + description = self._html_search_regex( + r'<p>(?P<description>.+?)</p>', webpage, 'description', + default=None) + series = self._search_regex( + r'data-showname=(["\'])(?P<name>(?:(?!\1).)+)\1', webpage, + 'series', default=program, group='name') + + return { + 'id': audio_id, + 'title': title, + 'description': description, + 'formats': formats, + 'series': series, + 'episode_id': episode_id, + } diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py deleted file mode 100644 index 0920f62..0000000 --- a/youtube_dl/extractor/clipfish.py +++ /dev/null @@ -1,67 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - unified_strdate, -) - - -class ClipfishIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?clipfish\.de/(?:[^/]+/)+video/(?P<id>[0-9]+)' - _TEST = { - 'url': 'http://www.clipfish.de/special/ugly-americans/video/4343170/s01-e01-ugly-americans-date-in-der-hoelle/', - 'md5': 'b9a5dc46294154c1193e2d10e0c95693', - 'info_dict': { - 'id': '4343170', - 'ext': 'mp4', - 'title': 'S01 E01 - Ugly Americans - Date in der Hölle', - 'description': 'Mark Lilly arbeitet im 
Sozialdienst der Stadt New York und soll Immigranten bei ihrer Einbürgerung in die USA zur Seite stehen.', - 'upload_date': '20161005', - 'duration': 1291, - 'view_count': int, - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - video_info = self._download_json( - 'http://www.clipfish.de/devapi/id/%s?format=json&apikey=hbbtv' % video_id, - video_id)['items'][0] - - formats = [] - - m3u8_url = video_info.get('media_videourl_hls') - if m3u8_url: - formats.append({ - 'url': m3u8_url.replace('de.hls.fra.clipfish.de', 'hls.fra.clipfish.de'), - 'ext': 'mp4', - 'format_id': 'hls', - }) - - mp4_url = video_info.get('media_videourl') - if mp4_url: - formats.append({ - 'url': mp4_url, - 'format_id': 'mp4', - 'width': int_or_none(video_info.get('width')), - 'height': int_or_none(video_info.get('height')), - 'tbr': int_or_none(video_info.get('bitrate')), - }) - - descr = video_info.get('descr') - if descr: - descr = descr.strip() - - return { - 'id': video_id, - 'title': video_info['title'], - 'description': descr, - 'formats': formats, - 'thumbnail': video_info.get('media_content_thumbnail_large') or video_info.get('media_thumbnail'), - 'duration': int_or_none(video_info.get('media_length')), - 'upload_date': unified_strdate(video_info.get('pubDate')), - 'view_count': int_or_none(video_info.get('media_views')) - } diff --git a/youtube_dl/extractor/clippit.py b/youtube_dl/extractor/clippit.py new file mode 100644 index 0000000..a1a7a77 --- /dev/null +++ b/youtube_dl/extractor/clippit.py @@ -0,0 +1,74 @@ +# coding: utf-8 + +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + parse_iso8601, + qualities, +) + +import re + + +class ClippitIE(InfoExtractor): + + _VALID_URL = r'https?://(?:www\.)?clippituser\.tv/c/(?P<id>[a-z]+)' + _TEST = { + 'url': 'https://www.clippituser.tv/c/evmgm', + 'md5': '963ae7a59a2ec4572ab8bf2f2d2c5f09', + 'info_dict': { + 'id': 'evmgm', + 'ext': 'mp4', + 'title': 'Bye bye 
Brutus. #BattleBots - Clippit', + 'uploader': 'lizllove', + 'uploader_url': 'https://www.clippituser.tv/p/lizllove', + 'timestamp': 1472183818, + 'upload_date': '20160826', + 'description': 'BattleBots | ABC', + 'thumbnail': r're:^https?://.*\.jpg$', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex(r'<title.*>(.+?)</title>', webpage, 'title') + + FORMATS = ('sd', 'hd') + quality = qualities(FORMATS) + formats = [] + for format_id in FORMATS: + url = self._html_search_regex(r'data-%s-file="(.+?)"' % format_id, + webpage, 'url', fatal=False) + if not url: + continue + match = re.search(r'/(?P<height>\d+)\.mp4', url) + formats.append({ + 'url': url, + 'format_id': format_id, + 'quality': quality(format_id), + 'height': int(match.group('height')) if match else None, + }) + + uploader = self._html_search_regex(r'class="username".*>\s+(.+?)\n', + webpage, 'uploader', fatal=False) + uploader_url = ('https://www.clippituser.tv/p/' + uploader + if uploader else None) + + timestamp = self._html_search_regex(r'datetime="(.+?)"', + webpage, 'date', fatal=False) + thumbnail = self._html_search_regex(r'data-image="(.+?)"', + webpage, 'thumbnail', fatal=False) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'uploader': uploader, + 'uploader_url': uploader_url, + 'timestamp': parse_iso8601(timestamp), + 'description': self._og_search_description(webpage), + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/cloudy.py b/youtube_dl/extractor/cloudy.py index 9bc8dbe..85ca20e 100644 --- a/youtube_dl/extractor/cloudy.py +++ b/youtube_dl/extractor/cloudy.py @@ -30,7 +30,11 @@ class CloudyIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage( - 'http://www.cloudy.ec/embed.php?id=%s' % video_id, video_id) + 'https://www.cloudy.ec/embed.php', video_id, query={ + 'id': video_id, + 'playerPage': 1, + 'autoplay': 1, + }) info =
self._parse_html5_media_entries(url, webpage, video_id)[0] diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index fec39da..2bbbf8f 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -27,6 +27,7 @@ from ..compat import ( compat_urllib_parse_urlencode, compat_urllib_request, compat_urlparse, + compat_xml_parse_error, ) from ..downloader.f4m import remove_encrypted_media from ..utils import ( @@ -376,7 +377,7 @@ class InfoExtractor(object): cls._VALID_URL_RE = re.compile(cls._VALID_URL) m = cls._VALID_URL_RE.match(url) assert m - return m.group('id') + return compat_str(m.group('id')) @classmethod def working(cls): @@ -420,7 +421,7 @@ class InfoExtractor(object): if country_code: self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code) if self._downloader.params.get('verbose', False): - self._downloader.to_stdout( + self._downloader.to_screen( '[debug] Using fake IP %s (%s) as X-Forwarded-For.' % (self._x_forwarded_for_ip, country_code.upper())) @@ -646,15 +647,29 @@ class InfoExtractor(object): def _download_xml(self, url_or_request, video_id, note='Downloading XML', errnote='Unable to download XML', - transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}): + transform_source=None, fatal=True, encoding=None, + data=None, headers={}, query={}): """Return the xml as an xml.etree.ElementTree.Element""" xml_string = self._download_webpage( - url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query) + url_or_request, video_id, note, errnote, fatal=fatal, + encoding=encoding, data=data, headers=headers, query=query) if xml_string is False: return xml_string + return self._parse_xml( + xml_string, video_id, transform_source=transform_source, + fatal=fatal) + + def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True): if transform_source: xml_string = transform_source(xml_string) - return 
compat_etree_fromstring(xml_string.encode('utf-8')) + try: + return compat_etree_fromstring(xml_string.encode('utf-8')) + except compat_xml_parse_error as ve: + errmsg = '%s: Failed to parse XML ' % video_id + if fatal: + raise ExtractorError(errmsg, cause=ve) + else: + self.report_warning(errmsg + str(ve)) def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata', @@ -730,12 +745,12 @@ class InfoExtractor(object): video_info['title'] = video_title return video_info - def playlist_from_matches(self, matches, video_id, video_title, getter=None, ie=None): - urlrs = orderedSet( + def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None): + urls = orderedSet( self.url_result(self._proto_relative_url(getter(m) if getter else m), ie) for m in matches) return self.playlist_result( - urlrs, playlist_id=video_id, playlist_title=video_title) + urls, playlist_id=playlist_id, playlist_title=playlist_title) @staticmethod def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None): @@ -940,7 +955,8 @@ class InfoExtractor(object): def _family_friendly_search(self, html): # See http://schema.org/VideoObject - family_friendly = self._html_search_meta('isFamilyFriendly', html) + family_friendly = self._html_search_meta( + 'isFamilyFriendly', html, default=None) if not family_friendly: return None @@ -1002,17 +1018,17 @@ class InfoExtractor(object): item_type = e.get('@type') if expected_type is not None and expected_type != item_type: return info - if item_type == 'TVEpisode': + if item_type in ('TVEpisode', 'Episode'): info.update({ 'episode': unescapeHTML(e.get('name')), 'episode_number': int_or_none(e.get('episodeNumber')), 'description': unescapeHTML(e.get('description')), }) part_of_season = e.get('partOfSeason') - if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason': + if isinstance(part_of_season, dict) and part_of_season.get('@type') in 
('TVSeason', 'Season', 'CreativeWorkSeason'): info['season_number'] = int_or_none(part_of_season.get('seasonNumber')) part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries') - if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries': + if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'): info['series'] = unescapeHTML(part_of_series.get('name')) elif item_type == 'Article': info.update({ @@ -1022,10 +1038,10 @@ class InfoExtractor(object): }) elif item_type == 'VideoObject': extract_video_object(e) - elif item_type == 'WebPage': - video = e.get('video') - if isinstance(video, dict) and video.get('@type') == 'VideoObject': - extract_video_object(video) + continue + video = e.get('video') + if isinstance(video, dict) and video.get('@type') == 'VideoObject': + extract_video_object(video) break return dict((k, v) for k, v in info.items() if v is not None) @@ -1785,7 +1801,7 @@ class InfoExtractor(object): ms_info['timescale'] = int(timescale) segment_duration = source.get('duration') if segment_duration: - ms_info['segment_duration'] = int(segment_duration) + ms_info['segment_duration'] = float(segment_duration) def extract_Initialization(source): initialization = source.find(_add_ns('Initialization')) @@ -1892,9 +1908,13 @@ class InfoExtractor(object): 'Bandwidth': bandwidth, } + def location_key(location): + return 'url' if re.match(r'^https?://', location) else 'path' + if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info: media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time')) + media_location_key = location_key(media_template) # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$ # can't be used at the same time @@ -1904,7 +1924,7 @@ class InfoExtractor(object): segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale']) 
representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) representation_ms_info['fragments'] = [{ - 'url': media_template % { + media_location_key: media_template % { 'Number': segment_number, 'Bandwidth': bandwidth, }, @@ -1928,7 +1948,7 @@ class InfoExtractor(object): 'Number': segment_number, } representation_ms_info['fragments'].append({ - 'url': segment_url, + media_location_key: segment_url, 'duration': float_or_none(segment_d, representation_ms_info['timescale']), }) @@ -1952,8 +1972,9 @@ class InfoExtractor(object): for s in representation_ms_info['s']: duration = float_or_none(s['d'], timescale) for r in range(s.get('r', 0) + 1): + segment_uri = representation_ms_info['segment_urls'][segment_index] fragments.append({ - 'url': representation_ms_info['segment_urls'][segment_index], + location_key(segment_uri): segment_uri, 'duration': duration, }) segment_index += 1 @@ -1962,6 +1983,7 @@ class InfoExtractor(object): # No fragments key is present in this case. 
if 'fragments' in representation_ms_info: f.update({ + 'fragment_base_url': base_url, 'fragments': [], 'protocol': 'http_dash_segments', }) @@ -1969,10 +1991,8 @@ class InfoExtractor(object): initialization_url = representation_ms_info['initialization_url'] if not f.get('url'): f['url'] = initialization_url - f['fragments'].append({'url': initialization_url}) + f['fragments'].append({location_key(initialization_url): initialization_url}) f['fragments'].extend(representation_ms_info['fragments']) - for fragment in f['fragments']: - fragment['url'] = urljoin(base_url, fragment['url']) try: existing_format = next( fo for fo in formats @@ -2110,19 +2130,19 @@ class InfoExtractor(object): return f return {} - def _media_formats(src, cur_media_type): + def _media_formats(src, cur_media_type, type_info={}): full_url = absolute_url(src) - ext = determine_ext(full_url) + ext = type_info.get('ext') or determine_ext(full_url) if ext == 'm3u8': is_plain_url = False formats = self._extract_m3u8_formats( full_url, video_id, ext='mp4', entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id, - preference=preference) + preference=preference, fatal=False) elif ext == 'mpd': is_plain_url = False formats = self._extract_mpd_formats( - full_url, video_id, mpd_id=mpd_id) + full_url, video_id, mpd_id=mpd_id, fatal=False) else: is_plain_url = True formats = [{ @@ -2132,15 +2152,18 @@ class InfoExtractor(object): return is_plain_url, formats entries = [] + # amp-video and amp-audio are very similar to their HTML5 counterparts + # so we wll include them right here (see + # https://www.ampproject.org/docs/reference/components/amp-video) media_tags = [(media_tag, media_type, '') for media_tag, media_type - in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)] + in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)] media_tags.extend(re.findall( # We only allow video|audio followed by a whitespace or '>'. 
# Allowing more characters may end up in significant slow down (see # https://github.com/rg3/youtube-dl/issues/11979, example URL: # http://www.porntrex.com/maps/videositemap.xml). - r'(?s)(<(?P<tag>video|audio)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage)) + r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage)) for media_tag, media_type, media_content in media_tags: media_info = { 'formats': [], @@ -2158,9 +2181,15 @@ class InfoExtractor(object): src = source_attributes.get('src') if not src: continue - is_plain_url, formats = _media_formats(src, media_type) + f = parse_content_type(source_attributes.get('type')) + is_plain_url, formats = _media_formats(src, media_type, f) if is_plain_url: - f = parse_content_type(source_attributes.get('type')) + # res attribute is not standard but seen several times + # in the wild + f.update({ + 'height': int_or_none(source_attributes.get('res')), + 'format_id': source_attributes.get('label'), + }) f.update(formats[0]) media_info['formats'].append(f) else: @@ -2299,6 +2328,8 @@ class InfoExtractor(object): tracks = video_data.get('tracks') if tracks and isinstance(tracks, list): for track in tracks: + if not isinstance(track, dict): + continue if track.get('kind') != 'captions': continue track_url = urljoin(base_url, track.get('file')) @@ -2328,6 +2359,8 @@ class InfoExtractor(object): urls = [] formats = [] for source in jwplayer_sources_data: + if not isinstance(source, dict): + continue source_url = self._proto_relative_url(source.get('file')) if not source_url: continue @@ -2416,10 +2449,12 @@ class InfoExtractor(object): self._downloader.report_warning(msg) return res - def _set_cookie(self, domain, name, value, expire_time=None): + def _set_cookie(self, domain, name, value, expire_time=None, port=None, + path='/', secure=False, discard=False, rest={}, **kwargs): cookie = compat_cookiejar.Cookie( - 0, name, value, None, None, domain, None, - None, '/', True, False, expire_time, '', None, None, None) + 0, name, value, port, port
is not None, domain, True, + domain.startswith('.'), path, True, secure, expire_time, + discard, None, None, rest) self._downloader.cookiejar.set_cookie(cookie) def _get_cookies(self, url): diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py index 0c3f0c0..ed278fe 100644 --- a/youtube_dl/extractor/condenast.py +++ b/youtube_dl/extractor/condenast.py @@ -116,16 +116,16 @@ class CondeNastIE(InfoExtractor): entries = [self.url_result(build_url(path), 'CondeNast') for path in paths] return self.playlist_result(entries, playlist_title=title) - def _extract_video_params(self, webpage): - query = {} - params = self._search_regex( - r'(?s)var params = {(.+?)}[;,]', webpage, 'player params', default=None) - if params: - query.update({ - 'videoId': self._search_regex(r'videoId: [\'"](.+?)[\'"]', params, 'video id'), - 'playerId': self._search_regex(r'playerId: [\'"](.+?)[\'"]', params, 'player id'), - 'target': self._search_regex(r'target: [\'"](.+?)[\'"]', params, 'target'), - }) + def _extract_video_params(self, webpage, display_id): + query = self._parse_json( + self._search_regex( + r'(?s)var\s+params\s*=\s*({.+?})[;,]', webpage, 'player params', + default='{}'), + display_id, transform_source=js_to_json, fatal=False) + if query: + query['videoId'] = self._search_regex( + r'(?:data-video-id=|currentVideoId\s*=\s*)["\']([\da-f]+)', + webpage, 'video id', default=None) else: params = extract_attributes(self._search_regex( r'(<[^>]+data-js="video-player"[^>]+>)', @@ -141,17 +141,27 @@ class CondeNastIE(InfoExtractor): video_id = params['videoId'] video_info = None - if params.get('playerId'): - info_page = self._download_json( - 'http://player.cnevids.com/player/video.js', - video_id, 'Downloading video info', fatal=False, query=params) - if info_page: - video_info = info_page.get('video') - if not video_info: - info_page = self._download_webpage( - 'http://player.cnevids.com/player/loader.js', - video_id, 'Downloading loader info', 
query=params) - else: + + # New API path + query = params.copy() + query['embedType'] = 'inline' + info_page = self._download_json( + 'http://player.cnevids.com/embed-api.json', video_id, + 'Downloading embed info', fatal=False, query=query) + + # Old fallbacks + if not info_page: + if params.get('playerId'): + info_page = self._download_json( + 'http://player.cnevids.com/player/video.js', video_id, + 'Downloading video info', fatal=False, query=params) + if info_page: + video_info = info_page.get('video') + if not video_info: + info_page = self._download_webpage( + 'http://player.cnevids.com/player/loader.js', + video_id, 'Downloading loader info', query=params) + if not video_info: info_page = self._download_webpage( 'https://player.cnevids.com/inline/video/%s.js' % video_id, video_id, 'Downloading inline info', query={ @@ -215,7 +225,7 @@ class CondeNastIE(InfoExtractor): if url_type == 'series': return self._extract_series(url, webpage) else: - params = self._extract_video_params(webpage) + params = self._extract_video_params(webpage, display_id) info = self._search_json_ld( webpage, display_id, fatal=False) info.update(self._extract_video(params)) diff --git a/youtube_dl/extractor/corus.py b/youtube_dl/extractor/corus.py index 7b2f500..807a29e 100644 --- a/youtube_dl/extractor/corus.py +++ b/youtube_dl/extractor/corus.py @@ -8,7 +8,16 @@ from ..utils import int_or_none class CorusIE(ThePlatformFeedIE): - _VALID_URL = r'https?://(?:www\.)?(?P(?:globaltv|etcanada)\.com|(?:hgtv|foodnetwork|slice)\.ca)/(?:video/|(?:[^/]+/)+(?:videos/[a-z0-9-]+-|video\.html\?.*?\bv=))(?P\d+)' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)? 
+ (?P + (?:globaltv|etcanada)\.com| + (?:hgtv|foodnetwork|slice|history|showcase)\.ca + ) + /(?:video/|(?:[^/]+/)+(?:videos/[a-z0-9-]+-|video\.html\?.*?\bv=)) + (?P\d+) + ''' _TESTS = [{ 'url': 'http://www.hgtv.ca/shows/bryan-inc/videos/movie-night-popcorn-with-bryan-870923331648/', 'md5': '05dcbca777bf1e58c2acbb57168ad3a6', @@ -27,6 +36,12 @@ class CorusIE(ThePlatformFeedIE): }, { 'url': 'http://etcanada.com/video/873675331955/meet-the-survivor-game-changers-castaways-part-2/', 'only_matching': True, + }, { + 'url': 'http://www.history.ca/the-world-without-canada/video/full-episodes/natural-resources/video.html?v=955054659646#video', + 'only_matching': True, + }, { + 'url': 'http://www.showcase.ca/eyewitness/video/eyewitness++106/video.html?v=955070531919&p=1&s=da#video', + 'only_matching': True, }] _TP_FEEDS = { @@ -50,6 +65,14 @@ class CorusIE(ThePlatformFeedIE): 'feed_id': '5tUJLgV2YNJ5', 'account_id': 2414427935, }, + 'history': { + 'feed_id': 'tQFx_TyyEq4J', + 'account_id': 2369613659, + }, + 'showcase': { + 'feed_id': '9H6qyshBZU3E', + 'account_id': 2414426607, + }, } def _real_extract(self, url): diff --git a/youtube_dl/extractor/cracked.py b/youtube_dl/extractor/cracked.py index 94d03ce..f77a68e 100644 --- a/youtube_dl/extractor/cracked.py +++ b/youtube_dl/extractor/cracked.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from .youtube import YoutubeIE from ..utils import ( parse_iso8601, str_to_int, @@ -41,11 +42,9 @@ class CrackedIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - youtube_url = self._search_regex( - r']+src="((?:https?:)?//www\.youtube\.com/embed/[^"]+)"', - webpage, 'youtube url', default=None) + youtube_url = YoutubeIE._extract_url(webpage) if youtube_url: - return self.url_result(youtube_url, 'Youtube') + return self.url_result(youtube_url, ie=YoutubeIE.ie_key()) video_url = self._html_search_regex( [r'var\s+CK_vidSrc\s*=\s*"([^"]+)"', r']+>\s*]+>([^<]+)', + 
r'(?s)]+\bid=["\']showmedia_about_episode_num[^>]+>(.+?)]+id=["\']showmedia_about_episode_num[^>]+>.+?\s*

\s*Season (\d+)', + r'(?s)]+id=["\']showmedia_about_episode_num[^>]+>.+?\s*

\s*Season (\d+)', webpage, 'season number', default=None)) return { diff --git a/youtube_dl/extractor/dailymail.py b/youtube_dl/extractor/dailymail.py index 538565c..af39780 100644 --- a/youtube_dl/extractor/dailymail.py +++ b/youtube_dl/extractor/dailymail.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -12,8 +14,8 @@ from ..utils import ( class DailyMailIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/video/[^/]+/video-(?P[0-9]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/(?:video/[^/]+/video-|embed/video/)(?P[0-9]+)' + _TESTS = [{ 'url': 'http://www.dailymail.co.uk/video/tvshowbiz/video-1295863/The-Mountain-appears-sparkling-water-ad-Heavy-Bubbles.html', 'md5': 'f6129624562251f628296c3a9ffde124', 'info_dict': { @@ -22,7 +24,16 @@ class DailyMailIE(InfoExtractor): 'title': 'The Mountain appears in sparkling water ad for \'Heavy Bubbles\'', 'description': 'md5:a93d74b6da172dd5dc4d973e0b766a84', } - } + }, { + 'url': 'http://www.dailymail.co.uk/embed/video/1295863.html', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r']+\bsrc=["\'](?P(?:https?:)?//(?:www\.)?dailymail\.co\.uk/embed/video/\d+\.html)', + webpage) def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index f8db76c..e9d0dd1 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -147,7 +147,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): view_count_str = self._search_regex( (r']+itemprop="interactionCount"[^>]+content="UserPlays:([\s\d,.]+)"', r'video_views_count[^>]+>\s+([\s\d\,.]+)'), - webpage, 'view count', fatal=False) + webpage, 'view count', default=None) if view_count_str: view_count_str = re.sub(r'\s', '', view_count_str) 
view_count = str_to_int(view_count_str) @@ -159,7 +159,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor): [r'buildPlayer\(({.+?})\);\n', # See https://github.com/rg3/youtube-dl/issues/7826 r'playerV5\s*=\s*dmp\.create\([^,]+?,\s*({.+?})\);', r'buildPlayer\(({.+?})\);', - r'var\s+config\s*=\s*({.+?});'], + r'var\s+config\s*=\s*({.+?});', + # New layout regex (see https://github.com/rg3/youtube-dl/issues/13580) + r'__PLAYER_CONFIG__\s*=\s*({.+?});'], webpage, 'player v5', default=None) if player_v5: player = self._parse_json(player_v5, video_id) @@ -323,7 +325,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): IE_NAME = 'dailymotion:playlist' - _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P.+?)/' + _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P[^/?#&]+)' _MORE_PAGES_INDICATOR = r'(?s)
.*?(?:[^/]+\.)?(?:disney\.[a-z]{2,3}(?:\.[a-z]{2})?|disney(?:(?:me|latino)\.com|turkiye\.com\.tr)|(?:starwars|marvelkids)\.com))/(?:(?:embed/|(?:[^/]+/)+[\w-]+-)(?P[a-z0-9]{24})|(?:[^/]+/)?(?P[^/?#]+))''' + https?://(?P(?:[^/]+\.)?(?:disney\.[a-z]{2,3}(?:\.[a-z]{2})?|disney(?:(?:me|latino)\.com|turkiye\.com\.tr|channel\.de)|(?:starwars|marvelkids)\.com))/(?:(?:embed/|(?:[^/]+/)+[\w-]+-)(?P[a-z0-9]{24})|(?:[^/]+/)?(?P[^/?#]+))''' _TESTS = [{ # Disney.EmbedVideo 'url': 'http://video.disney.com/watch/moana-trailer-545ed1857afee5a0ec239977', @@ -68,6 +68,9 @@ class DisneyIE(InfoExtractor): }, { 'url': 'http://disneyjunior.en.disneyme.com/dj/watch-my-friends-tigger-and-pooh-promo', 'only_matching': True, + }, { + 'url': 'http://disneychannel.de/sehen/soy-luna-folge-118-5518518987ba27f3cc729268', + 'only_matching': True, }, { 'url': 'http://disneyjunior.disney.com/galactech-the-galactech-grab-galactech-an-admiral-rescue', 'only_matching': True, diff --git a/youtube_dl/extractor/dispeak.py b/youtube_dl/extractor/dispeak.py index a78cb8a..c05f601 100644 --- a/youtube_dl/extractor/dispeak.py +++ b/youtube_dl/extractor/dispeak.py @@ -13,7 +13,7 @@ from ..utils import ( class DigitallySpeakingIE(InfoExtractor): - _VALID_URL = r'https?://(?:evt\.dispeak|events\.digitallyspeaking)\.com/(?:[^/]+/)+xml/(?P[^.]+)\.xml' + _VALID_URL = r'https?://(?:s?evt\.dispeak|events\.digitallyspeaking)\.com/(?:[^/]+/)+xml/(?P[^.]+)\.xml' _TESTS = [{ # From http://gdcvault.com/play/1023460/Tenacious-Design-and-The-Interface @@ -28,6 +28,10 @@ class DigitallySpeakingIE(InfoExtractor): # From http://www.gdcvault.com/play/1014631/Classic-Game-Postmortem-PAC 'url': 'http://events.digitallyspeaking.com/gdc/sf11/xml/12396_1299111843500GMPX.xml', 'only_matching': True, + }, { + # From http://www.gdcvault.com/play/1013700/Advanced-Material + 'url': 'http://sevt.dispeak.com/ubm/gdc/eur10/xml/11256_1282118587281VNIT.xml', + 'only_matching': True, }] def _parse_mp4(self, metadata): diff --git 
a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index 87c5dd6..76e7841 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -7,16 +7,18 @@ import time from .common import InfoExtractor from ..compat import ( - compat_urlparse, compat_HTTPError, + compat_str, + compat_urlparse, ) from ..utils import ( - USER_AGENTS, ExtractorError, int_or_none, - unified_strdate, remove_end, + try_get, + unified_strdate, update_url_query, + USER_AGENTS, ) @@ -183,28 +185,44 @@ class DPlayItIE(InfoExtractor): webpage = self._download_webpage(url, display_id) - info_url = self._search_regex( - r'url\s*:\s*["\']((?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)', - webpage, 'video id') - title = remove_end(self._og_search_title(webpage), ' | Dplay') - try: - info = self._download_json( - info_url, display_id, headers={ - 'Authorization': 'Bearer %s' % self._get_cookies(url).get( - 'dplayit_token').value, - 'Referer': url, - }) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 403): - info = self._parse_json(e.cause.read().decode('utf-8'), display_id) - error = info['errors'][0] - if error.get('code') == 'access.denied.geoblocked': - self.raise_geo_restricted( - msg=error.get('detail'), countries=self._GEO_COUNTRIES) - raise ExtractorError(info['errors'][0]['detail'], expected=True) - raise + video_id = None + + info = self._search_regex( + r'playback_json\s*:\s*JSON\.parse\s*\(\s*("(?:\\.|[^"\\])+?")', + webpage, 'playback JSON', default=None) + if info: + for _ in range(2): + info = self._parse_json(info, display_id, fatal=False) + if not info: + break + else: + video_id = try_get(info, lambda x: x['data']['id']) + + if not info: + info_url = self._search_regex( + r'url\s*[:=]\s*["\']((?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)', + webpage, 'info url') + + video_id = info_url.rpartition('/')[-1] + + try: + info = self._download_json( + info_url, display_id, headers={ + 
'Authorization': 'Bearer %s' % self._get_cookies(url).get( + 'dplayit_token').value, + 'Referer': url, + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 403): + info = self._parse_json(e.cause.read().decode('utf-8'), display_id) + error = info['errors'][0] + if error.get('code') == 'access.denied.geoblocked': + self.raise_geo_restricted( + msg=error.get('detail'), countries=self._GEO_COUNTRIES) + raise ExtractorError(info['errors'][0]['detail'], expected=True) + raise hls_url = info['data']['attributes']['streaming']['hls']['url'] @@ -230,7 +248,7 @@ class DPlayItIE(InfoExtractor): season_number = episode_number = upload_date = None return { - 'id': info_url.rpartition('/')[-1], + 'id': compat_str(video_id or display_id), 'display_id': display_id, 'title': title, 'description': self._og_search_description(webpage), diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index e7abc88..9a498d7 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -12,6 +12,7 @@ from ..utils import ( ExtractorError, clean_html, int_or_none, + remove_end, sanitized_Request, urlencode_postdata ) @@ -72,15 +73,15 @@ class DramaFeverIE(DramaFeverBaseIE): 'url': 'http://www.dramafever.com/drama/4512/1/Cooking_with_Shin/', 'info_dict': { 'id': '4512.1', - 'ext': 'mp4', - 'title': 'Cooking with Shin 4512.1', + 'ext': 'flv', + 'title': 'Cooking with Shin', 'description': 'md5:a8eec7942e1664a6896fcd5e1287bfd0', 'episode': 'Episode 1', 'episode_number': 1, 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1404336058, 'upload_date': '20140702', - 'duration': 343, + 'duration': 344, }, 'params': { # m3u8 download @@ -90,15 +91,15 @@ class DramaFeverIE(DramaFeverBaseIE): 'url': 'http://www.dramafever.com/drama/4826/4/Mnet_Asian_Music_Awards_2015/?ap=1', 'info_dict': { 'id': '4826.4', - 'ext': 'mp4', - 'title': 'Mnet Asian Music Awards 2015 4826.4', + 'ext': 'flv', + 'title': 
'Mnet Asian Music Awards 2015', 'description': 'md5:3ff2ee8fedaef86e076791c909cf2e91', 'episode': 'Mnet Asian Music Awards 2015 - Part 3', 'episode_number': 4, 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1450213200, 'upload_date': '20151215', - 'duration': 5602, + 'duration': 5359, }, 'params': { # m3u8 download @@ -122,6 +123,10 @@ class DramaFeverIE(DramaFeverBaseIE): countries=self._GEO_COUNTRIES) raise + # title is postfixed with video id for some reason, removing + if info.get('title'): + info['title'] = remove_end(info['title'], video_id).strip() + series_id, episode_number = video_id.split('.') episode_info = self._download_json( # We only need a single episode info, so restricting page size to one episode diff --git a/youtube_dl/extractor/drbonanza.py b/youtube_dl/extractor/drbonanza.py index 79ec212..164e97c 100644 --- a/youtube_dl/extractor/drbonanza.py +++ b/youtube_dl/extractor/drbonanza.py @@ -1,135 +1,59 @@ from __future__ import unicode_literals -import json import re from .common import InfoExtractor from ..utils import ( - int_or_none, - parse_iso8601, + js_to_json, + parse_duration, + unescapeHTML, ) class DRBonanzaIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dr\.dk/bonanza/(?:[^/]+/)+(?:[^/])+?(?:assetId=(?P\d+))?(?:[#&]|$)' - - _TESTS = [{ - 'url': 'http://www.dr.dk/bonanza/serie/portraetter/Talkshowet.htm?assetId=65517', + _VALID_URL = r'https?://(?:www\.)?dr\.dk/bonanza/[^/]+/\d+/[^/]+/(?P\d+)/(?P[^/?#&]+)' + _TEST = { + 'url': 'http://www.dr.dk/bonanza/serie/154/matador/40312/matador---0824-komme-fremmede-', 'info_dict': { - 'id': '65517', + 'id': '40312', + 'display_id': 'matador---0824-komme-fremmede-', 'ext': 'mp4', - 'title': 'Talkshowet - Leonard Cohen', - 'description': 'md5:8f34194fb30cd8c8a30ad8b27b70c0ca', - 'thumbnail': r're:^https?://.*\.(?:gif|jpg)$', - 'timestamp': 1295537932, - 'upload_date': '20110120', - 'duration': 3664, - }, - 'params': { - 'skip_download': True, # requires rtmp - }, - }, { - 'url': 
'http://www.dr.dk/bonanza/radio/serie/sport/fodbold.htm?assetId=59410', - 'md5': '6dfe039417e76795fb783c52da3de11d', - 'info_dict': { - 'id': '59410', - 'ext': 'mp3', - 'title': 'EM fodbold 1992 Danmark - Tyskland finale Transmission', - 'description': 'md5:501e5a195749480552e214fbbed16c4e', + 'title': 'MATADOR - 08:24. "Komme fremmede".', + 'description': 'md5:77b4c1ac4d4c1b9d610ab4395212ff84', 'thumbnail': r're:^https?://.*\.(?:gif|jpg)$', - 'timestamp': 1223274900, - 'upload_date': '20081006', - 'duration': 7369, + 'duration': 4613, }, - }] + } def _real_extract(self, url): - url_id = self._match_id(url) - webpage = self._download_webpage(url, url_id) - - if url_id: - info = json.loads(self._html_search_regex(r'({.*?%s.*})' % url_id, webpage, 'json')) - else: - # Just fetch the first video on that page - info = json.loads(self._html_search_regex(r'bonanzaFunctions.newPlaylist\(({.*})\)', webpage, 'json')) - - asset_id = str(info['AssetId']) - title = info['Title'].rstrip(' \'\"-,.:;!?') - duration = int_or_none(info.get('Duration'), scale=1000) - # First published online. "FirstPublished" contains the date for original airing. 
- timestamp = parse_iso8601( - re.sub(r'\.\d+$', '', info['Created'])) - - def parse_filename_info(url): - match = re.search(r'/\d+_(?P\d+)x(?P\d+)x(?P\d+)K\.(?P\w+)$', url) - if match: - return { - 'width': int(match.group('width')), - 'height': int(match.group('height')), - 'vbr': int(match.group('bitrate')), - 'ext': match.group('ext') - } - match = re.search(r'/\d+_(?P\d+)K\.(?P\w+)$', url) - if match: - return { - 'vbr': int(match.group('bitrate')), - 'ext': match.group(2) - } - return {} + mobj = re.match(self._VALID_URL, url) + video_id, display_id = mobj.group('id', 'display_id') - video_types = ['VideoHigh', 'VideoMid', 'VideoLow'] - preferencemap = { - 'VideoHigh': -1, - 'VideoMid': -2, - 'VideoLow': -3, - 'Audio': -4, - } + webpage = self._download_webpage(url, display_id) - formats = [] - for file in info['Files']: - if info['Type'] == 'Video': - if file['Type'] in video_types: - format = parse_filename_info(file['Location']) - format.update({ - 'url': file['Location'], - 'format_id': file['Type'].replace('Video', ''), - 'preference': preferencemap.get(file['Type'], -10), - }) - if format['url'].startswith('rtmp'): - rtmp_url = format['url'] - format['rtmp_live'] = True # --resume does not work - if '/bonanza/' in rtmp_url: - format['play_path'] = rtmp_url.split('/bonanza/')[1] - formats.append(format) - elif file['Type'] == 'Thumb': - thumbnail = file['Location'] - elif info['Type'] == 'Audio': - if file['Type'] == 'Audio': - format = parse_filename_info(file['Location']) - format.update({ - 'url': file['Location'], - 'format_id': file['Type'], - 'vcodec': 'none', - }) - formats.append(format) - elif file['Type'] == 'Thumb': - thumbnail = file['Location'] + info = self._parse_html5_media_entries( + url, webpage, display_id, m3u8_id='hls', + m3u8_entry_protocol='m3u8_native')[0] + self._sort_formats(info['formats']) - description = '%s\n%s\n%s\n' % ( - info['Description'], info['Actors'], info['Colophon']) + asset = self._parse_json( + 
self._search_regex( + r'(?s)currentAsset\s*=\s*({.+?})\s*]+>\s*

%s:

\s*

\s*]+>\s*

([^<]+)

' % field, + webpage, field, default=None) - return { - 'id': asset_id, + info.update({ + 'id': asset.get('AssetId') or video_id, 'display_id': display_id, 'title': title, - 'formats': formats, - 'description': description, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'duration': duration, - } + 'description': extract('Programinfo'), + 'duration': parse_duration(extract('Tid')), + 'thumbnail': asset.get('AssetImageUrl'), + }) + return info diff --git a/youtube_dl/extractor/drtuber.py b/youtube_dl/extractor/drtuber.py index 1eca82b..c5d56a9 100644 --- a/youtube_dl/extractor/drtuber.py +++ b/youtube_dl/extractor/drtuber.py @@ -44,8 +44,23 @@ class DrTuberIE(InfoExtractor): webpage = self._download_webpage( 'http://www.drtuber.com/video/%s' % video_id, display_id) - video_url = self._html_search_regex( - r']*><(?:p|h\d+)[^>]*>([^<]+)<', @@ -75,7 +90,7 @@ class DrTuberIE(InfoExtractor): return { 'id': video_id, 'display_id': display_id, - 'url': video_url, + 'formats': formats, 'title': title, 'thumbnail': thumbnail, 'like_count': like_count, diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index c84624f..69effba 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -118,7 +118,7 @@ class DRTVIE(InfoExtractor): if target == 'HDS': f4m_formats = self._extract_f4m_formats( uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43', - video_id, preference, f4m_id=format_id) + video_id, preference, f4m_id=format_id, fatal=False) if kind == 'AudioResource': for f in f4m_formats: f['vcodec'] = 'none' @@ -126,7 +126,8 @@ class DRTVIE(InfoExtractor): elif target == 'HLS': formats.extend(self._extract_m3u8_formats( uri, video_id, 'mp4', entry_protocol='m3u8_native', - preference=preference, m3u8_id=format_id)) + preference=preference, m3u8_id=format_id, + fatal=False)) else: bitrate = link.get('Bitrate') if bitrate: diff --git a/youtube_dl/extractor/dvtv.py b/youtube_dl/extractor/dvtv.py index 974c69d..e85c58b 100644 --- 
a/youtube_dl/extractor/dvtv.py +++ b/youtube_dl/extractor/dvtv.py @@ -5,9 +5,12 @@ import re from .common import InfoExtractor from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, js_to_json, + mimetype2ext, unescapeHTML, - ExtractorError, ) @@ -24,14 +27,7 @@ class DVTVIE(InfoExtractor): 'id': 'dc0768de855511e49e4b0025900fea04', 'ext': 'mp4', 'title': 'Vondra o Českém století: Při pohledu na Havla mi bylo trapně', - } - }, { - 'url': 'http://video.aktualne.cz/dvtv/stropnicky-policie-vrbetice-preventivne-nekontrolovala/r~82ed4322849211e4a10c0025900fea04/', - 'md5': '6388f1941b48537dbd28791f712af8bf', - 'info_dict': { - 'id': '72c02230849211e49f60002590604f2e', - 'ext': 'mp4', - 'title': 'Stropnický: Policie Vrbětice preventivně nekontrolovala', + 'duration': 1484, } }, { 'url': 'http://video.aktualne.cz/dvtv/dvtv-16-12-2014-utok-talibanu-boj-o-kliniku-uprchlici/r~973eb3bc854e11e498be002590604f2e/', @@ -44,55 +40,100 @@ class DVTVIE(InfoExtractor): 'info_dict': { 'id': 'b0b40906854d11e4bdad0025900fea04', 'ext': 'mp4', - 'title': 'Drtinová Veselovský TV 16. 12. 2014: Témata dne' + 'title': 'Drtinová Veselovský TV 16. 12. 2014: Témata dne', + 'description': 'md5:0916925dea8e30fe84222582280b47a0', + 'timestamp': 1418760010, + 'upload_date': '20141216', } }, { 'md5': '5f7652a08b05009c1292317b449ffea2', 'info_dict': { 'id': '420ad9ec854a11e4bdad0025900fea04', 'ext': 'mp4', - 'title': 'Školní masakr možná změní boj s Talibanem, říká novinářka' + 'title': 'Školní masakr možná změní boj s Talibanem, říká novinářka', + 'description': 'md5:ff2f9f6de73c73d7cef4f756c1c1af42', + 'timestamp': 1418760010, + 'upload_date': '20141216', } }, { 'md5': '498eb9dfa97169f409126c617e2a3d64', 'info_dict': { 'id': '95d35580846a11e4b6d20025900fea04', 'ext': 'mp4', - 'title': 'Boj o kliniku: Veřejný zájem, nebo právo na majetek?' 
+ 'title': 'Boj o kliniku: Veřejný zájem, nebo právo na majetek?', + 'description': 'md5:889fe610a70fee5511dc3326a089188e', + 'timestamp': 1418760010, + 'upload_date': '20141216', } }, { 'md5': 'b8dc6b744844032dab6ba3781a7274b9', 'info_dict': { 'id': '6fe14d66853511e4833a0025900fea04', 'ext': 'mp4', - 'title': 'Pánek: Odmítání syrských uprchlíků je ostudou české vlády' + 'title': 'Pánek: Odmítání syrských uprchlíků je ostudou české vlády', + 'description': 'md5:544f86de6d20c4815bea11bf2ac3004f', + 'timestamp': 1418760010, + 'upload_date': '20141216', } }], + }, { + 'url': 'https://video.aktualne.cz/dvtv/zeman-si-jen-leci-mindraky-sobotku-nenavidi-a-babis-se-mu-te/r~960cdb3a365a11e7a83b0025900fea04/', + 'md5': 'f8efe9656017da948369aa099788c8ea', + 'info_dict': { + 'id': '3c496fec365911e7a6500025900fea04', + 'ext': 'mp4', + 'title': 'Zeman si jen léčí mindráky, Sobotku nenávidí a Babiš se mu teď hodí, tvrdí Kmenta', + 'duration': 1103, + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://video.aktualne.cz/v-cechach-poprve-zazni-zelenkova-zrestaurovana-mse/r~45b4b00483ec11e4883b002590604f2e/', 'only_matching': True, }] def _parse_video_metadata(self, js, video_id): - metadata = self._parse_json(js, video_id, transform_source=js_to_json) + data = self._parse_json(js, video_id, transform_source=js_to_json) - formats = [] - for video in metadata['sources']: - ext = video['type'][6:] - formats.append({ - 'url': video['file'], - 'ext': ext, - 'format_id': '%s-%s' % (ext, video['label']), - 'height': int(video['label'].rstrip('p')), - 'fps': 25, - }) + title = unescapeHTML(data['title']) + formats = [] + for video in data['sources']: + video_url = video.get('file') + if not video_url: + continue + video_type = video.get('type') + ext = determine_ext(video_url, mimetype2ext(video_type)) + if video_type == 'application/vnd.apple.mpegurl' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + 
m3u8_id='hls', fatal=False)) + elif video_type == 'application/dash+xml' or ext == 'mpd': + formats.extend(self._extract_mpd_formats( + video_url, video_id, mpd_id='dash', fatal=False)) + else: + label = video.get('label') + height = self._search_regex( + r'^(\d+)[pP]', label or '', 'height', default=None) + format_id = ['http'] + for f in (ext, label): + if f: + format_id.append(f) + formats.append({ + 'url': video_url, + 'format_id': '-'.join(format_id), + 'height': int_or_none(height), + }) self._sort_formats(formats) return { - 'id': metadata['mediaid'], - 'title': unescapeHTML(metadata['title']), - 'thumbnail': self._proto_relative_url(metadata['image'], 'http:'), + 'id': data.get('mediaid') or video_id, + 'title': title, + 'description': data.get('description'), + 'thumbnail': data.get('image'), + 'duration': int_or_none(data.get('duration')), + 'timestamp': int_or_none(data.get('pubtime')), 'formats': formats } @@ -103,7 +144,7 @@ class DVTVIE(InfoExtractor): # single video item = self._search_regex( - r"(?s)embedData[0-9a-f]{32}\['asset'\]\s*=\s*(\{.+?\});", + r'(?s)embedData[0-9a-f]{32}\[["\']asset["\']\]\s*=\s*(\{.+?\});', webpage, 'video', default=None, fatal=False) if item: @@ -113,6 +154,8 @@ class DVTVIE(InfoExtractor): items = re.findall( r"(?s)BBX\.context\.assets\['[0-9a-f]{32}'\]\.push\(({.+?})\);", webpage) + if not items: + items = re.findall(r'(?s)var\s+asset\s*=\s*({.+?});\n', webpage) if items: return { diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py index 76d39ad..4278927 100644 --- a/youtube_dl/extractor/eagleplatform.py +++ b/youtube_dl/extractor/eagleplatform.py @@ -11,6 +11,7 @@ from ..compat import ( from ..utils import ( ExtractorError, int_or_none, + unsmuggle_url, ) @@ -50,6 +51,10 @@ class EaglePlatformIE(InfoExtractor): 'view_count': int, }, 'skip': 'Georestricted', + }, { + # referrer protected video (https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/) + 'url': 
'eagleplatform:tvrainru.media.eagleplatform.com:582306', + 'only_matching': True, }] @staticmethod @@ -60,16 +65,40 @@ class EaglePlatformIE(InfoExtractor): webpage) if mobj is not None: return mobj.group('url') - # Basic usage embedding (see http://dultonmedia.github.io/eplayer/) + PLAYER_JS_RE = r''' + ]+ + src=(?P["\'])(?:https?:)?//(?P(?:(?!(?P=qjs)).)+\.media\.eagleplatform\.com)/player/player\.js(?P=qjs) + .+? + ''' + # "Basic usage" embedding (see http://dultonmedia.github.io/eplayer/) mobj = re.search( r'''(?xs) - ]+ - src=(?P["\'])(?:https?:)?//(?P.+?\.media\.eagleplatform\.com)/player/player\.js(?P=q1) - .+? + %s ]+ - class=(?P["\'])eagleplayer(?P=q2)[^>]+ + class=(?P["\'])eagleplayer(?P=qclass)[^>]+ data-id=["\'](?P\d+) - ''', webpage) + ''' % PLAYER_JS_RE, webpage) + if mobj is not None: + return 'eagleplatform:%(host)s:%(id)s' % mobj.groupdict() + # Generalization of "Javascript code usage", "Combined usage" and + # "Usage without attaching to DOM" embeddings (see + # http://dultonmedia.github.io/eplayer/) + mobj = re.search( + r'''(?xs) + %s + + ''' % PLAYER_JS_RE, webpage) if mobj is not None: return 'eagleplatform:%(host)s:%(id)s' % mobj.groupdict() @@ -79,9 +108,10 @@ class EaglePlatformIE(InfoExtractor): if status != 200: raise ExtractorError(' '.join(response['errors']), expected=True) - def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata', *args, **kwargs): + def _download_json(self, url_or_request, video_id, *args, **kwargs): try: - response = super(EaglePlatformIE, self)._download_json(url_or_request, video_id, note) + response = super(EaglePlatformIE, self)._download_json( + url_or_request, video_id, *args, **kwargs) except ExtractorError as ee: if isinstance(ee.cause, compat_HTTPError): response = self._parse_json(ee.cause.read().decode('utf-8'), video_id) @@ -93,11 +123,24 @@ class EaglePlatformIE(InfoExtractor): return self._download_json(url_or_request, video_id, note)['data'][0] def _real_extract(self, 
url): + url, smuggled_data = unsmuggle_url(url, {}) + mobj = re.match(self._VALID_URL, url) host, video_id = mobj.group('custom_host') or mobj.group('host'), mobj.group('id') + headers = {} + query = { + 'id': video_id, + } + + referrer = smuggled_data.get('referrer') + if referrer: + headers['Referer'] = referrer + query['referrer'] = referrer + player_data = self._download_json( - 'http://%s/api/player_data?id=%s' % (host, video_id), video_id) + 'http://%s/api/player_data' % host, video_id, + headers=headers, query=query) media = player_data['data']['playlist']['viewports'][0]['medialist'][0] diff --git a/youtube_dl/extractor/egghead.py b/youtube_dl/extractor/egghead.py index db92146..e4a3046 100644 --- a/youtube_dl/extractor/egghead.py +++ b/youtube_dl/extractor/egghead.py @@ -1,15 +1,18 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor +from ..utils import ( + int_or_none, + try_get, + unified_timestamp, +) class EggheadCourseIE(InfoExtractor): IE_DESC = 'egghead.io course' IE_NAME = 'egghead:course' - _VALID_URL = r'https://egghead\.io/courses/(?P[a-zA-Z_0-9-]+)' + _VALID_URL = r'https://egghead\.io/courses/(?P[^/?#&]+)' _TEST = { 'url': 'https://egghead.io/courses/professor-frisby-introduces-composable-functional-javascript', 'playlist_count': 29, @@ -22,18 +25,60 @@ class EggheadCourseIE(InfoExtractor): def _real_extract(self, url): playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) - title = self._html_search_regex(r'

([^<]+)

', webpage, 'title') - ul = self._search_regex(r'(?s)
    (.*?)
', webpage, 'session list') + course = self._download_json( + 'https://egghead.io/api/v1/series/%s' % playlist_id, playlist_id) + + entries = [ + self.url_result( + 'wistia:%s' % lesson['wistia_id'], ie='Wistia', + video_id=lesson['wistia_id'], video_title=lesson.get('title')) + for lesson in course['lessons'] if lesson.get('wistia_id')] + + return self.playlist_result( + entries, playlist_id, course.get('title'), + course.get('description')) + + +class EggheadLessonIE(InfoExtractor): + IE_DESC = 'egghead.io lesson' + IE_NAME = 'egghead:lesson' + _VALID_URL = r'https://egghead\.io/lessons/(?P[^/?#&]+)' + _TEST = { + 'url': 'https://egghead.io/lessons/javascript-linear-data-flow-with-container-style-types-box', + 'info_dict': { + 'id': 'fv5yotjxcg', + 'ext': 'mp4', + 'title': 'Create linear data flow with container style types (Box)', + 'description': 'md5:9aa2cdb6f9878ed4c39ec09e85a8150e', + 'thumbnail': r're:^https?:.*\.jpg$', + 'timestamp': 1481296768, + 'upload_date': '20161209', + 'duration': 304, + 'view_count': 0, + 'tags': ['javascript', 'free'], + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + lesson_id = self._match_id(url) - found = re.findall(r'(?s)\s*
  • https://www\.facebook\.com/video/embed.+?)\1', webpage) - if mobj is not None: - return mobj.group('url') - + def _extract_urls(webpage): + urls = [] + for mobj in re.finditer( + r']+?src=(["\'])(?Phttps?://www\.facebook\.com/(?:video/embed|plugins/video\.php).+?)\1', + webpage): + urls.append(mobj.group('url')) # Facebook API embed # see https://developers.facebook.com/docs/plugins/embedded-video-player - mobj = re.search(r'''(?x)]+ + for mobj in re.finditer(r'''(?x)]+ class=(?P[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+ - data-href=(?P[\'"])(?P(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''', webpage) - if mobj is not None: - return mobj.group('url') + data-href=(?P[\'"])(?P(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''', webpage): + urls.append(mobj.group('url')) + return urls def _login(self): (useremail, password) = self._get_login_info() diff --git a/youtube_dl/extractor/firsttv.py b/youtube_dl/extractor/firsttv.py index 081c718..4803a22 100644 --- a/youtube_dl/extractor/firsttv.py +++ b/youtube_dl/extractor/firsttv.py @@ -102,6 +102,8 @@ class FirstTVIE(InfoExtractor): 'format_id': f.get('name'), 'tbr': tbr, 'source_preference': quality(f.get('name')), + # quality metadata of http formats may be incorrect + 'preference': -1, }) # m3u8 URL format is reverse engineered from [1] (search for # master.m3u8). 
dashEdges (that is currently balancer-vod.1tv.ru) diff --git a/youtube_dl/extractor/fivetv.py b/youtube_dl/extractor/fivetv.py index 15736c9..9f98637 100644 --- a/youtube_dl/extractor/fivetv.py +++ b/youtube_dl/extractor/fivetv.py @@ -43,7 +43,7 @@ class FiveTVIE(InfoExtractor): 'info_dict': { 'id': 'glavnoe', 'ext': 'mp4', - 'title': 'Итоги недели с 8 по 14 июня 2015 года', + 'title': r're:^Итоги недели с \d+ по \d+ \w+ \d{4} года$', 'thumbnail': r're:^https?://.*\.jpg$', }, }, { @@ -70,7 +70,8 @@ class FiveTVIE(InfoExtractor): webpage = self._download_webpage(url, video_id) video_url = self._search_regex( - r']+?href="([^"]+)"[^>]+?class="videoplayer"', + [r']+?class="flowplayer[^>]+?data-href="([^"]+)"', + r']+?href="([^"]+)"[^>]+?class="videoplayer"'], webpage, 'video url') title = self._og_search_title(webpage, default=None) or self._search_regex( diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py index a8e1bf4..9f166ef 100644 --- a/youtube_dl/extractor/flickr.py +++ b/youtube_dl/extractor/flickr.py @@ -1,7 +1,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlencode +from ..compat import ( + compat_str, + compat_urllib_parse_urlencode, +) from ..utils import ( ExtractorError, int_or_none, @@ -81,7 +84,7 @@ class FlickrIE(InfoExtractor): formats = [] for stream in streams['stream']: - stream_type = str(stream.get('type')) + stream_type = compat_str(stream.get('type')) formats.append({ 'format_id': stream_type, 'url': stream['_content'], diff --git a/youtube_dl/extractor/fourtube.py b/youtube_dl/extractor/fourtube.py index 9776c84..ad273a0 100644 --- a/youtube_dl/extractor/fourtube.py +++ b/youtube_dl/extractor/fourtube.py @@ -3,39 +3,22 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( parse_duration, parse_iso8601, - sanitized_Request, str_to_int, ) -class 
FourTubeIE(InfoExtractor): - IE_NAME = '4tube' - _VALID_URL = r'https?://(?:www\.)?4tube\.com/videos/(?P\d+)' +class FourTubeBaseIE(InfoExtractor): + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + kind, video_id, display_id = mobj.group('kind', 'id', 'display_id') - _TEST = { - 'url': 'http://www.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black', - 'md5': '6516c8ac63b03de06bc8eac14362db4f', - 'info_dict': { - 'id': '209733', - 'ext': 'mp4', - 'title': 'Hot Babe Holly Michaels gets her ass stuffed by black', - 'uploader': 'WCP Club', - 'uploader_id': 'wcp-club', - 'upload_date': '20131031', - 'timestamp': 1383263892, - 'duration': 583, - 'view_count': int, - 'like_count': int, - 'categories': list, - 'age_limit': 18, - } - } + if kind == 'm' or not display_id: + url = self._URL_TEMPLATE % video_id - def _real_extract(self, url): - video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) title = self._html_search_meta('name', webpage) @@ -43,10 +26,10 @@ class FourTubeIE(InfoExtractor): 'uploadDate', webpage)) thumbnail = self._html_search_meta('thumbnailUrl', webpage) uploader_id = self._html_search_regex( - r'', + r'', webpage, 'uploader id', fatal=False) uploader = self._html_search_regex( - r'', + r'', webpage, 'uploader', fatal=False) categories_html = self._search_regex( @@ -60,10 +43,10 @@ class FourTubeIE(InfoExtractor): view_count = str_to_int(self._search_regex( r']+itemprop="interactionCount"[^>]+content="UserPlays:([0-9,]+)">', - webpage, 'view count', fatal=False)) + webpage, 'view count', default=None)) like_count = str_to_int(self._search_regex( r']+itemprop="interactionCount"[^>]+content="UserLikes:([0-9,]+)">', - webpage, 'like count', fatal=False)) + webpage, 'like count', default=None)) duration = parse_duration(self._html_search_meta('duration', webpage)) media_id = self._search_regex( @@ -85,14 +68,14 @@ class FourTubeIE(InfoExtractor): media_id = params[0] sources = 
['%s' % p for p in params[2]] - token_url = 'http://tkn.4tube.com/{0}/desktop/{1}'.format( + token_url = 'https://tkn.kodicdn.com/{0}/desktop/{1}'.format( media_id, '+'.join(sources)) - headers = { - b'Content-Type': b'application/x-www-form-urlencoded', - b'Origin': b'http://www.4tube.com', - } - token_req = sanitized_Request(token_url, b'{}', headers) - tokens = self._download_json(token_req, video_id) + + parsed_url = compat_urlparse.urlparse(url) + tokens = self._download_json(token_url, video_id, data=b'', headers={ + 'Origin': '%s://%s' % (parsed_url.scheme, parsed_url.hostname), + 'Referer': url, + }) formats = [{ 'url': tokens[format]['token'], 'format_id': format + 'p', @@ -115,3 +98,126 @@ class FourTubeIE(InfoExtractor): 'duration': duration, 'age_limit': 18, } + + +class FourTubeIE(FourTubeBaseIE): + IE_NAME = '4tube' + _VALID_URL = r'https?://(?:(?Pwww|m)\.)?4tube\.com/(?:videos|embed)/(?P\d+)(?:/(?P[^/?#&]+))?' + _URL_TEMPLATE = 'https://www.4tube.com/videos/%s/video' + _TESTS = [{ + 'url': 'http://www.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black', + 'md5': '6516c8ac63b03de06bc8eac14362db4f', + 'info_dict': { + 'id': '209733', + 'ext': 'mp4', + 'title': 'Hot Babe Holly Michaels gets her ass stuffed by black', + 'uploader': 'WCP Club', + 'uploader_id': 'wcp-club', + 'upload_date': '20131031', + 'timestamp': 1383263892, + 'duration': 583, + 'view_count': int, + 'like_count': int, + 'categories': list, + 'age_limit': 18, + }, + }, { + 'url': 'http://www.4tube.com/embed/209733', + 'only_matching': True, + }, { + 'url': 'http://m.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black', + 'only_matching': True, + }] + + +class FuxIE(FourTubeBaseIE): + _VALID_URL = r'https?://(?:(?Pwww|m)\.)?fux\.com/(?:video|embed)/(?P\d+)(?:/(?P[^/?#&]+))?' 
+ _URL_TEMPLATE = 'https://www.fux.com/video/%s/video' + _TESTS = [{ + 'url': 'https://www.fux.com/video/195359/awesome-fucking-kitchen-ends-cum-swallow', + 'info_dict': { + 'id': '195359', + 'ext': 'mp4', + 'title': 'Awesome fucking in the kitchen ends with cum swallow', + 'uploader': 'alenci2342', + 'uploader_id': 'alenci2342', + 'upload_date': '20131230', + 'timestamp': 1388361660, + 'duration': 289, + 'view_count': int, + 'like_count': int, + 'categories': list, + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.fux.com/embed/195359', + 'only_matching': True, + }, { + 'url': 'https://www.fux.com/video/195359/awesome-fucking-kitchen-ends-cum-swallow', + 'only_matching': True, + }] + + +class PornTubeIE(FourTubeBaseIE): + _VALID_URL = r'https?://(?:(?Pwww|m)\.)?porntube\.com/(?:videos/(?P[^/]+)_|embed/)(?P\d+)' + _URL_TEMPLATE = 'https://www.porntube.com/videos/video_%s' + _TESTS = [{ + 'url': 'https://www.porntube.com/videos/teen-couple-doing-anal_7089759', + 'info_dict': { + 'id': '7089759', + 'ext': 'mp4', + 'title': 'Teen couple doing anal', + 'uploader': 'Alexy', + 'uploader_id': 'Alexy', + 'upload_date': '20150606', + 'timestamp': 1433595647, + 'duration': 5052, + 'view_count': int, + 'like_count': int, + 'categories': list, + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.porntube.com/embed/7089759', + 'only_matching': True, + }, { + 'url': 'https://m.porntube.com/videos/teen-couple-doing-anal_7089759', + 'only_matching': True, + }] + + +class PornerBrosIE(FourTubeBaseIE): + _VALID_URL = r'https?://(?:(?Pwww|m)\.)?pornerbros\.com/(?:videos/(?P[^/]+)_|embed/)(?P\d+)' + _URL_TEMPLATE = 'https://www.pornerbros.com/videos/video_%s' + _TESTS = [{ + 'url': 'https://www.pornerbros.com/videos/skinny-brunette-takes-big-cock-down-her-anal-hole_181369', + 'md5': '6516c8ac63b03de06bc8eac14362db4f', + 'info_dict': { + 'id': '181369', + 'ext': 'mp4', + 'title': 'Skinny brunette 
takes big cock down her anal hole', + 'uploader': 'PornerBros HD', + 'uploader_id': 'pornerbros-hd', + 'upload_date': '20130130', + 'timestamp': 1359527401, + 'duration': 1224, + 'view_count': int, + 'categories': list, + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.pornerbros.com/embed/181369', + 'only_matching': True, + }, { + 'url': 'https://m.pornerbros.com/videos/skinny-brunette-takes-big-cock-down-her-anal-hole_181369', + 'only_matching': True, + }] diff --git a/youtube_dl/extractor/fox.py b/youtube_dl/extractor/fox.py index 159fdf9..facc665 100644 --- a/youtube_dl/extractor/fox.py +++ b/youtube_dl/extractor/fox.py @@ -3,56 +3,99 @@ from __future__ import unicode_literals from .adobepass import AdobePassIE from ..utils import ( - smuggle_url, - update_url_query, + int_or_none, + parse_age_limit, + parse_duration, + try_get, + unified_timestamp, ) class FOXIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?fox\.com/watch/(?P[0-9]+)' - _TEST = { - 'url': 'http://www.fox.com/watch/255180355939/7684182528', + _VALID_URL = r'https?://(?:www\.)?fox\.com/watch/(?P[\da-fA-F]+)' + _TESTS = [{ + # clip + 'url': 'https://www.fox.com/watch/4b765a60490325103ea69888fb2bd4e8/', 'md5': 'ebd296fcc41dd4b19f8115d8461a3165', 'info_dict': { - 'id': '255180355939', + 'id': '4b765a60490325103ea69888fb2bd4e8', 'ext': 'mp4', - 'title': 'Official Trailer: Gotham', - 'description': 'Tracing the rise of the great DC Comics Super-Villains and vigilantes, Gotham reveals an entirely new chapter that has never been told.', - 'duration': 129, - 'timestamp': 1400020798, - 'upload_date': '20140513', - 'uploader': 'NEWA-FNG-FOXCOM', + 'title': 'Aftermath: Bruce Wayne Develops Into The Dark Knight', + 'description': 'md5:549cd9c70d413adb32ce2a779b53b486', + 'duration': 102, + 'timestamp': 1504291893, + 'upload_date': '20170901', + 'creator': 'FOX', + 'series': 'Gotham', }, - 'add_ie': ['ThePlatform'], - } + 'params': { + 'skip_download': True, 
+ }, + }, { + # episode, geo-restricted + 'url': 'https://www.fox.com/watch/087036ca7f33c8eb79b08152b4dd75c1/', + 'only_matching': True, + }, { + # episode, geo-restricted, tv provided required + 'url': 'https://www.fox.com/watch/30056b295fb57f7452aeeb4920bc3024/', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - settings = self._parse_json(self._search_regex( - r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', - webpage, 'drupal settings'), video_id) - fox_pdk_player = settings['fox_pdk_player'] - release_url = fox_pdk_player['release_url'] - query = { - 'mbr': 'true', - 'switch': 'http' - } - if fox_pdk_player.get('access') == 'locked': - ap_p = settings['foxAdobePassProvider'] - rating = ap_p.get('videoRating') - if rating == 'n/a': - rating = None - resource = self._get_mvpd_resource('fbc-fox', None, ap_p['videoGUID'], rating) - query['auth'] = self._extract_mvpd_auth(url, video_id, 'fbc-fox', resource) - - info = self._search_json_ld(webpage, video_id, fatal=False) - info.update({ - '_type': 'url_transparent', - 'ie_key': 'ThePlatform', - 'url': smuggle_url(update_url_query(release_url, query), {'force_smil_url': True}), - 'id': video_id, - }) - return info + video = self._download_json( + 'https://api.fox.com/fbc-content/v1_4/video/%s' % video_id, + video_id, headers={ + 'apikey': 'abdcbed02c124d393b39e818a4312055', + 'Content-Type': 'application/json', + 'Referer': url, + }) + + title = video['name'] + + m3u8_url = self._download_json( + video['videoRelease']['url'], video_id)['playURL'] + + formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + self._sort_formats(formats) + + description = video.get('description') + duration = int_or_none(video.get('durationInSeconds')) or int_or_none( + video.get('duration')) or parse_duration(video.get('duration')) + timestamp = 
unified_timestamp(video.get('datePublished')) + age_limit = parse_age_limit(video.get('contentRating')) + + data = try_get( + video, lambda x: x['trackingData']['properties'], dict) or {} + + creator = data.get('brand') or data.get('network') or video.get('network') + + series = video.get('seriesName') or data.get( + 'seriesName') or data.get('show') + season_number = int_or_none(video.get('seasonNumber')) + episode = video.get('name') + episode_number = int_or_none(video.get('episodeNumber')) + release_year = int_or_none(video.get('releaseYear')) + + if data.get('authRequired'): + # TODO: AP + pass + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'age_limit': age_limit, + 'creator': creator, + 'series': series, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, + 'release_year': release_year, + 'formats': formats, + } diff --git a/youtube_dl/extractor/foxgay.py b/youtube_dl/extractor/foxgay.py index e887ae4..512a106 100644 --- a/youtube_dl/extractor/foxgay.py +++ b/youtube_dl/extractor/foxgay.py @@ -5,6 +5,7 @@ import itertools from .common import InfoExtractor from ..utils import ( get_element_by_id, + int_or_none, remove_end, ) @@ -46,7 +47,7 @@ class FoxgayIE(InfoExtractor): formats = [{ 'url': source, - 'height': resolution, + 'height': int_or_none(resolution), } for source, resolution in zip( video_data['sources'], video_data.get('resolutions', itertools.repeat(None)))] diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 546d5ca..2bcbb3e 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -112,7 +112,7 @@ class FranceTVBaseInfoExtractor(InfoExtractor): class FranceTVIE(FranceTVBaseInfoExtractor): - _VALID_URL = r'https?://(?:(?:www\.)?france\.tv|mobile\.france\.tv)/(?:[^/]+/)+(?P[^/]+)\.html' + _VALID_URL = 
r'https?://(?:(?:www\.)?france\.tv|mobile\.france\.tv)/(?:[^/]+/)*(?P[^/]+)\.html' _TESTS = [{ 'url': 'https://www.france.tv/france-2/13h15-le-dimanche/140921-les-mysteres-de-jesus.html', @@ -157,6 +157,9 @@ class FranceTVIE(FranceTVBaseInfoExtractor): }, { 'url': 'https://mobile.france.tv/france-5/c-dans-l-air/137347-emission-du-vendredi-12-mai-2017.html', 'only_matching': True, + }, { + 'url': 'https://www.france.tv/142749-rouge-sang.html', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index 4940936..f85e7de 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -1,10 +1,14 @@ from __future__ import unicode_literals -import json import re from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + unified_timestamp, +) class FunnyOrDieIE(InfoExtractor): @@ -18,6 +22,10 @@ class FunnyOrDieIE(InfoExtractor): 'title': 'Heart-Shaped Box: Literal Video Version', 'description': 'md5:ea09a01bc9a1c46d9ab696c01747c338', 'thumbnail': r're:^http:.*\.jpg$', + 'uploader': 'DASjr', + 'timestamp': 1317904928, + 'upload_date': '20111006', + 'duration': 318.3, }, }, { 'url': 'http://www.funnyordie.com/embed/e402820827', @@ -27,6 +35,8 @@ class FunnyOrDieIE(InfoExtractor): 'title': 'Please Use This Song (Jon Lajoie)', 'description': 'Please use this to sell something. 
www.jonlajoie.com', 'thumbnail': r're:^http:.*\.jpg$', + 'timestamp': 1398988800, + 'upload_date': '20140502', }, 'params': { 'skip_download': True, @@ -100,15 +110,53 @@ class FunnyOrDieIE(InfoExtractor): 'url': 'http://www.funnyordie.com%s' % src, }] - post_json = self._search_regex( - r'fb_post\s*=\s*(\{.*?\});', webpage, 'post details') - post = json.loads(post_json) + timestamp = unified_timestamp(self._html_search_meta( + 'uploadDate', webpage, 'timestamp', default=None)) + + uploader = self._html_search_regex( + r']+\bclass=["\']channel-preview-name[^>]+>(.+?)[^/]+)/(?P[^/]+)\.html?' - _TESTS = [ - { - 'url': 'http://www.gaskrank.tv/tv/motorrad-fun/strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden.htm', - 'md5': '1ae88dbac97887d85ebd1157a95fc4f9', - 'info_dict': { - 'id': '201601/26955', - 'ext': 'mp4', - 'title': 'Strike! Einparken können nur Männer - Flurschaden hält sich in Grenzen *lol*', - 'thumbnail': r're:^https?://.*\.jpg$', - 'categories': ['motorrad-fun'], - 'display_id': 'strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden', - 'uploader_id': 'Bikefun', - 'upload_date': '20170110', - 'uploader_url': None, - } - }, - { - 'url': 'http://www.gaskrank.tv/tv/racing/isle-of-man-tt-2011-michael-du-15920.htm', - 'md5': 'c33ee32c711bc6c8224bfcbe62b23095', - 'info_dict': { - 'id': '201106/15920', - 'ext': 'mp4', - 'title': 'Isle of Man - Michael Dunlop vs Guy Martin - schwindelig kucken', - 'thumbnail': r're:^https?://.*\.jpg$', - 'categories': ['racing'], - 'display_id': 'isle-of-man-tt-2011-michael-du-15920', - 'uploader_id': 'IOM', - 'upload_date': '20160506', - 'uploader_url': 'www.iomtt.com', - } + _VALID_URL = r'https?://(?:www\.)?gaskrank\.tv/tv/(?P[^/]+)/(?P[^/]+)\.htm' + _TESTS = [{ + 'url': 'http://www.gaskrank.tv/tv/motorrad-fun/strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden.htm', + 'md5': '1ae88dbac97887d85ebd1157a95fc4f9', + 'info_dict': { + 'id': '201601/26955', + 'ext': 'mp4', + 'title': 
'Strike! Einparken können nur Männer - Flurschaden hält sich in Grenzen *lol*', + 'thumbnail': r're:^https?://.*\.jpg$', + 'categories': ['motorrad-fun'], + 'display_id': 'strike-einparken-durch-anfaenger-crash-mit-groesserem-flurschaden', + 'uploader_id': 'Bikefun', + 'upload_date': '20170110', + 'uploader_url': None, } - ] + }, { + 'url': 'http://www.gaskrank.tv/tv/racing/isle-of-man-tt-2011-michael-du-15920.htm', + 'md5': 'c33ee32c711bc6c8224bfcbe62b23095', + 'info_dict': { + 'id': '201106/15920', + 'ext': 'mp4', + 'title': 'Isle of Man - Michael Dunlop vs Guy Martin - schwindelig kucken', + 'thumbnail': r're:^https?://.*\.jpg$', + 'categories': ['racing'], + 'display_id': 'isle-of-man-tt-2011-michael-du-15920', + 'uploader_id': 'IOM', + 'upload_date': '20170523', + 'uploader_url': 'www.iomtt.com', + } + }] def _real_extract(self, url): - """extract information from gaskrank.tv""" - def fix_json(code): - """Removes trailing comma in json: {{},} --> {{}}""" - return re.sub(r',\s*}', r'}', js_to_json(code)) - display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + title = self._og_search_title( + webpage, default=None) or self._html_search_meta( + 'title', webpage, fatal=True) + categories = [re.match(self._VALID_URL, url).group('categories')] - title = self._search_regex( - r'movieName\s*:\s*\'([^\']*)\'', - webpage, 'title') - thumbnail = self._search_regex( - r'poster\s*:\s*\'([^\']*)\'', - webpage, 'thumbnail', default=None) mobj = re.search( r'Video von:\s*(?P[^|]*?)\s*\|\s*vom:\s*(?P[0-9][0-9]\.[0-9][0-9]\.[0-9][0-9][0-9][0-9])', @@ -89,29 +79,14 @@ class GaskrankIE(InfoExtractor): if average_rating: average_rating = float_or_none(average_rating.replace(',', '.')) - playlist = self._parse_json( - self._search_regex( - r'playlist\s*:\s*\[([^\]]*)\]', - webpage, 'playlist', default='{}'), - display_id, transform_source=fix_json, fatal=False) - video_id = self._search_regex( 
r'https?://movies\.gaskrank\.tv/([^-]*?)(-[^\.]*)?\.mp4', - playlist.get('0').get('src'), 'video id') - - formats = [] - for key in playlist: - formats.append({ - 'url': playlist[key]['src'], - 'format_id': key, - 'quality': playlist[key].get('quality')}) - self._sort_formats(formats, field_preference=['format_id']) + webpage, 'video id', default=display_id) - return { + entry = self._parse_html5_media_entries(url, webpage, video_id)[0] + entry.update({ 'id': video_id, 'title': title, - 'formats': formats, - 'thumbnail': thumbnail, 'categories': categories, 'display_id': display_id, 'uploader_id': uploader_id, @@ -120,4 +95,7 @@ class GaskrankIE(InfoExtractor): 'tags': tags, 'view_count': view_count, 'average_rating': average_rating, - } + }) + self._sort_formats(entry['formats']) + + return entry diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index c108d4a..7d0edf0 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -10,6 +10,7 @@ from .common import InfoExtractor from .youtube import YoutubeIE from ..compat import ( compat_etree_fromstring, + compat_str, compat_urllib_parse_unquote, compat_urlparse, compat_xml_parse_error, @@ -35,6 +36,10 @@ from .brightcove import ( BrightcoveLegacyIE, BrightcoveNewIE, ) +from .nexx import ( + NexxIE, + NexxEmbedIE, +) from .nbc import NBCSportsVPlayerIE from .ooyala import OoyalaIE from .rutv import RUTVIE @@ -56,6 +61,7 @@ from .dailymotion import ( DailymotionIE, DailymotionCloudIE, ) +from .dailymail import DailyMailIE from .onionstudios import OnionStudiosIE from .viewlift import ViewLiftEmbedIE from .mtv import MTVServicesEmbeddedIE @@ -90,6 +96,9 @@ from .anvato import AnvatoIE from .washingtonpost import WashingtonPostIE from .wistia import WistiaIE from .mediaset import MediasetIE +from .joj import JojIE +from .megaphone import MegaphoneIE +from .vzaar import VzaarIE class GenericIE(InfoExtractor): @@ -567,6 +576,19 @@ class GenericIE(InfoExtractor): }, 
'skip': 'movie expired', }, + # ooyala video embedded with http://player.ooyala.com/static/v4/production/latest/core.min.js + { + 'url': 'http://wnep.com/2017/07/22/steampunk-fest-comes-to-honesdale/', + 'info_dict': { + 'id': 'lwYWYxYzE6V5uJMjNGyKtwwiw9ZJD7t2', + 'ext': 'mp4', + 'title': 'Steampunk Fest Comes to Honesdale', + 'duration': 43.276, + }, + 'params': { + 'skip_download': True, + } + }, # embed.ly video { 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/', @@ -758,6 +780,20 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Dailymotion'], }, + # DailyMail embed + { + 'url': 'http://www.bumm.sk/krimi/2017/07/05/biztonsagi-kamera-buktatta-le-az-agg-ferfit-utlegelo-apolot', + 'info_dict': { + 'id': '1495629', + 'ext': 'mp4', + 'title': 'Care worker punches elderly dementia patient in head 11 times', + 'description': 'md5:3a743dee84e57e48ec68bf67113199a5', + }, + 'add_ie': ['DailyMail'], + 'params': { + 'skip_download': True, + }, + }, # YouTube embed { 'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html', @@ -1184,7 +1220,7 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Kaltura'], }, - # Eagle.Platform embed (generic URL) + # EaglePlatform embed (generic URL) { 'url': 'http://lenta.ru/news/2015/03/06/navalny/', # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used @@ -1198,8 +1234,26 @@ class GenericIE(InfoExtractor): 'view_count': int, 'age_limit': 0, }, + 'params': { + 'skip_download': True, + }, + }, + # referrer protected EaglePlatform embed + { + 'url': 'https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/', + 'info_dict': { + 'id': '582306', + 'ext': 'mp4', + 'title': 'Стас Намин: «Мы нарушили девственность Кремля»', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 3382, + 'view_count': int, + }, + 'params': { + 'skip_download': True, + }, }, - # ClipYou (Eagle.Platform) embed (custom URL) + # ClipYou 
(EaglePlatform) embed (custom URL) { 'url': 'http://muz-tv.ru/play/7129/', # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used @@ -1211,6 +1265,9 @@ class GenericIE(InfoExtractor): 'duration': 216, 'view_count': int, }, + 'params': { + 'skip_download': True, + }, }, # Pladform embed { @@ -1462,14 +1519,27 @@ class GenericIE(InfoExtractor): # LiveLeak embed { 'url': 'http://www.wykop.pl/link/3088787/', - 'md5': 'ace83b9ed19b21f68e1b50e844fdf95d', + 'md5': '7619da8c820e835bef21a1efa2a0fc71', 'info_dict': { 'id': '874_1459135191', 'ext': 'mp4', 'title': 'Man shows poor quality of new apartment building', 'description': 'The wall is like a sand pile.', 'uploader': 'Lake8737', - } + }, + 'add_ie': [LiveLeakIE.ie_key()], + }, + # Another LiveLeak embed pattern (#13336) + { + 'url': 'https://milo.yiannopoulos.net/2017/06/concealed-carry-robbery/', + 'info_dict': { + 'id': '2eb_1496309988', + 'ext': 'mp4', + 'title': 'Thief robs place where everyone was armed', + 'description': 'md5:694d73ee79e535953cf2488562288eee', + 'uploader': 'brazilwtf', + }, + 'add_ie': [LiveLeakIE.ie_key()], }, # Duplicated embedded video URLs { @@ -1511,6 +1581,22 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['BrightcoveLegacy'], }, + # Nexx embed + { + 'url': 'https://www.funk.net/serien/5940e15073f6120001657956/items/593efbb173f6120001657503', + 'info_dict': { + 'id': '247746', + 'ext': 'mp4', + 'title': "Yesterday's Jam (OV)", + 'description': 'md5:09bc0984723fed34e2581624a84e05f0', + 'timestamp': 1492594816, + 'upload_date': '20170419', + }, + 'params': { + 'format': 'bestvideo', + 'skip_download': True, + }, + }, # Facebook