X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/453698570f26bebd37b39df8537d993b57d77b8b..3cd33af594d34a8809318ebd0be4d606d212a7a0:/youtube_dl/extractor/generic.py?ds=inline diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 7a5bf93..9868ca6 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1,14 +1,16 @@ -# encoding: utf-8 +# coding: utf-8 from __future__ import unicode_literals import os import re +import sys from .common import InfoExtractor from .youtube import YoutubeIE from ..compat import ( - compat_urllib_parse, + compat_etree_fromstring, + compat_urllib_parse_unquote, compat_urlparse, compat_xml_parse_error, ) @@ -17,20 +19,71 @@ from ..utils import ( ExtractorError, float_or_none, HEADRequest, + is_html, + js_to_json, orderedSet, - parse_xml, + sanitized_Request, smuggle_url, unescapeHTML, unified_strdate, unsmuggle_url, UnsupportedError, - url_basename, + xpath_text, ) -from .brightcove import BrightcoveIE +from .commonprotocols import RtmpIE +from .brightcove import ( + BrightcoveLegacyIE, + BrightcoveNewIE, +) +from .nbc import NBCSportsVPlayerIE from .ooyala import OoyalaIE from .rutv import RUTVIE +from .tvc import TVCIE +from .sportbox import SportBoxEmbedIE from .smotri import SmotriIE +from .myvi import MyviIE from .condenast import CondeNastIE +from .udn import UDNEmbedIE +from .senateisvp import SenateISVPIE +from .svt import SVTIE +from .pornhub import PornHubIE +from .xhamster import XHamsterEmbedIE +from .tnaflix import TNAFlixNetworkEmbedIE +from .drtuber import DrTuberIE +from .redtube import RedTubeIE +from .vimeo import VimeoIE +from .dailymotion import ( + DailymotionIE, + DailymotionCloudIE, +) +from .onionstudios import OnionStudiosIE +from .viewlift import ViewLiftEmbedIE +from .mtv import MTVServicesEmbeddedIE +from .pladform import PladformIE +from .videomore import VideomoreIE +from .webcaster import WebcasterFeedIE +from .googledrive import GoogleDriveIE +from .jwplatform import JWPlatformIE +from .digiteka import DigitekaIE +from .arkena import ArkenaIE +from .instagram import InstagramIE +from .liveleak import LiveLeakIE +from .threeqsdn import ThreeQSDNIE +from .theplatform import ThePlatformIE +from .vessel import VesselIE +from .kaltura import KalturaIE +from .eagleplatform import EaglePlatformIE +from .facebook import FacebookIE +from .soundcloud import SoundcloudIE +from .tunein import TuneInBaseIE +from .vbox7 import Vbox7IE +from .dbtv import DBTVIE +from .piksel import PikselIE +from .videa import VideaIE +from .twentymin import TwentyMinutenIE +from .ustream import UstreamIE +from .openload import OpenloadIE +from .videopress import VideoPressIE class GenericIE(InfoExtractor): @@ -38,6 +91,245 @@ class GenericIE(InfoExtractor): _VALID_URL = r'.*' IE_NAME = 'generic' _TESTS = [ + # Direct link to a video + { + 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4', + 'md5': '67d406c2bcb6af27fa886f31aa934bbe', + 'info_dict': { + 'id': 'trailer', + 'ext': 'mp4', + 'title': 'trailer', + 'upload_date': '20100513', + } + }, + # Direct link to media delivered compressed (until Accept-Encoding is *) + { + 'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac', + 'md5': '128c42e68b13950268b648275386fc74', + 'info_dict': { + 'id': 'FictionJunction-Parallel_Hearts', + 'ext': 'flac', + 'title': 'FictionJunction-Parallel_Hearts', + 'upload_date': '20140522', + }, + 'expected_warnings': [ + 'URL could be a direct video link, returning it as such.' + ], + 'skip': 'URL invalid', + }, + # Direct download with broken HEAD + { + 'url': 'http://ai-radio.org:8000/radio.opus', + 'info_dict': { + 'id': 'radio', + 'ext': 'opus', + 'title': 'radio', + }, + 'params': { + 'skip_download': True, # infinite live stream + }, + 'expected_warnings': [ + r'501.*Not Implemented', + r'400.*Bad Request', + ], + }, + # Direct link with incorrect MIME type + { + 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', + 'md5': '4ccbebe5f36706d85221f204d7eb5913', + 'info_dict': { + 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', + 'id': '5_Lennart_Poettering_-_Systemd', + 'ext': 'webm', + 'title': '5_Lennart_Poettering_-_Systemd', + 'upload_date': '20141120', + }, + 'expected_warnings': [ + 'URL could be a direct video link, returning it as such.' + ] + }, + # RSS feed + { + 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml', + 'info_dict': { + 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml', + 'title': 'Zero Punctuation', + 'description': 're:.*groundbreaking video review series.*' + }, + 'playlist_mincount': 11, + }, + # RSS feed with enclosure + { + 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', + 'info_dict': { + 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624', + 'ext': 'm4v', + 'upload_date': '20150228', + 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624', + } + }, + # SMIL from http://videolectures.net/promogram_igor_mekjavic_eng + { + 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/video/1/smil.xml', + 'info_dict': { + 'id': 'smil', + 'ext': 'mp4', + 'title': 'Automatics, robotics and biocybernetics', + 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', + 'upload_date': '20130627', + 'formats': 'mincount:16', + 'subtitles': 'mincount:1', + }, + 'params': { + 'force_generic_extractor': True, + 'skip_download': True, + }, + }, + # SMIL from http://www1.wdr.de/mediathek/video/livestream/index.html + { + 'url': 'http://metafilegenerator.de/WDR/WDR_FS/hds/hds.smil', + 'info_dict': { + 'id': 'hds', + 'ext': 'flv', + 'title': 'hds', + 'formats': 'mincount:1', + }, + 'params': { + 'skip_download': True, + }, + }, + # SMIL from https://www.restudy.dk/video/play/id/1637 + { + 'url': 'https://www.restudy.dk/awsmedia/SmilDirectory/video_1637.xml', + 'info_dict': { + 'id': 'video_1637', + 'ext': 'flv', + 'title': 'video_1637', + 'formats': 'mincount:3', + }, + 'params': { + 'skip_download': True, + }, + }, + # SMIL from http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm + { + 'url': 'http://services.media.howstuffworks.com/videos/450221/smil-service.smil', + 'info_dict': { + 'id': 'smil-service', + 'ext': 'flv', + 'title': 'smil-service', + 'formats': 'mincount:1', + }, + 'params': { + 'skip_download': True, + }, + }, + # SMIL from http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370 + { + 'url': 'http://api.new.livestream.com/accounts/1570303/events/1585861/videos/4719370.smil', + 'info_dict': { + 'id': '4719370', + 'ext': 'mp4', + 'title': '571de1fd-47bc-48db-abf9-238872a58d1f', + 'formats': 'mincount:3', + }, + 'params': { + 'skip_download': True, + }, + }, + # XSPF playlist from http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html + { + 'url': 'http://www.telegraaf.nl/xml/playlist/2015/8/7/mZlp2ctYIUEB.xspf', + 'info_dict': { + 'id': 'mZlp2ctYIUEB', + 'ext': 'mp4', + 'title': 'Tikibad ontruimd wegens brand', + 'description': 'md5:05ca046ff47b931f9b04855015e163a4', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 33, + }, + 'params': { + 'skip_download': True, + }, + }, + # MPD from http://dash-mse-test.appspot.com/media.html + { + 'url': 'http://yt-dash-mse-test.commondatastorage.googleapis.com/media/car-20120827-manifest.mpd', + 'md5': '4b57baab2e30d6eb3a6a09f0ba57ef53', + 'info_dict': { + 'id': 'car-20120827-manifest', + 'ext': 'mp4', + 'title': 'car-20120827-manifest', + 'formats': 'mincount:9', + 'upload_date': '20130904', + }, + 'params': { + 'format': 'bestvideo', + }, + }, + # m3u8 served with Content-Type: audio/x-mpegURL; charset=utf-8 + { + 'url': 'http://once.unicornmedia.com/now/master/playlist/bb0b18ba-64f5-4b1b-a29f-0ac252f06b68/77a785f3-5188-4806-b788-0893a61634ed/93677179-2d99-4ef4-9e17-fe70d49abfbf/content.m3u8', + 'info_dict': { + 'id': 'content', + 'ext': 'mp4', + 'title': 'content', + 'formats': 'mincount:8', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, + 'skip': 'video gone', + }, + # m3u8 served with Content-Type: text/plain + { + 'url': 'http://www.nacentapps.com/m3u8/index.m3u8', + 'info_dict': { + 'id': 'index', + 'ext': 'mp4', + 'title': 'index', + 'upload_date': '20140720', + 'formats': 'mincount:11', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, + 'skip': 'video gone', + }, + # google redirect + { + 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', + 'info_dict': { + 'id': 'cmQHVoWB5FY', + 'ext': 'mp4', + 'upload_date': '20130224', + 'uploader_id': 'TheVerge', + 'description': r're:^Chris Ziegler takes a look at the\.*', + 'uploader': 'The Verge', + 'title': 'First Firefox OS phones side-by-side', + }, + 'params': { + 'skip_download': False, + } + }, + { + # redirect in Refresh HTTP header + 'url': 'https://www.facebook.com/l.php?u=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DpO8h3EaFRdo&h=TAQHsoToz&enc=AZN16h-b6o4Zq9pZkCCdOLNKMN96BbGMNtcFwHSaazus4JHT_MFYkAA-WARTX2kvsCIdlAIyHZjl6d33ILIJU7Jzwk_K3mcenAXoAzBNoZDI_Q7EXGDJnIhrGkLXo_LJ_pAa2Jzbx17UHMd3jAs--6j2zaeto5w9RTn8T_1kKg3fdC5WPX9Dbb18vzH7YFX0eSJmoa6SP114rvlkw6pkS1-T&s=1', + 'info_dict': { + 'id': 'pO8h3EaFRdo', + 'ext': 'mp4', + 'title': 'Tripeo Boiler Room x Dekmantel Festival DJ Set', + 'description': 'md5:6294cc1af09c4049e0652b51a2df10d5', + 'upload_date': '20150917', + 'uploader_id': 'brtvofficial', + 'uploader': 'Boiler Room', + }, + 'params': { + 'skip_download': False, + }, + }, { 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html', 'md5': '85b90ccc9d73b4acd9138d3af4c27f89', @@ -60,11 +352,11 @@ class GenericIE(InfoExtractor): }, 'skip': 'There is a limit of 200 free downloads / month for the test song', }, - # embedded brightcove video - # it also tests brightcove videos that need to set the 'Referer' in the - # http requests { - 'add_ie': ['Brightcove'], + # embedded brightcove video + # it also tests brightcove videos that need to set the 'Referer' + # in the http requests + 'add_ie': ['BrightcoveLegacy'], 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/', 'info_dict': { 'id': '2765128793001', @@ -77,6 +369,24 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, + { + # embedded with itemprop embedURL and video id spelled as `idVideo` + 'add_id': ['BrightcoveLegacy'], + 'url': 'http://bfmbusiness.bfmtv.com/mediaplayer/chroniques/olivier-delamarche/', + 'info_dict': { + 'id': '5255628253001', + 'ext': 'mp4', + 'title': 'md5:37c519b1128915607601e75a87995fc0', + 'description': 'md5:37f7f888b434bb8f8cc8dbd4f7a4cf26', + 'uploader': 'BFM BUSINESS', + 'uploader_id': '876450612001', + 'timestamp': 1482255315, + 'upload_date': '20161220', + }, + 'params': { + 'skip_download': True, + }, + }, { # https://github.com/rg3/youtube-dl/issues/2253 'url': 'http://bcove.me/i6nfkrc3', @@ -88,7 +398,8 @@ class GenericIE(InfoExtractor): 'uploader': 'thestar.com', 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.', }, - 'add_ie': ['Brightcove'], + 'add_ie': ['BrightcoveLegacy'], + 'skip': 'video gone', }, { 'url': 'http://www.championat.com/video/football/v/87/87499.html', @@ -103,7 +414,7 @@ class GenericIE(InfoExtractor): }, { # https://github.com/rg3/youtube-dl/issues/3541 - 'add_ie': ['Brightcove'], + 'add_ie': ['BrightcoveLegacy'], 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1', 'info_dict': { 'id': '3866516442001', @@ -117,16 +428,25 @@ class GenericIE(InfoExtractor): 'skip_download': True, # m3u8 download }, }, - # Direct link to a video { - 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4', - 'md5': '67d406c2bcb6af27fa886f31aa934bbe', + # Brightcove with alternative playerID key + 'url': 'http://www.nature.com/nmeth/journal/v9/n7/fig_tab/nmeth.2062_SV1.html', 'info_dict': { - 'id': 'trailer', - 'ext': 'mp4', - 'title': 'trailer', - 'upload_date': '20100513', - } + 'id': 'nmeth.2062_SV1', + 'title': 'Simultaneous multiview imaging of the Drosophila syncytial blastoderm : Quantitative high-speed imaging of entire developing embryos with simultaneous multiview light-sheet microscopy : Nature Methods : Nature Research', + }, + 'playlist': [{ + 'info_dict': { + 'id': '2228375078001', + 'ext': 'mp4', + 'title': 'nmeth.2062-sv1', + 'description': 'nmeth.2062-sv1', + 'timestamp': 1363357591, + 'upload_date': '20130315', + 'uploader': 'Nature Publishing Group', + 'uploader_id': '1964492299001', + }, + }], }, # ooyala video { @@ -136,24 +456,24 @@ class GenericIE(InfoExtractor): 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ', 'ext': 'mp4', 'title': '2cc213299525360.mov', # that's what we get + 'duration': 238.231, }, 'add_ie': ['Ooyala'], }, - # google redirect { - 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', + # ooyala video embedded with http://player.ooyala.com/iframe.js + 'url': 'http://www.macrumors.com/2015/07/24/steve-jobs-the-man-in-the-machine-first-trailer/', 'info_dict': { - 'id': 'cmQHVoWB5FY', + 'id': 'p0MGJndjoG5SOKqO_hZJuZFPB-Tr5VgB', 'ext': 'mp4', - 'upload_date': '20130224', - 'uploader_id': 'TheVerge', - 'description': 're:^Chris Ziegler takes a look at the\.*', - 'uploader': 'The Verge', - 'title': 'First Firefox OS phones side-by-side', + 'title': '"Steve Jobs: Man in the Machine" trailer', + 'description': 'The first trailer for the Alex Gibney documentary "Steve Jobs: Man in the Machine."', + 'duration': 135.427, }, 'params': { - 'skip_download': False, - } + 'skip_download': True, + }, + 'skip': 'movie expired', }, # embed.ly video { @@ -181,14 +501,8 @@ class GenericIE(InfoExtractor): 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama', 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.', }, - }, - # BBC iPlayer embeds - { - 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER', - 'info_dict': { - 'title': 'BBC - Blogs - Adam Curtis - BUGGER', - }, - 'playlist_mincount': 18, + # HEAD requests lead to endless 301, while GET is OK + 'expected_warnings': ['301'], }, # RUTV embed { @@ -204,6 +518,69 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, + # TVC embed + { + 'url': 'http://sch1298sz.mskobr.ru/dou_edu/karamel_ki/filial_galleries/video/iframe_src_http_tvc_ru_video_iframe_id_55304_isplay_false_acc_video_id_channel_brand_id_11_show_episodes_episode_id_32307_frameb/', + 'info_dict': { + 'id': '55304', + 'ext': 'mp4', + 'title': 'Дошкольное воспитание', + }, + }, + # SportBox embed + { + 'url': 'http://www.vestifinance.ru/articles/25753', + 'info_dict': { + 'id': '25753', + 'title': 'Прямые трансляции с Форума-выставки "Госзаказ-2013"', + }, + 'playlist': [{ + 'info_dict': { + 'id': '370908', + 'title': 'Госзаказ. День 3', + 'ext': 'mp4', + } + }, { + 'info_dict': { + 'id': '370905', + 'title': 'Госзаказ. День 2', + 'ext': 'mp4', + } + }, { + 'info_dict': { + 'id': '370902', + 'title': 'Госзаказ. День 1', + 'ext': 'mp4', + } + }], + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, + # Myvi.ru embed + { + 'url': 'http://www.kinomyvi.tv/news/detail/Pervij-dublirovannij-trejler--Uzhastikov-_nOw1', + 'info_dict': { + 'id': 'f4dafcad-ff21-423d-89b5-146cfd89fa1e', + 'ext': 'mp4', + 'title': 'Ужастики, русский трейлер (2015)', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 153, + } + }, + # XHamster embed + { + 'url': 'http://www.numisc.com/forum/showthread.php?11696-FM15-which-pumiscer-was-this-%28-vid-%29-%28-alfa-as-fuck-srx-%29&s=711f5db534502e22260dec8c5e2d66d8', + 'info_dict': { + 'id': 'showthread', + 'title': '[NSFL] [FM15] which pumiscer was this ( vid ) ( alfa as fuck srx )', + }, + 'playlist_mincount': 7, + # This forum does not allow