X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/47d80ec0b18245caeb97018d4c1af18d0b5b972b..dbb40c34b0cf975d3c249affe5fc8865fd7a3d5d:/youtube_dl/extractor/generic.py diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 3949c8b..274f817 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -20,6 +20,7 @@ from ..utils import ( float_or_none, HEADRequest, is_html, + js_to_json, orderedSet, sanitized_Request, smuggle_url, @@ -29,6 +30,7 @@ from ..utils import ( UnsupportedError, xpath_text, ) +from .commonprotocols import RtmpIE from .brightcove import ( BrightcoveLegacyIE, BrightcoveNewIE, @@ -73,8 +75,16 @@ from .kaltura import KalturaIE from .eagleplatform import EaglePlatformIE from .facebook import FacebookIE from .soundcloud import SoundcloudIE +from .tunein import TuneInBaseIE from .vbox7 import Vbox7IE from .dbtv import DBTVIE +from .piksel import PikselIE +from .videa import VideaIE +from .twentymin import TwentyMinutenIE +from .ustream import UstreamIE +from .openload import OpenloadIE +from .videopress import VideoPressIE +from .rutube import RutubeIE class GenericIE(InfoExtractor): @@ -236,7 +246,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'Tikibad ontruimd wegens brand', 'description': 'md5:05ca046ff47b931f9b04855015e163a4', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 33, }, 'params': { @@ -297,7 +307,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'upload_date': '20130224', 'uploader_id': 'TheVerge', - 'description': 're:^Chris Ziegler takes a look at the\.*', + 'description': r're:^Chris Ziegler takes a look at the\.*', 'uploader': 'The Verge', 'title': 'First Firefox OS phones side-by-side', }, @@ -343,10 +353,10 @@ class GenericIE(InfoExtractor): }, 'skip': 'There is a limit of 200 free downloads / month for the test song', }, - # embedded brightcove video - # it also tests brightcove videos that need to set the 'Referer' in the - # http requests { + # embedded brightcove video + # it also tests brightcove videos that need to set the 'Referer' + # in the http requests 'add_ie': ['BrightcoveLegacy'], 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/', 'info_dict': { @@ -360,6 +370,24 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, + { + # embedded with itemprop embedURL and video id spelled as `idVideo` + 'add_id': ['BrightcoveLegacy'], + 'url': 'http://bfmbusiness.bfmtv.com/mediaplayer/chroniques/olivier-delamarche/', + 'info_dict': { + 'id': '5255628253001', + 'ext': 'mp4', + 'title': 'md5:37c519b1128915607601e75a87995fc0', + 'description': 'md5:37f7f888b434bb8f8cc8dbd4f7a4cf26', + 'uploader': 'BFM BUSINESS', + 'uploader_id': '876450612001', + 'timestamp': 1482255315, + 'upload_date': '20161220', + }, + 'params': { + 'skip_download': True, + }, + }, { # https://github.com/rg3/youtube-dl/issues/2253 'url': 'http://bcove.me/i6nfkrc3', @@ -401,6 +429,43 @@ class GenericIE(InfoExtractor): 'skip_download': True, # m3u8 download }, }, + { + # Brightcove with alternative playerID key + 'url': 'http://www.nature.com/nmeth/journal/v9/n7/fig_tab/nmeth.2062_SV1.html', + 'info_dict': { + 'id': 'nmeth.2062_SV1', + 'title': 'Simultaneous multiview imaging of the Drosophila syncytial blastoderm : Quantitative high-speed imaging of entire developing embryos with simultaneous multiview light-sheet microscopy : Nature Methods : Nature Research', + }, + 'playlist': [{ + 'info_dict': { + 'id': '2228375078001', + 'ext': 'mp4', + 'title': 'nmeth.2062-sv1', + 'description': 'nmeth.2062-sv1', + 'timestamp': 1363357591, + 'upload_date': '20130315', + 'uploader': 'Nature Publishing Group', + 'uploader_id': '1964492299001', + }, + }], + }, + { + # Brightcove with UUID in videoPlayer + 'url': 'http://www8.hp.com/cn/zh/home.html', + 'info_dict': { + 'id': '5255815316001', + 'ext': 'mp4', + 'title': 'Sprocket Video - China', + 'description': 'Sprocket Video - China', + 'uploader': 'HP-Video Gallery', + 'timestamp': 1482263210, + 'upload_date': '20161220', + 'uploader_id': '1107601872001', + }, + 'params': { + 'skip_download': True, # m3u8 download + }, + }, # ooyala video { 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219', @@ -518,7 +583,7 @@ class GenericIE(InfoExtractor): 'id': 'f4dafcad-ff21-423d-89b5-146cfd89fa1e', 'ext': 'mp4', 'title': 'УжаÑÑики, ÑÑÑÑкий ÑÑÐµÐ¹Ð»ÐµÑ (2015)', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 153, } }, @@ -546,17 +611,6 @@ class GenericIE(InfoExtractor): 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9', } }, - # Embedded Ustream video - { - 'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm', - 'md5': '27b99cdb639c9b12a79bca876a073417', - 'info_dict': { - 'id': '45734260', - 'ext': 'flv', - 'uploader': 'AU SPA: The NSA and Privacy', - 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman' - } - }, # nowvideo embed hidden behind percent encoding { 'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/', @@ -738,7 +792,7 @@ class GenericIE(InfoExtractor): 'duration': 48, 'timestamp': 1401537900, 'upload_date': '20140531', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, }, # Wistia embed @@ -808,6 +862,21 @@ class GenericIE(InfoExtractor): }, 'playlist_mincount': 7, }, + # TuneIn station embed + { + 'url': 'http://radiocnrv.com/promouvoir-radio-cnrv/', + 'info_dict': { + 'id': '204146', + 'ext': 'mp3', + 'title': 'CNRV', + 'location': 'Paris, France', + 'is_live': True, + }, + 'params': { + # Live stream + 'skip_download': True, + }, + }, # Livestream embed { 'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast', @@ -833,12 +902,13 @@ class GenericIE(InfoExtractor): }, # LazyYT { - 'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986', + 'url': 'https://skiplagged.com/', 'info_dict': { - 'id': '1986', - 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse', + 'id': 'skiplagged', + 'title': 'Skiplagged: The smart way to find cheap flights', }, - 'playlist_mincount': 2, + 'playlist_mincount': 1, + 'add_ie': ['Youtube'], }, # Cinchcast embed { @@ -898,6 +968,43 @@ class GenericIE(InfoExtractor): 'title': 'Webinar: Using Discovery, The National Archivesâ online catalogue', }, }, + # jwplayer rtmp + { + 'url': 'http://www.suffolk.edu/sjc/', + 'info_dict': { + 'id': 'sjclive', + 'ext': 'flv', + 'title': 'Massachusetts Supreme Judicial Court Oral Arguments', + 'uploader': 'www.suffolk.edu', + }, + 'params': { + 'skip_download': True, + } + }, + # Complex jwplayer + { + 'url': 'http://www.indiedb.com/games/king-machine/videos', + 'info_dict': { + 'id': 'videos', + 'ext': 'mp4', + 'title': 'king machine trailer 1', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + }, + { + # JWPlayer config passed as variable + 'url': 'http://www.txxx.com/videos/3326530/ariele/', + 'info_dict': { + 'id': '3326530_hq', + 'ext': 'mp4', + 'title': 'ARIELE | Tube Cup', + 'uploader': 'www.txxx.com', + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + } + }, # rtl.nl embed { 'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen', @@ -928,19 +1035,6 @@ class GenericIE(InfoExtractor): 'title': 'Os Guinness // Is It Fools Talk? // Unbelievable? Conference 2014', }, }, - # Kaltura embed protected with referrer - { - 'url': 'http://www.disney.nl/disney-channel/filmpjes/achter-de-schermen#/videoId/violetta-achter-de-schermen-ruggero', - 'info_dict': { - 'id': '1_g4fbemnq', - 'ext': 'mp4', - 'title': 'Violetta - Achter De Schermen - Ruggero', - 'description': 'Achter de schermen met Ruggero', - 'timestamp': 1435133761, - 'upload_date': '20150624', - 'uploader_id': 'echojecka', - }, - }, # Kaltura embed with single quotes { 'url': 'http://fod.infobase.com/p_ViewPlaylist.aspx?AssignmentID=NUN8ZY', @@ -972,6 +1066,20 @@ class GenericIE(InfoExtractor): 'skip_download': True, } }, + { + # Kaltura embedded, some fileExt broken (#11480) + 'url': 'http://www.cornell.edu/video/nima-arkani-hamed-standard-models-of-particle-physics', + 'info_dict': { + 'id': '1_sgtvehim', + 'ext': 'mp4', + 'title': 'Our "Standard Models" of particle physics and cosmology', + 'description': 'md5:67ea74807b8c4fea92a6f38d6d323861', + 'timestamp': 1321158993, + 'upload_date': '20111113', + 'uploader_id': 'kps1', + }, + 'add_ie': ['Kaltura'], + }, # Eagle.Platform embed (generic URL) { 'url': 'http://lenta.ru/news/2015/03/06/navalny/', @@ -981,7 +1089,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'ÐавалÑнÑй вÑÑел на ÑвободÑ', 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 87, 'view_count': int, 'age_limit': 0, @@ -995,7 +1103,7 @@ class GenericIE(InfoExtractor): 'id': '12820', 'ext': 'mp4', 'title': "'O Sole Mio", - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 216, 'view_count': int, }, @@ -1008,7 +1116,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'Ð¢Ð°Ð¹Ð½Ñ Ð¿ÐµÑевала ÐÑÑлова ⢠1 ÑеÑÐ¸Ñ 2 ÑаÑÑÑ', 'description': 'ÐокÑменÑалÑнÑй ÑеÑиал-ÑаÑÑледование одной из ÑамÑÑ Ð¶ÑÑÐºÐ¸Ñ Ñайн ХХ века', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 694, 'age_limit': 0, }, @@ -1020,7 +1128,7 @@ class GenericIE(InfoExtractor): 'id': '3519514', 'ext': 'mp4', 'title': 'Joe Dirt 2 Beautiful Loser Teaser Trailer', - 'thumbnail': 're:^https?://.*\.png$', + 'thumbnail': r're:^https?://.*\.png$', 'duration': 45.115, }, }, @@ -1103,7 +1211,7 @@ class GenericIE(InfoExtractor): 'id': '300346', 'ext': 'mp4', 'title': 'ä¸ä¸ä¸ç·å¸«è®æ§ å ¨æ ¡å¸«çåæº', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, 'params': { # m3u8 download @@ -1149,7 +1257,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'Sauvons les abeilles ! - Le débat', 'description': 'md5:d9082128b1c5277987825d684939ca26', - 'thumbnail': 're:^https?://.*\.jpe?g$', + 'thumbnail': r're:^https?://.*\.jpe?g$', 'timestamp': 1434970506, 'upload_date': '20150622', 'uploader': 'Public Sénat', @@ -1163,7 +1271,7 @@ class GenericIE(InfoExtractor): 'id': '2855', 'ext': 'mp4', 'title': 'Donât Understand Bitcoin? This Man Will Mumble An Explanation At You', - 'thumbnail': 're:^https?://.*\.jpe?g$', + 'thumbnail': r're:^https?://.*\.jpe?g$', 'uploader': 'ClickHole', 'uploader_id': 'clickhole', } @@ -1389,6 +1497,77 @@ class GenericIE(InfoExtractor): }, 'playlist_mincount': 3, }, + { + # Videa embeds + 'url': 'http://forum.dvdtalk.com/movie-talk/623756-deleted-magic-star-wars-ot-deleted-alt-scenes-docu-style.html', + 'info_dict': { + 'id': '623756-deleted-magic-star-wars-ot-deleted-alt-scenes-docu-style', + 'title': 'Deleted Magic - Star Wars: OT Deleted / Alt. Scenes Docu. Style - DVD Talk Forum', + }, + 'playlist_mincount': 2, + }, + { + # 20 minuten embed + 'url': 'http://www.20min.ch/schweiz/news/story/So-kommen-Sie-bei-Eis-und-Schnee-sicher-an-27032552', + 'info_dict': { + 'id': '523629', + 'ext': 'mp4', + 'title': 'So kommen Sie bei Eis und Schnee sicher an', + 'description': 'md5:117c212f64b25e3d95747e5276863f7d', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [TwentyMinutenIE.ie_key()], + }, + { + # VideoPress embed + 'url': 'https://en.support.wordpress.com/videopress/', + 'info_dict': { + 'id': 'OcobLTqC', + 'ext': 'm4v', + 'title': 'IMG_5786', + 'timestamp': 1435711927, + 'upload_date': '20150701', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [VideoPressIE.ie_key()], + }, + { + # Rutube embed + 'url': 'http://magazzino.friday.ru/videos/vipuski/kazan-2', + 'info_dict': { + 'id': '9b3d5bee0a8740bf70dfd29d3ea43541', + 'ext': 'flv', + 'title': 'Ðагаззино: ÐÐ°Ð·Ð°Ð½Ñ 2', + 'description': 'md5:99bccdfac2269f0e8fdbc4bbc9db184a', + 'uploader': 'Ðагаззино', + 'upload_date': '20170228', + 'uploader_id': '996642', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [RutubeIE.ie_key()], + }, + { + # ThePlatform embedded with whitespaces in URLs + 'url': 'http://www.golfchannel.com/topics/shows/golftalkcentral.htm', + 'only_matching': True, + }, + { + # Senate ISVP iframe https + 'url': 'https://www.hsgac.senate.gov/hearings/canadas-fast-track-refugee-plan-unanswered-questions-and-implications-for-us-national-security', + 'md5': 'fb8c70b0b515e5037981a2492099aab8', + 'info_dict': { + 'id': 'govtaff020316', + 'ext': 'mp4', + 'title': 'Integrated Senate Video Player', + }, + 'add_ie': [SenateISVPIE.ie_key()], + }, # { # # TODO: find another test # # http://schema.org/VideoObject @@ -1688,14 +1867,6 @@ class GenericIE(InfoExtractor): video_description = self._og_search_description(webpage, default=None) video_thumbnail = self._og_search_thumbnail(webpage, default=None) - # Helper method - def _playlist_from_matches(matches, getter=None, ie=None): - urlrs = orderedSet( - self.url_result(self._proto_relative_url(getter(m) if getter else m), ie) - for m in matches) - return self.playlist_result( - urlrs, playlist_id=video_id, playlist_title=video_title) - # Look for Brightcove Legacy Studio embeds bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage) if bc_urls: @@ -1716,28 +1887,28 @@ class GenericIE(InfoExtractor): # Look for Brightcove New Studio embeds bc_urls = BrightcoveNewIE._extract_urls(webpage) if bc_urls: - return _playlist_from_matches(bc_urls, ie='BrightcoveNew') + return self.playlist_from_matches(bc_urls, video_id, video_title, ie='BrightcoveNew') # Look for ThePlatform embeds tp_urls = ThePlatformIE._extract_urls(webpage) if tp_urls: - return _playlist_from_matches(tp_urls, ie='ThePlatform') + return self.playlist_from_matches(tp_urls, video_id, video_title, ie='ThePlatform') # Look for Vessel embeds vessel_urls = VesselIE._extract_urls(webpage) if vessel_urls: - return _playlist_from_matches(vessel_urls, ie=VesselIE.ie_key()) + return self.playlist_from_matches(vessel_urls, video_id, video_title, ie=VesselIE.ie_key()) # Look for embedded rtl.nl player matches = re.findall( r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"', webpage) if matches: - return _playlist_from_matches(matches, ie='RtlNl') + return self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl') vimeo_urls = VimeoIE._extract_urls(url, webpage) if vimeo_urls: - return _playlist_from_matches(vimeo_urls, ie=VimeoIE.ie_key()) + return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key()) vid_me_embed_url = self._search_regex( r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]', @@ -1759,25 +1930,25 @@ class GenericIE(InfoExtractor): (?:embed|v|p)/.+?) \1''', webpage) if matches: - return _playlist_from_matches( - matches, lambda m: unescapeHTML(m[1])) + return self.playlist_from_matches( + matches, video_id, video_title, lambda m: unescapeHTML(m[1])) # Look for lazyYT YouTube embed matches = re.findall( r'class="lazyYT" data-youtube-id="([^"]+)"', webpage) if matches: - return _playlist_from_matches(matches, lambda m: unescapeHTML(m)) + return self.playlist_from_matches(matches, video_id, video_title, lambda m: unescapeHTML(m)) # Look for Wordpress "YouTube Video Importer" plugin matches = re.findall(r'''(?x)<div[^>]+ class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage) if matches: - return _playlist_from_matches(matches, lambda m: m[-1]) + return self.playlist_from_matches(matches, video_id, video_title, lambda m: m[-1]) matches = DailymotionIE._extract_urls(webpage) if matches: - return _playlist_from_matches(matches) + return self.playlist_from_matches(matches, video_id, video_title) # Look for embedded Dailymotion playlist player (#3822) m = re.search( @@ -1786,8 +1957,8 @@ class GenericIE(InfoExtractor): playlists = re.findall( r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url'))) if playlists: - return _playlist_from_matches( - playlists, lambda p: '//dailymotion.com/playlist/%s' % p) + return self.playlist_from_matches( + playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p) # Look for embedded Wistia player match = re.search( @@ -1880,15 +2051,23 @@ class GenericIE(InfoExtractor): re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage)) if mobj is not None: - return OoyalaIE._build_url_result(smuggle_url(mobj.group('ec'), {'domain': url})) + embed_token = self._search_regex( + r'embedToken[\'"]?\s*:\s*[\'"]([^\'"]+)', + webpage, 'ooyala embed token', default=None) + return OoyalaIE._build_url_result(smuggle_url( + mobj.group('ec'), { + 'domain': url, + 'embed_token': embed_token, + })) # Look for multiple Ooyala embeds on SBN network websites mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage) if mobj is not None: embeds = self._parse_json(mobj.group(1), video_id, fatal=False) if embeds: - return _playlist_from_matches( - embeds, getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala') + return self.playlist_from_matches( + embeds, video_id, video_title, + getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala') # Look for Aparat videos mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage) @@ -1950,13 +2129,13 @@ class GenericIE(InfoExtractor): # Look for funnyordie embed matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage) if matches: - return _playlist_from_matches( - matches, getter=unescapeHTML, ie='FunnyOrDie') + return self.playlist_from_matches( + matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie') # Look for BBC iPlayer embed matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage) if matches: - return _playlist_from_matches(matches, ie='BBCCoUk') + return self.playlist_from_matches(matches, video_id, video_title, ie='BBCCoUk') # Look for embedded RUTV player rutv_url = RUTVIE._extract_url(webpage) @@ -1971,32 +2150,32 @@ class GenericIE(InfoExtractor): # Look for embedded SportBox player sportbox_urls = SportBoxEmbedIE._extract_urls(webpage) if sportbox_urls: - return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed') + return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie='SportBoxEmbed') # Look for embedded XHamster player xhamster_urls = XHamsterEmbedIE._extract_urls(webpage) if xhamster_urls: - return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed') + return self.playlist_from_matches(xhamster_urls, video_id, video_title, ie='XHamsterEmbed') # Look for embedded TNAFlixNetwork player tnaflix_urls = TNAFlixNetworkEmbedIE._extract_urls(webpage) if tnaflix_urls: - return _playlist_from_matches(tnaflix_urls, ie=TNAFlixNetworkEmbedIE.ie_key()) + return self.playlist_from_matches(tnaflix_urls, video_id, video_title, ie=TNAFlixNetworkEmbedIE.ie_key()) # Look for embedded PornHub player pornhub_urls = PornHubIE._extract_urls(webpage) if pornhub_urls: - return _playlist_from_matches(pornhub_urls, ie=PornHubIE.ie_key()) + return self.playlist_from_matches(pornhub_urls, video_id, video_title, ie=PornHubIE.ie_key()) # Look for embedded DrTuber player drtuber_urls = DrTuberIE._extract_urls(webpage) if drtuber_urls: - return _playlist_from_matches(drtuber_urls, ie=DrTuberIE.ie_key()) + return self.playlist_from_matches(drtuber_urls, video_id, video_title, ie=DrTuberIE.ie_key()) # Look for embedded RedTube player redtube_urls = RedTubeIE._extract_urls(webpage) if redtube_urls: - return _playlist_from_matches(redtube_urls, ie=RedTubeIE.ie_key()) + return self.playlist_from_matches(redtube_urls, video_id, video_title, ie=RedTubeIE.ie_key()) # Look for embedded Tvigle player mobj = re.search( @@ -2011,10 +2190,9 @@ class GenericIE(InfoExtractor): return self.url_result(mobj.group('url'), 'TED') # Look for embedded Ustream videos - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Ustream') + ustream_url = UstreamIE._extract_url(webpage) + if ustream_url: + return self.url_result(ustream_url, UstreamIE.ie_key()) # Look for embedded arte.tv player mobj = re.search( @@ -2043,7 +2221,12 @@ class GenericIE(InfoExtractor): # Look for embedded soundcloud player soundcloud_urls = SoundcloudIE._extract_urls(webpage) if soundcloud_urls: - return _playlist_from_matches(soundcloud_urls, getter=unescapeHTML, ie=SoundcloudIE.ie_key()) + return self.playlist_from_matches(soundcloud_urls, video_id, video_title, getter=unescapeHTML, ie=SoundcloudIE.ie_key()) + + # Look for tunein player + tunein_urls = TuneInBaseIE._extract_urls(webpage) + if tunein_urls: + return self.playlist_from_matches(tunein_urls, video_id, video_title) # Look for embedded mtvservices player mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage) @@ -2211,6 +2394,11 @@ class GenericIE(InfoExtractor): if arkena_url: return self.url_result(arkena_url, ArkenaIE.ie_key()) + # Look for Piksel embeds + piksel_url = PikselIE._extract_url(webpage) + if piksel_url: + return self.url_result(piksel_url, PikselIE.ie_key()) + # Look for Limelight embeds mobj = re.search(r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', webpage) if mobj: @@ -2219,8 +2407,9 @@ class GenericIE(InfoExtractor): 'Channel': 'channel', 'ChannelList': 'channel_list', } - return self.url_result('limelight:%s:%s' % ( - lm[mobj.group(1)], mobj.group(2)), 'Limelight%s' % mobj.group(1), mobj.group(2)) + return self.url_result(smuggle_url('limelight:%s:%s' % ( + lm[mobj.group(1)], mobj.group(2)), {'source_url': url}), + 'Limelight%s' % mobj.group(1), mobj.group(2)) mobj = re.search( r'''(?sx) @@ -2230,7 +2419,9 @@ class GenericIE(InfoExtractor): value=(["\'])(?:(?!\3).)*mediaId=(?P<id>[a-z0-9]{32}) ''', webpage) if mobj: - return self.url_result('limelight:media:%s' % mobj.group('id')) + return self.url_result(smuggle_url( + 'limelight:media:%s' % mobj.group('id'), + {'source_url': url}), 'LimelightMedia', mobj.group('id')) # Look for AdobeTVVideo embeds mobj = re.search( @@ -2318,7 +2509,36 @@ class GenericIE(InfoExtractor): # Look for DBTV embeds dbtv_urls = DBTVIE._extract_urls(webpage) if dbtv_urls: - return _playlist_from_matches(dbtv_urls, ie=DBTVIE.ie_key()) + return self.playlist_from_matches(dbtv_urls, video_id, video_title, ie=DBTVIE.ie_key()) + + # Look for Videa embeds + videa_urls = VideaIE._extract_urls(webpage) + if videa_urls: + return self.playlist_from_matches(videa_urls, video_id, video_title, ie=VideaIE.ie_key()) + + # Look for 20 minuten embeds + twentymin_urls = TwentyMinutenIE._extract_urls(webpage) + if twentymin_urls: + return self.playlist_from_matches( + twentymin_urls, video_id, video_title, ie=TwentyMinutenIE.ie_key()) + + # Look for Openload embeds + openload_urls = OpenloadIE._extract_urls(webpage) + if openload_urls: + return self.playlist_from_matches( + openload_urls, video_id, video_title, ie=OpenloadIE.ie_key()) + + # Look for VideoPress embeds + videopress_urls = VideoPressIE._extract_urls(webpage) + if videopress_urls: + return self.playlist_from_matches( + videopress_urls, video_id, video_title, ie=VideoPressIE.ie_key()) + + # Look for Rutube embeds + rutube_urls = RutubeIE._extract_urls(webpage) + if rutube_urls: + return self.playlist_from_matches( + rutube_urls, ie=RutubeIE.ie_key()) # Looking for http://schema.org/VideoObject json_ld = self._search_json_ld( @@ -2344,9 +2564,20 @@ class GenericIE(InfoExtractor): self._sort_formats(entry['formats']) return self.playlist_result(entries) + jwplayer_data = self._find_jwplayer_data( + webpage, video_id, transform_source=js_to_json) + if jwplayer_data: + info = self._parse_jwplayer_data( + jwplayer_data, video_id, require_title=False) + if not info.get('title'): + info['title'] = video_title + return info + def check_video(vurl): if YoutubeIE.suitable(vurl): return True + if RtmpIE.suitable(vurl): + return True vpath = compat_urlparse.urlparse(vurl).path vext = determine_ext(vpath) return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js') @@ -2415,11 +2646,14 @@ class GenericIE(InfoExtractor): found = re.search(REDIRECT_REGEX, refresh_header) if found: new_url = compat_urlparse.urljoin(url, unescapeHTML(found.group(1))) - self.report_following_redirect(new_url) - return { - '_type': 'url', - 'url': new_url, - } + if new_url != url: + self.report_following_redirect(new_url) + return { + '_type': 'url', + 'url': new_url, + } + else: + found = None if not found: # twitter:player is a https URL to iframe player that may or may not @@ -2454,6 +2688,15 @@ class GenericIE(InfoExtractor): 'age_limit': age_limit, } + if RtmpIE.suitable(video_url): + entry_info_dict.update({ + '_type': 'url_transparent', + 'ie_key': RtmpIE.ie_key(), + 'url': video_url, + }) + entries.append(entry_info_dict) + continue + ext = determine_ext(video_url) if ext == 'smil': entry_info_dict['formats'] = self._extract_smil_formats(video_url, video_id)