X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/9f2b33881274af98a9145c533a1d295fad71521a..d1d8237037fb87a57bd1d073c3b9e642d4a75016:/youtube_dl/extractor/generic.py diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index c6bf8d2..4aa2406 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -47,10 +47,11 @@ from .senateisvp import SenateISVPIE from .svt import SVTIE from .pornhub import PornHubIE from .xhamster import XHamsterEmbedIE +from .tnaflix import TNAFlixNetworkEmbedIE from .vimeo import VimeoIE from .dailymotion import DailymotionCloudIE from .onionstudios import OnionStudiosIE -from .snagfilms import SnagFilmsEmbedIE +from .viewlift import ViewLiftEmbedIE from .screenwavemedia import ScreenwaveMediaIE from .mtv import MTVServicesEmbeddedIE from .pladform import PladformIE @@ -58,6 +59,11 @@ from .videomore import VideomoreIE from .googledrive import GoogleDriveIE from .jwplatform import JWPlatformIE from .digiteka import DigitekaIE +from .instagram import InstagramIE +from .liveleak import LiveLeakIE +from .threeqsdn import ThreeQSDNIE +from .theplatform import ThePlatformIE +from .vessel import VesselIE class GenericIE(InfoExtractor): @@ -102,7 +108,8 @@ class GenericIE(InfoExtractor): 'skip_download': True, # infinite live stream }, 'expected_warnings': [ - r'501.*Not Implemented' + r'501.*Not Implemented', + r'400.*Bad Request', ], }, # Direct link with incorrect MIME type @@ -233,11 +240,41 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'car-20120827-manifest', 'formats': 'mincount:9', + 'upload_date': '20130904', }, 'params': { 'format': 'bestvideo', }, }, + # m3u8 served with Content-Type: audio/x-mpegURL; charset=utf-8 + { + 'url': 'http://once.unicornmedia.com/now/master/playlist/bb0b18ba-64f5-4b1b-a29f-0ac252f06b68/77a785f3-5188-4806-b788-0893a61634ed/93677179-2d99-4ef4-9e17-fe70d49abfbf/content.m3u8', + 'info_dict': { + 'id': 'content', + 'ext': 'mp4', + 'title': 'content', + 'formats': 'mincount:8', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + } + }, + # m3u8 served with Content-Type: text/plain + { + 'url': 'http://www.nacentapps.com/m3u8/index.m3u8', + 'info_dict': { + 'id': 'index', + 'ext': 'mp4', + 'title': 'index', + 'upload_date': '20140720', + 'formats': 'mincount:11', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + } + }, # google redirect { 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', @@ -375,19 +412,6 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, - # multiple ooyala embeds on SBN network websites - { - 'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok', - 'info_dict': { - 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok', - 'title': '25 lies you will tell yourself on National Signing Day - SBNation.com', - }, - 'playlist_mincount': 3, - 'params': { - 'skip_download': True, - }, - 'add_ie': ['Ooyala'], - }, # embed.ly video { 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/', @@ -576,7 +600,11 @@ class GenericIE(InfoExtractor): 'id': 'k2mm4bCdJ6CQ2i7c8o2', 'ext': 'mp4', 'title': 'Le Zap de Spi0n n°216 - Zapping du Web', + 'description': 'md5:faf028e48a461b8b7fad38f1e104b119', 'uploader': 'Spi0n', + 'uploader_id': 'xgditw', + 'upload_date': '20140425', + 'timestamp': 1398441542, }, 'add_ie': ['Dailymotion'], }, @@ -599,13 +627,13 @@ class GenericIE(InfoExtractor): }, # MTVSercices embed { - 'url': 'http://www.gametrailers.com/news-post/76093/north-america-europe-is-getting-that-mario-kart-8-mercedes-dlc-too', - 'md5': '35727f82f58c76d996fc188f9755b0d5', + 'url': 'http://www.vulture.com/2016/06/new-key-peele-sketches-released.html', + 'md5': 'ca1aef97695ef2c1d6973256a57e5252', 'info_dict': { - 'id': '0306a69b-8adf-4fb5-aace-75f8e8cbfca9', + 'id': '769f7ec0-0692-4d62-9b45-0d88074bffc1', 'ext': 'mp4', - 'title': 'Review', - 'description': 'Mario\'s life in the fast lane has never looked so good.', + 'title': 'Key and Peele|October 10, 2012|2|203|Liam Neesons - Uncensored', + 'description': 'Two valets share their love for movie star Liam Neesons.', }, }, # YouTube embed via @@ -691,15 +719,18 @@ class GenericIE(InfoExtractor): }, # Wistia embed { - 'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson', - 'md5': '8788b683c777a5cf25621eaf286d0c23', + 'url': 'http://study.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson', + 'md5': '1953f3a698ab51cfc948ed3992a0b7ff', 'info_dict': { - 'id': '1cfaf6b7ea', + 'id': '6e2wtrbdaf', 'ext': 'mov', - 'title': 'md5:51364a8d3d009997ba99656004b5e20d', - 'duration': 643.0, - 'filesize': 182808282, - 'uploader': 'education-portal.com', + 'title': 'paywall_north-american-exploration-failed-colonies-of-spain-france-england', + 'description': 'a Paywall Videos video from Remilon', + 'duration': 644.072, + 'uploader': 'study.com', + 'timestamp': 1459678540, + 'upload_date': '20160403', + 'filesize': 24687186, }, }, { @@ -708,10 +739,29 @@ class GenericIE(InfoExtractor): 'info_dict': { 'id': 'uxjb0lwrcz', 'ext': 'mp4', - 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks', + 'title': 'Conversation about Hexagonal Rails Part 1', + 'description': 'a Martin Fowler video from ThoughtWorks', 'duration': 1715.0, 'uploader': 'thoughtworks.wistia.com', + 'timestamp': 1401832161, + 'upload_date': '20140603', + }, + }, + # Wistia standard embed (async) + { + 'url': 'https://www.getdrip.com/university/brennan-dunn-drip-workshop/', + 'info_dict': { + 'id': '807fafadvk', + 'ext': 'mp4', + 'title': 'Drip Brennan Dunn Workshop', + 'description': 'a JV Webinars video from getdrip-1', + 'duration': 4986.95, + 'timestamp': 1463607249, + 'upload_date': '20160518', }, + 'params': { + 'skip_download': True, + } }, # Soundcloud embed { @@ -735,6 +785,19 @@ class GenericIE(InfoExtractor): 'title': 'Rosetta #CometLanding webcast HL 10', } }, + # Another Livestream embed, without 'new.' in URL + { + 'url': 'https://www.freespeech.org/', + 'info_dict': { + 'id': '123537347', + 'ext': 'mp4', + 'title': 're:^FSTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + }, + 'params': { + # Live stream + 'skip_download': True, + }, + }, # LazyYT { 'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986', @@ -819,18 +882,6 @@ class GenericIE(InfoExtractor): 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !', } }, - # Kaltura embed - { - 'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15', - 'info_dict': { - 'id': '1_eergr3h1', - 'ext': 'mp4', - 'upload_date': '20150226', - 'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com', - 'timestamp': int, - 'title': 'John Carlson Postgame 2/25/15', - }, - }, # Kaltura embed (different embed code) { 'url': 'http://www.premierchristianradio.com/Shows/Saturday/Unbelievable/Conference-Videos/Os-Guinness-Is-It-Fools-Talk-Unbelievable-Conference-2014', @@ -856,9 +907,23 @@ class GenericIE(InfoExtractor): 'uploader_id': 'echojecka', }, }, + # Kaltura embed with single quotes + { + 'url': 'http://fod.infobase.com/p_ViewPlaylist.aspx?AssignmentID=NUN8ZY', + 'info_dict': { + 'id': '0_izeg5utt', + 'ext': 'mp4', + 'title': '35871', + 'timestamp': 1355743100, + 'upload_date': '20121217', + 'uploader_id': 'batchUser', + }, + 'add_ie': ['Kaltura'], + }, # Eagle.Platform embed (generic URL) { 'url': 'http://lenta.ru/news/2015/03/06/navalny/', + # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used 'info_dict': { 'id': '227304', 'ext': 'mp4', @@ -873,6 +938,7 @@ class GenericIE(InfoExtractor): # ClipYou (Eagle.Platform) embed (custom URL) { 'url': 'http://muz-tv.ru/play/7129/', + # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used 'info_dict': { 'id': '12820', 'ext': 'mp4', @@ -961,18 +1027,36 @@ class GenericIE(InfoExtractor): 'ext': 'flv', 'title': "PFT Live: New leader in the 'new-look' defense", 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e', + 'uploader': 'NBCU-SPORTS', + 'upload_date': '20140107', + 'timestamp': 1389118457, + }, + }, + # NBC News embed + { + 'url': 'http://www.vulture.com/2016/06/letterman-couldnt-care-less-about-late-night.html', + 'md5': '1aa589c675898ae6d37a17913cf68d66', + 'info_dict': { + 'id': '701714499682', + 'ext': 'mp4', + 'title': 'PREVIEW: On Assignment: David Letterman', + 'description': 'A preview of Tom Brokaw\'s interview with David Letterman as part of the On Assignment series powered by Dateline. Airs Sunday June 12 at 7/6c.', }, }, # UDN embed { - 'url': 'http://www.udn.com/news/story/7314/822787', + 'url': 'https://video.udn.com/news/300346', 'md5': 'fd2060e988c326991037b9aff9df21a6', 'info_dict': { 'id': '300346', 'ext': 'mp4', 'title': '中一中男師變性 全校師生力挺', 'thumbnail': 're:^https?://.*\.jpg$', - } + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, # Ooyala embed { @@ -989,20 +1073,6 @@ class GenericIE(InfoExtractor): 'skip_download': True, } }, - # Contains a SMIL manifest - { - 'url': 'http://www.telewebion.com/fa/1263668/%D9%82%D8%B1%D8%B9%D9%87%E2%80%8C%DA%A9%D8%B4%DB%8C-%D9%84%DB%8C%DA%AF-%D9%82%D9%87%D8%B1%D9%85%D8%A7%D9%86%D8%A7%D9%86-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7/%2B-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84.html', - 'info_dict': { - 'id': 'file', - 'ext': 'flv', - 'title': '+ Football: Lottery Champions League Europe', - 'uploader': 'www.telewebion.com', - }, - 'params': { - # rtmpe downloads - 'skip_download': True, - } - }, # Brightcove URL in single quotes { 'url': 'http://www.sportsnet.ca/baseball/mlb/sn-presents-russell-martin-world-citizen/', @@ -1013,6 +1083,9 @@ class GenericIE(InfoExtractor): 'title': 'SN Presents: Russell Martin, World Citizen', 'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.', 'uploader': 'Rogers Sportsnet', + 'uploader_id': '1704050871', + 'upload_date': '20150525', + 'timestamp': 1432570283, }, }, # Dailymotion Cloud video @@ -1093,7 +1166,60 @@ class GenericIE(InfoExtractor): # m3u8 downloads 'skip_download': True, } - } + }, + # Brightcove embed, with no valid 'renditions' but valid 'IOSRenditions' + # This video can't be played in browsers if Flash disabled and UA set to iPhone, which is actually a false alarm + { + 'url': 'https://dl.dropboxusercontent.com/u/29092637/interview.html', + 'info_dict': { + 'id': '4785848093001', + 'ext': 'mp4', + 'title': 'The Cardinal Pell Interview', + 'description': 'Sky News Contributor Andrew Bolt interviews George Pell in Rome, following the Cardinal\'s evidence before the Royal Commission into Child Abuse. ', + 'uploader': 'GlobeCast Australia - GlobeStream', + 'uploader_id': '2733773828001', + 'upload_date': '20160304', + 'timestamp': 1457083087, + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, + }, + # Another form of arte.tv embed + { + 'url': 'http://www.tv-replay.fr/redirection/09-04-16/arte-reportage-arte-11508975.html', + 'md5': '850bfe45417ddf221288c88a0cffe2e2', + 'info_dict': { + 'id': '030273-562_PLUS7-F', + 'ext': 'mp4', + 'title': 'ARTE Reportage - Nulle part, en France', + 'description': 'md5:e3a0e8868ed7303ed509b9e3af2b870d', + 'upload_date': '20160409', + }, + }, + # LiveLeak embed + { + 'url': 'http://www.wykop.pl/link/3088787/', + 'md5': 'ace83b9ed19b21f68e1b50e844fdf95d', + 'info_dict': { + 'id': '874_1459135191', + 'ext': 'mp4', + 'title': 'Man shows poor quality of new apartment building', + 'description': 'The wall is like a sand pile.', + 'uploader': 'Lake8737', + } + }, + # Duplicated embedded video URLs + { + 'url': 'http://www.hudl.com/athlete/2538180/highlights/149298443', + 'info_dict': { + 'id': '149298443_480_16c25b74_2', + 'ext': 'mp4', + 'title': 'vs. Blue Orange Spring Game', + 'uploader': 'www.hudl.com', + }, + }, ] def report_following_redirect(self, new_url): @@ -1241,28 +1367,31 @@ class GenericIE(InfoExtractor): full_response = self._request_webpage(request, video_id) head_response = full_response + info_dict = { + 'id': video_id, + 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), + 'upload_date': unified_strdate(head_response.headers.get('Last-Modified')) + } + # Check for direct link to a video - content_type = head_response.headers.get('Content-Type', '') - m = re.match(r'^(?Paudio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P.+)$', content_type) + content_type = head_response.headers.get('Content-Type', '').lower() + m = re.match(r'^(?Paudio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P[^;\s]+)', content_type) if m: - upload_date = unified_strdate( - head_response.headers.get('Last-Modified')) - formats = [] - if m.group('format_id').endswith('mpegurl'): + format_id = m.group('format_id') + if format_id.endswith('mpegurl'): formats = self._extract_m3u8_formats(url, video_id, 'mp4') + elif format_id == 'f4m': + formats = self._extract_f4m_formats(url, video_id) else: formats = [{ 'format_id': m.group('format_id'), 'url': url, 'vcodec': 'none' if m.group('type') == 'audio' else None }] - return { - 'id': video_id, - 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), - 'direct': True, - 'formats': formats, - 'upload_date': upload_date, - } + info_dict['direct'] = True + self._sort_formats(formats) + info_dict['formats'] = formats + return info_dict if not self._downloader.params.get('test', False) and not is_intentional: force = self._downloader.params.get('force_generic_extractor', False) @@ -1282,21 +1411,24 @@ class GenericIE(InfoExtractor): request.add_header('Accept-Encoding', '*') full_response = self._request_webpage(request, video_id) + first_bytes = full_response.read(512) + + # Is it an M3U playlist? + if first_bytes.startswith(b'#EXTM3U'): + info_dict['formats'] = self._extract_m3u8_formats(url, video_id, 'mp4') + self._sort_formats(info_dict['formats']) + return info_dict + # Maybe it's a direct link to a video? # Be careful not to download the whole thing! - first_bytes = full_response.read(512) if not is_html(first_bytes): self._downloader.report_warning( 'URL could be a direct video link, returning it as such.') - upload_date = unified_strdate( - head_response.headers.get('Last-Modified')) - return { - 'id': video_id, - 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), + info_dict.update({ 'direct': True, 'url': url, - 'upload_date': upload_date, - } + }) + return info_dict webpage = self._webpage_read_content( full_response, url, video_id, prefix=first_bytes) @@ -1309,16 +1441,20 @@ class GenericIE(InfoExtractor): if doc.tag == 'rss': return self._extract_rss(url, video_id, doc) elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): - return self._parse_smil(doc, url, video_id) + smil = self._parse_smil(doc, url, video_id) + self._sort_formats(smil['formats']) + return smil elif doc.tag == '{http://xspf.org/ns/0/}playlist': return self.playlist_result(self._parse_xspf(doc, video_id), video_id) elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): - return { - 'id': video_id, - 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), - 'formats': self._parse_mpd_formats( - doc, video_id, mpd_base_url=url.rpartition('/')[0]), - } + info_dict['formats'] = self._parse_mpd_formats( + doc, video_id, mpd_base_url=url.rpartition('/')[0]) + self._sort_formats(info_dict['formats']) + return info_dict + elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag): + info_dict['formats'] = self._parse_f4m_formats(doc, url, video_id) + self._sort_formats(info_dict['formats']) + return info_dict except compat_xml_parse_error: pass @@ -1338,7 +1474,8 @@ class GenericIE(InfoExtractor): # Site Name | Video Title # Video Title - Tagline | Site Name # and so on and so forth; it's just not practical - video_title = self._html_search_regex( + video_title = self._og_search_title( + webpage, default=None) or self._html_search_regex( r'(?s)(.*?)', webpage, 'video title', default='video') @@ -1356,6 +1493,9 @@ class GenericIE(InfoExtractor): video_uploader = self._search_regex( r'^(?:https?://)?([^/]*)/.*', url, 'video uploader') + video_description = self._og_search_description(webpage, default=None) + video_thumbnail = self._og_search_thumbnail(webpage, default=None) + # Helper method def _playlist_from_matches(matches, getter=None, ie=None): urlrs = orderedSet( @@ -1386,6 +1526,16 @@ class GenericIE(InfoExtractor): if bc_urls: return _playlist_from_matches(bc_urls, ie='BrightcoveNew') + # Look for ThePlatform embeds + tp_urls = ThePlatformIE._extract_urls(webpage) + if tp_urls: + return _playlist_from_matches(tp_urls, ie='ThePlatform') + + # Look for Vessel embeds + vessel_urls = VesselIE._extract_urls(webpage) + if vessel_urls: + return _playlist_from_matches(vessel_urls, ie=VesselIE.ie_key()) + # Look for embedded rtl.nl player matches = re.findall( r']+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"', @@ -1454,21 +1604,26 @@ class GenericIE(InfoExtractor): 'url': embed_url, 'ie_key': 'Wistia', 'uploader': video_uploader, - 'title': video_title, - 'id': video_id, } match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P[^"\']+)', webpage) if match: return { '_type': 'url_transparent', - 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')), + 'url': 'wistia:%s' % match.group('id'), 'ie_key': 'Wistia', 'uploader': video_uploader, - 'title': video_title, - 'id': match.group('id') } + match = re.search( + r'''(?sx) + ]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*? + ]+class=(["']).*?\bwistia_async_(?P[a-z0-9]+)\b.*?\2 + ''', webpage) + if match: + return self.url_result(self._proto_relative_url( + 'wistia:%s' % match.group('id')), 'Wistia') + # Look for SVT player svt_url = SVTIE._extract_url(webpage) if svt_url: @@ -1633,6 +1788,11 @@ class GenericIE(InfoExtractor): if xhamster_urls: return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed') + # Look for embedded TNAFlixNetwork player + tnaflix_urls = TNAFlixNetworkEmbedIE._extract_urls(webpage) + if tnaflix_urls: + return _playlist_from_matches(tnaflix_urls, ie=TNAFlixNetworkEmbedIE.ie_key()) + # Look for embedded Tvigle player mobj = re.search( r']+?src=(["\'])(?P(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage) @@ -1653,7 +1813,7 @@ class GenericIE(InfoExtractor): # Look for embedded arte.tv player mobj = re.search( - r'