from .common import InfoExtractor
from .youtube import YoutubeIE
-from ..utils import (
+from ..compat import (
compat_urllib_parse,
compat_urlparse,
compat_xml_parse_error,
-
+)
+from ..utils import (
determine_ext,
ExtractorError,
float_or_none,
HEADRequest,
+ is_html,
orderedSet,
parse_xml,
smuggle_url,
unescapeHTML,
unified_strdate,
unsmuggle_url,
+ UnsupportedError,
url_basename,
)
from .brightcove import BrightcoveIE
'uploader': 'Championat',
},
},
+ {
+ # https://github.com/rg3/youtube-dl/issues/3541
+ 'add_ie': ['Brightcove'],
+ 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
+ 'info_dict': {
+ 'id': '3866516442001',
+ 'ext': 'mp4',
+ 'title': 'Leer mij vrouwen kennen: Aflevering 1',
+ 'description': 'Leer mij vrouwen kennen: Aflevering 1',
+ 'uploader': 'SBS Broadcasting',
+ },
+ 'skip': 'Restricted to Netherlands',
+ 'params': {
+ 'skip_download': True, # m3u8 download
+ },
+ },
# Direct link to a video
{
'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
# ooyala video
{
'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
- 'md5': '5644c6ca5d5782c1d0d350dad9bd840c',
+ 'md5': '166dd577b433b4d4ebfee10b0824d8ff',
'info_dict': {
'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
'ext': 'mp4',
'title': '2cc213299525360.mov', # that's what we get
},
+ 'add_ie': ['Ooyala'],
+ },
+ # multiple ooyala embeds on SBN network websites
+ {
+ 'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
+ 'info_dict': {
+ 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
+ 'title': '25 lies you will tell yourself on National Signing Day - SBNation.com',
+ },
+ 'playlist_mincount': 3,
+ 'params': {
+ 'skip_download': True,
+ },
+ 'add_ie': ['Ooyala'],
},
# google redirect
{
'ext': 'mp4',
'upload_date': '20130224',
'uploader_id': 'TheVerge',
- 'description': 'Chris Ziegler takes a look at the Alcatel OneTouch Fire and the ZTE Open; two of the first Firefox OS handsets to be officially announced.',
+ 'description': 're:^Chris Ziegler takes a look at the\.*',
'uploader': 'The Verge',
'title': 'First Firefox OS phones side-by-side',
},
'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
},
},
+ # BBC iPlayer embeds
+ {
+ 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER',
+ 'info_dict': {
+ 'title': 'BBC - Blogs - Adam Curtis - BUGGER',
+ },
+ 'playlist_mincount': 18,
+ },
# RUTV embed
{
'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
'info_dict': {
'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
'title': 'Zero Punctuation',
- 'description': 're:'
+ 'description': 're:.*groundbreaking video review series.*'
},
'playlist_mincount': 11,
},
'title': 'Chet Chat 171 - Oct 29, 2014',
'upload_date': '20141029',
}
+ },
+ # Livestream embed
+ {
+ 'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast',
+ 'info_dict': {
+ 'id': '67864563',
+ 'ext': 'flv',
+ 'upload_date': '20141112',
+ 'title': 'Rosetta #CometLanding webcast HL 10',
+ }
+ },
+ # LazyYT
+ {
+ 'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
+ 'info_dict': {
+ 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
+ },
+ 'playlist_mincount': 2,
+ },
+ # Direct link with incorrect MIME type
+ {
+ 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
+ 'md5': '4ccbebe5f36706d85221f204d7eb5913',
+ 'info_dict': {
+ 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
+ 'id': '5_Lennart_Poettering_-_Systemd',
+ 'ext': 'webm',
+ 'title': '5_Lennart_Poettering_-_Systemd',
+ 'upload_date': '20141120',
+ },
+ 'expected_warnings': [
+ 'URL could be a direct video link, returning it as such.'
+ ]
+ },
+ # Cinchcast embed
+ {
+ 'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
+ 'info_dict': {
+ 'id': '7141703',
+ 'ext': 'mp3',
+ 'upload_date': '20141126',
+ 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
+ }
+ },
+ # Cinerama player
+ {
+ 'url': 'http://www.abc.net.au/7.30/content/2015/s4164797.htm',
+ 'info_dict': {
+ 'id': '730m_DandD_1901_512k',
+ 'ext': 'mp4',
+ 'uploader': 'www.abc.net.au',
+ 'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015',
+ }
+ },
+ # embedded viddler video
+ {
+ 'url': 'http://deadspin.com/i-cant-stop-watching-john-wall-chop-the-nuggets-with-th-1681801597',
+ 'info_dict': {
+ 'id': '4d03aad9',
+ 'ext': 'mp4',
+ 'uploader': 'deadspin',
+ 'title': 'WALL-TO-GORTAT',
+ 'timestamp': 1422285291,
+ 'upload_date': '20150126',
+ },
+ 'add_ie': ['Viddler'],
}
]
if default_search in ('error', 'fixup_error'):
raise ExtractorError(
- ('%r is not a valid URL. '
- 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube'
- ) % (url, url), expected=True)
+ '%r is not a valid URL. '
+ 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube'
+ % (url, url), expected=True)
else:
if ':' not in default_search:
default_search += ':'
return {
'id': video_id,
'title': os.path.splitext(url_basename(url))[0],
+ 'direct': True,
'formats': [{
'format_id': m.group('format_id'),
'url': url,
if not self._downloader.params.get('test', False) and not is_intentional:
self._downloader.report_warning('Falling back on generic information extractor.')
- if full_response:
- webpage = self._webpage_read_content(full_response, url, video_id)
- else:
- webpage = self._download_webpage(url, video_id)
+ if not full_response:
+ full_response = self._request_webpage(url, video_id)
+
+ # Maybe it's a direct link to a video?
+ # Be careful not to download the whole thing!
+ first_bytes = full_response.read(512)
+ if not is_html(first_bytes):
+ self._downloader.report_warning(
+ 'URL could be a direct video link, returning it as such.')
+ upload_date = unified_strdate(
+ head_response.headers.get('Last-Modified'))
+ return {
+ 'id': video_id,
+ 'title': os.path.splitext(url_basename(url))[0],
+ 'direct': True,
+ 'url': url,
+ 'upload_date': upload_date,
+ }
+
+ webpage = self._webpage_read_content(
+ full_response, url, video_id, prefix=first_bytes)
+
self.report_extraction(video_id)
# Is it an RSS feed?
r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
# Helper method
- def _playlist_from_matches(matches, getter, ie=None):
+ def _playlist_from_matches(matches, getter=None, ie=None):
urlrs = orderedSet(
- self.url_result(self._proto_relative_url(getter(m)), ie)
+ self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
for m in matches)
return self.playlist_result(
urlrs, playlist_id=video_id, playlist_title=video_title)
return _playlist_from_matches(
matches, lambda m: unescapeHTML(m[1]))
+ # Look for lazyYT YouTube embed
+ matches = re.findall(
+ r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
+ if matches:
+ return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
+
# Look for embedded Dailymotion player
matches = re.findall(
r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
'title': video_title,
'id': video_id,
}
-
+
match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
if match:
return {
# Look for embedded blip.tv player
mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage)
if mobj:
- return self.url_result('http://blip.tv/a/a-'+mobj.group(1), 'BlipTV')
+ return self.url_result('http://blip.tv/a/a-' + mobj.group(1), 'BlipTV')
mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage)
if mobj:
return self.url_result(mobj.group(1), 'BlipTV')
if mobj is not None:
return self.url_result(mobj.group('url'))
+ # Look for embedded Viddler player
+ mobj = re.search(
+ r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1',
+ webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'))
+
# Look for Ooyala videos
- mobj = (re.search(r'player.ooyala.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
- re.search(r'OO.Player.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage))
+ mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
+ re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
+ re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage))
if mobj is not None:
return OoyalaIE._build_url_result(mobj.group('ec'))
+ # Look for multiple Ooyala embeds on SBN network websites
+ mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage)
+ if mobj is not None:
+ embeds = self._parse_json(mobj.group(1), video_id, fatal=False)
+ if embeds:
+ return _playlist_from_matches(
+ embeds, getter=lambda v: OoyalaIE._url_for_embed_code(v['provider_video_id']), ie='Ooyala')
+
# Look for Aparat videos
mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
if mobj is not None:
return _playlist_from_matches(
matches, getter=unescapeHTML, ie='FunnyOrDie')
+ # Look for BBC iPlayer embed
+ matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
+ if matches:
+ return _playlist_from_matches(matches, ie='BBCCoUk')
+
# Look for embedded RUTV player
rutv_url = RUTVIE._extract_url(webpage)
if rutv_url:
# Look for embedded TED player
mobj = re.search(
- r'<iframe[^>]+?src=(["\'])(?P<url>http://embed\.ted\.com/.+?)\1', webpage)
+ r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
if mobj is not None:
return self.url_result(mobj.group('url'), 'TED')
if mobj is not None:
return self.url_result(mobj.group('url'), 'SBS')
+ # Look for embedded Cinchcast player
+ mobj = re.search(
+ r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
+ webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'), 'Cinchcast')
+
mobj = re.search(
- r'<iframe[^>]+?src=(["\'])(?P<url>https?://m\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
+ r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
webpage)
if mobj is not None:
return self.url_result(mobj.group('url'), 'MLB')
if mobj is not None:
return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
+ mobj = re.search(
+ r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"',
+ webpage)
+ if mobj is not None:
+ return self.url_result(mobj.group('url'), 'Livestream')
+
def check_video(vurl):
vpath = compat_urlparse.urlparse(vurl).path
vext = determine_ext(vpath)
found = filter_video(re.findall(r'''(?xs)
flowplayer\("[^"]+",\s*
\{[^}]+?\}\s*,
- \s*{[^}]+? ["']?clip["']?\s*:\s*\{\s*
+ \s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s*
["']?url["']?\s*:\s*["']([^"']+)["']
''', webpage))
+ if not found:
+ # Cinerama player
+ found = re.findall(
+ r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage)
if not found:
# Try to find twitter cards info
found = filter_video(re.findall(
found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
if not found:
# HTML5 video
- found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src="([^"]+)"', webpage)
+ found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
if not found:
found = re.search(
r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
'url': new_url,
}
if not found:
- raise ExtractorError('Unsupported URL: %s' % url)
+ raise UnsupportedError(url)
entries = []
for video_url in found:
'_type': 'playlist',
'entries': entries,
}
-