]> Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/generic.py
3d756e8481e0aba09d2290cfd5a4a8b21369aa7d
[youtubedl] / youtube_dl / extractor / generic.py
1 # encoding: utf-8
2
3 from __future__ import unicode_literals
4
5 import os
6 import re
7
8 from .common import InfoExtractor
9 from .youtube import YoutubeIE
10 from ..compat import (
11 compat_urllib_parse,
12 compat_urlparse,
13 compat_xml_parse_error,
14 )
15 from ..utils import (
16 determine_ext,
17 ExtractorError,
18 float_or_none,
19 HEADRequest,
20 is_html,
21 orderedSet,
22 parse_xml,
23 smuggle_url,
24 unescapeHTML,
25 unified_strdate,
26 unsmuggle_url,
27 UnsupportedError,
28 url_basename,
29 xpath_text,
30 )
31 from .brightcove import BrightcoveIE
32 from .nbc import NBCSportsVPlayerIE
33 from .ooyala import OoyalaIE
34 from .rutv import RUTVIE
35 from .smotri import SmotriIE
36 from .condenast import CondeNastIE
37 from .udn import UDNEmbedIE
38 from .senateisvp import SenateISVPIE
39 from .bliptv import BlipTVIE
40 from .svt import SVTIE
41
42
43 class GenericIE(InfoExtractor):
44 IE_DESC = 'Generic downloader that works on some sites'
45 _VALID_URL = r'.*'
46 IE_NAME = 'generic'
47 _TESTS = [
48 {
49 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
50 'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
51 'info_dict': {
52 'id': '13601338388002',
53 'ext': 'mp4',
54 'uploader': 'www.hodiho.fr',
55 'title': 'R\u00e9gis plante sa Jeep',
56 }
57 },
58 # bandcamp page with custom domain
59 {
60 'add_ie': ['Bandcamp'],
61 'url': 'http://bronyrock.com/track/the-pony-mash',
62 'info_dict': {
63 'id': '3235767654',
64 'ext': 'mp3',
65 'title': 'The Pony Mash',
66 'uploader': 'M_Pallante',
67 },
68 'skip': 'There is a limit of 200 free downloads / month for the test song',
69 },
70 # embedded brightcove video
71 # it also tests brightcove videos that need to set the 'Referer' in the
72 # http requests
73 {
74 'add_ie': ['Brightcove'],
75 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
76 'info_dict': {
77 'id': '2765128793001',
78 'ext': 'mp4',
79 'title': 'Le cours de bourse : l’analyse technique',
80 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
81 'uploader': 'BFM BUSINESS',
82 },
83 'params': {
84 'skip_download': True,
85 },
86 },
87 {
88 # https://github.com/rg3/youtube-dl/issues/2253
89 'url': 'http://bcove.me/i6nfkrc3',
90 'md5': '0ba9446db037002366bab3b3eb30c88c',
91 'info_dict': {
92 'id': '3101154703001',
93 'ext': 'mp4',
94 'title': 'Still no power',
95 'uploader': 'thestar.com',
96 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
97 },
98 'add_ie': ['Brightcove'],
99 },
100 {
101 'url': 'http://www.championat.com/video/football/v/87/87499.html',
102 'md5': 'fb973ecf6e4a78a67453647444222983',
103 'info_dict': {
104 'id': '3414141473001',
105 'ext': 'mp4',
106 'title': 'Видео. Удаление Дзагоева (ЦСКА)',
107 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
108 'uploader': 'Championat',
109 },
110 },
111 {
112 # https://github.com/rg3/youtube-dl/issues/3541
113 'add_ie': ['Brightcove'],
114 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
115 'info_dict': {
116 'id': '3866516442001',
117 'ext': 'mp4',
118 'title': 'Leer mij vrouwen kennen: Aflevering 1',
119 'description': 'Leer mij vrouwen kennen: Aflevering 1',
120 'uploader': 'SBS Broadcasting',
121 },
122 'skip': 'Restricted to Netherlands',
123 'params': {
124 'skip_download': True, # m3u8 download
125 },
126 },
127 # Direct link to a video
128 {
129 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
130 'md5': '67d406c2bcb6af27fa886f31aa934bbe',
131 'info_dict': {
132 'id': 'trailer',
133 'ext': 'mp4',
134 'title': 'trailer',
135 'upload_date': '20100513',
136 }
137 },
138 # ooyala video
139 {
140 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
141 'md5': '166dd577b433b4d4ebfee10b0824d8ff',
142 'info_dict': {
143 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
144 'ext': 'mp4',
145 'title': '2cc213299525360.mov', # that's what we get
146 },
147 'add_ie': ['Ooyala'],
148 },
149 # multiple ooyala embeds on SBN network websites
150 {
151 'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
152 'info_dict': {
153 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
154 'title': '25 lies you will tell yourself on National Signing Day - SBNation.com',
155 },
156 'playlist_mincount': 3,
157 'params': {
158 'skip_download': True,
159 },
160 'add_ie': ['Ooyala'],
161 },
162 # google redirect
163 {
164 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
165 'info_dict': {
166 'id': 'cmQHVoWB5FY',
167 'ext': 'mp4',
168 'upload_date': '20130224',
169 'uploader_id': 'TheVerge',
170 'description': 're:^Chris Ziegler takes a look at the\.*',
171 'uploader': 'The Verge',
172 'title': 'First Firefox OS phones side-by-side',
173 },
174 'params': {
175 'skip_download': False,
176 }
177 },
178 # embed.ly video
179 {
180 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
181 'info_dict': {
182 'id': '9ODmcdjQcHQ',
183 'ext': 'mp4',
184 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
185 'upload_date': '20140225',
186 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
187 'uploader': 'Tested',
188 'uploader_id': 'testedcom',
189 },
190 # No need to test YoutubeIE here
191 'params': {
192 'skip_download': True,
193 },
194 },
195 # funnyordie embed
196 {
197 'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
198 'info_dict': {
199 'id': '18e820ec3f',
200 'ext': 'mp4',
201 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
202 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
203 },
204 },
205 # BBC iPlayer embeds
206 {
207 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER',
208 'info_dict': {
209 'title': 'BBC - Blogs - Adam Curtis - BUGGER',
210 },
211 'playlist_mincount': 18,
212 },
213 # RUTV embed
214 {
215 'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
216 'info_dict': {
217 'id': '776940',
218 'ext': 'mp4',
219 'title': 'Охотское море стало целиком российским',
220 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43',
221 },
222 'params': {
223 # m3u8 download
224 'skip_download': True,
225 },
226 },
227 # Embedded TED video
228 {
229 'url': 'http://en.support.wordpress.com/videos/ted-talks/',
230 'md5': '65fdff94098e4a607385a60c5177c638',
231 'info_dict': {
232 'id': '1969',
233 'ext': 'mp4',
234 'title': 'Hidden miracles of the natural world',
235 'uploader': 'Louie Schwartzberg',
236 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
237 }
238 },
239 # Embedded Ustream video
240 {
241 'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm',
242 'md5': '27b99cdb639c9b12a79bca876a073417',
243 'info_dict': {
244 'id': '45734260',
245 'ext': 'flv',
246 'uploader': 'AU SPA: The NSA and Privacy',
247 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman'
248 }
249 },
250 # nowvideo embed hidden behind percent encoding
251 {
252 'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/',
253 'md5': '2baf4ddd70f697d94b1c18cf796d5107',
254 'info_dict': {
255 'id': '06e53103ca9aa',
256 'ext': 'flv',
257 'title': 'Macross Episode 001 Watch Macross Episode 001 onl',
258 'description': 'No description',
259 },
260 },
261 # arte embed
262 {
263 'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html',
264 'md5': '7653032cbb25bf6c80d80f217055fa43',
265 'info_dict': {
266 'id': '048195-004_PLUS7-F',
267 'ext': 'flv',
268 'title': 'X:enius',
269 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168',
270 'upload_date': '20140320',
271 },
272 'params': {
273 'skip_download': 'Requires rtmpdump'
274 }
275 },
276 # Condé Nast embed
277 {
278 'url': 'http://www.wired.com/2014/04/honda-asimo/',
279 'md5': 'ba0dfe966fa007657bd1443ee672db0f',
280 'info_dict': {
281 'id': '53501be369702d3275860000',
282 'ext': 'mp4',
283 'title': 'Honda’s New Asimo Robot Is More Human Than Ever',
284 }
285 },
286 # Dailymotion embed
287 {
288 'url': 'http://www.spi0n.com/zap-spi0n-com-n216/',
289 'md5': '441aeeb82eb72c422c7f14ec533999cd',
290 'info_dict': {
291 'id': 'k2mm4bCdJ6CQ2i7c8o2',
292 'ext': 'mp4',
293 'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
294 'uploader': 'Spi0n',
295 },
296 'add_ie': ['Dailymotion'],
297 },
298 # YouTube embed
299 {
300 'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
301 'info_dict': {
302 'id': 'FXRb4ykk4S0',
303 'ext': 'mp4',
304 'title': 'The NBL Auction 2014',
305 'uploader': 'BADMINTON England',
306 'uploader_id': 'BADMINTONEvents',
307 'upload_date': '20140603',
308 'description': 'md5:9ef128a69f1e262a700ed83edb163a73',
309 },
310 'add_ie': ['Youtube'],
311 'params': {
312 'skip_download': True,
313 }
314 },
315 # MTVServices embed
316 {
317 'url': 'http://www.gametrailers.com/news-post/76093/north-america-europe-is-getting-that-mario-kart-8-mercedes-dlc-too',
318 'md5': '35727f82f58c76d996fc188f9755b0d5',
319 'info_dict': {
320 'id': '0306a69b-8adf-4fb5-aace-75f8e8cbfca9',
321 'ext': 'mp4',
322 'title': 'Review',
323 'description': 'Mario\'s life in the fast lane has never looked so good.',
324 },
325 },
326 # YouTube embed via <data-embed-url="">
327 {
328 'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',
329 'info_dict': {
330 'id': '4vAffPZIT44',
331 'ext': 'mp4',
332 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!',
333 'uploader': 'Gameloft',
334 'uploader_id': 'gameloft',
335 'upload_date': '20140828',
336 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4',
337 },
338 'params': {
339 'skip_download': True,
340 }
341 },
342 # Camtasia studio
343 {
344 'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
345 'playlist': [{
346 'md5': '0c5e352edabf715d762b0ad4e6d9ee67',
347 'info_dict': {
348 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
349 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1',
350 'ext': 'flv',
351 'duration': 2235.90,
352 }
353 }, {
354 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63',
355 'info_dict': {
356 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP',
357 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip',
358 'ext': 'flv',
359 'duration': 2235.93,
360 }
361 }],
362 'info_dict': {
363 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
364 }
365 },
366 # Flowplayer
367 {
368 'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html',
369 'md5': '9d65602bf31c6e20014319c7d07fba27',
370 'info_dict': {
371 'id': '5123ea6d5e5a7',
372 'ext': 'mp4',
373 'age_limit': 18,
374 'uploader': 'www.handjobhub.com',
375 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
376 }
377 },
378 # RSS feed
379 {
380 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
381 'info_dict': {
382 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
383 'title': 'Zero Punctuation',
384 'description': 're:.*groundbreaking video review series.*'
385 },
386 'playlist_mincount': 11,
387 },
388 # Multiple brightcove videos
389 # https://github.com/rg3/youtube-dl/issues/2283
390 {
391 'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
392 'info_dict': {
393 'id': 'always-never',
394 'title': 'Always / Never - The New Yorker',
395 },
396 'playlist_count': 3,
397 'params': {
398 'extract_flat': False,
399 'skip_download': True,
400 }
401 },
402 # MLB embed
403 {
404 'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/',
405 'md5': '96f09a37e44da40dd083e12d9a683327',
406 'info_dict': {
407 'id': '33322633',
408 'ext': 'mp4',
409 'title': 'Ump changes call to ball',
410 'description': 'md5:71c11215384298a172a6dcb4c2e20685',
411 'duration': 48,
412 'timestamp': 1401537900,
413 'upload_date': '20140531',
414 'thumbnail': 're:^https?://.*\.jpg$',
415 },
416 },
417 # Wistia embed
418 {
419 'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
420 'md5': '8788b683c777a5cf25621eaf286d0c23',
421 'info_dict': {
422 'id': '1cfaf6b7ea',
423 'ext': 'mov',
424 'title': 'md5:51364a8d3d009997ba99656004b5e20d',
425 'duration': 643.0,
426 'filesize': 182808282,
427 'uploader': 'education-portal.com',
428 },
429 },
430 {
431 'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
432 'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
433 'info_dict': {
434 'id': 'uxjb0lwrcz',
435 'ext': 'mp4',
436 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
437 'duration': 1715.0,
438 'uploader': 'thoughtworks.wistia.com',
439 },
440 },
441 # Direct download with broken HEAD
442 {
443 'url': 'http://ai-radio.org:8000/radio.opus',
444 'info_dict': {
445 'id': 'radio',
446 'ext': 'opus',
447 'title': 'radio',
448 },
449 'params': {
450 'skip_download': True, # infinite live stream
451 },
452 'expected_warnings': [
453 r'501.*Not Implemented'
454 ],
455 },
456 # Soundcloud embed
457 {
458 'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
459 'info_dict': {
460 'id': '174391317',
461 'ext': 'mp3',
462 'description': 'md5:ff867d6b555488ad3c52572bb33d432c',
463 'uploader': 'Sophos Security',
464 'title': 'Chet Chat 171 - Oct 29, 2014',
465 'upload_date': '20141029',
466 }
467 },
468 # Livestream embed
469 {
470 'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast',
471 'info_dict': {
472 'id': '67864563',
473 'ext': 'flv',
474 'upload_date': '20141112',
475 'title': 'Rosetta #CometLanding webcast HL 10',
476 }
477 },
478 # LazyYT
479 {
480 'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
481 'info_dict': {
482 'id': '1986',
483 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
484 },
485 'playlist_mincount': 2,
486 },
487 # Direct link with incorrect MIME type
488 {
489 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
490 'md5': '4ccbebe5f36706d85221f204d7eb5913',
491 'info_dict': {
492 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
493 'id': '5_Lennart_Poettering_-_Systemd',
494 'ext': 'webm',
495 'title': '5_Lennart_Poettering_-_Systemd',
496 'upload_date': '20141120',
497 },
498 'expected_warnings': [
499 'URL could be a direct video link, returning it as such.'
500 ]
501 },
502 # Cinchcast embed
503 {
504 'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
505 'info_dict': {
506 'id': '7141703',
507 'ext': 'mp3',
508 'upload_date': '20141126',
509 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
510 }
511 },
512 # Cinerama player
513 {
514 'url': 'http://www.abc.net.au/7.30/content/2015/s4164797.htm',
515 'info_dict': {
516 'id': '730m_DandD_1901_512k',
517 'ext': 'mp4',
518 'uploader': 'www.abc.net.au',
519 'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015',
520 }
521 },
522 # embedded viddler video
523 {
524 'url': 'http://deadspin.com/i-cant-stop-watching-john-wall-chop-the-nuggets-with-th-1681801597',
525 'info_dict': {
526 'id': '4d03aad9',
527 'ext': 'mp4',
528 'uploader': 'deadspin',
529 'title': 'WALL-TO-GORTAT',
530 'timestamp': 1422285291,
531 'upload_date': '20150126',
532 },
533 'add_ie': ['Viddler'],
534 },
535 # Libsyn embed
536 {
537 'url': 'http://thedailyshow.cc.com/podcast/episodetwelve',
538 'info_dict': {
539 'id': '3377616',
540 'ext': 'mp3',
541 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart",
542 'description': 'md5:601cb790edd05908957dae8aaa866465',
543 'upload_date': '20150220',
544 },
545 },
546 # jwplayer YouTube
547 {
548 'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/',
549 'info_dict': {
550 'id': 'Mrj4DVp2zeA',
551 'ext': 'mp4',
552 'upload_date': '20150212',
553 'uploader': 'The National Archives UK',
554 'description': 'md5:a236581cd2449dd2df4f93412f3f01c6',
555 'uploader_id': 'NationalArchives08',
556 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue',
557 },
558 },
559 # rtl.nl embed
560 {
561 'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen',
562 'playlist_mincount': 5,
563 'info_dict': {
564 'id': 'aanslagen-kopenhagen',
565 'title': 'Aanslagen Kopenhagen | RTL Nieuws',
566 }
567 },
568 # Zapiks embed
569 {
570 'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html',
571 'info_dict': {
572 'id': '118046',
573 'ext': 'mp4',
574 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !',
575 }
576 },
577 # Kaltura embed
578 {
579 'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15',
580 'info_dict': {
581 'id': '1_eergr3h1',
582 'ext': 'mp4',
583 'upload_date': '20150226',
584 'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com',
585 'timestamp': int,
586 'title': 'John Carlson Postgame 2/25/15',
587 },
588 },
589 # Eagle.Platform embed (generic URL)
590 {
591 'url': 'http://lenta.ru/news/2015/03/06/navalny/',
592 'info_dict': {
593 'id': '227304',
594 'ext': 'mp4',
595 'title': 'Навальный вышел на свободу',
596 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
597 'thumbnail': 're:^https?://.*\.jpg$',
598 'duration': 87,
599 'view_count': int,
600 'age_limit': 0,
601 },
602 },
603 # ClipYou (Eagle.Platform) embed (custom URL)
604 {
605 'url': 'http://muz-tv.ru/play/7129/',
606 'info_dict': {
607 'id': '12820',
608 'ext': 'mp4',
609 'title': "'O Sole Mio",
610 'thumbnail': 're:^https?://.*\.jpg$',
611 'duration': 216,
612 'view_count': int,
613 },
614 },
615 # Pladform embed
616 {
617 'url': 'http://muz-tv.ru/kinozal/view/7400/',
618 'info_dict': {
619 'id': '100183293',
620 'ext': 'mp4',
621 'title': 'Тайны перевала Дятлова • 1 серия 2 часть',
622 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
623 'thumbnail': 're:^https?://.*\.jpg$',
624 'duration': 694,
625 'age_limit': 0,
626 },
627 },
628 # Playwire embed
629 {
630 'url': 'http://www.cinemablend.com/new/First-Joe-Dirt-2-Trailer-Teaser-Stupid-Greatness-70874.html',
631 'info_dict': {
632 'id': '3519514',
633 'ext': 'mp4',
634 'title': 'Joe Dirt 2 Beautiful Loser Teaser Trailer',
635 'thumbnail': 're:^https?://.*\.png$',
636 'duration': 45.115,
637 },
638 },
639 # 5min embed
640 {
641 'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/',
642 'md5': '4c6f127a30736b59b3e2c19234ee2bf7',
643 'info_dict': {
644 'id': '518726732',
645 'ext': 'mp4',
646 'title': 'Facebook Creates "On This Day" | Crunch Report',
647 },
648 },
649 # SVT embed
650 {
651 'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun',
652 'info_dict': {
653 'id': '2900353',
654 'ext': 'flv',
655 'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)',
656 'duration': 27,
657 'age_limit': 0,
658 },
659 },
660 # RSS feed with enclosure
661 {
662 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
663 'info_dict': {
664 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
665 'ext': 'm4v',
666 'upload_date': '20150228',
667 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
668 }
669 },
670 # Crooks and Liars embed
671 {
672 'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists',
673 'info_dict': {
674 'id': '8RUoRhRi',
675 'ext': 'mp4',
676 'title': "Fox & Friends Says Protecting Atheists From Discrimination Is Anti-Christian!",
677 'description': 'md5:e1a46ad1650e3a5ec7196d432799127f',
678 'timestamp': 1428207000,
679 'upload_date': '20150405',
680 'uploader': 'Heather',
681 },
682 },
683 # Crooks and Liars external embed
684 {
685 'url': 'http://theothermccain.com/2010/02/02/video-proves-that-bill-kristol-has-been-watching-glenn-beck/comment-page-1/',
686 'info_dict': {
687 'id': 'MTE3MjUtMzQ2MzA',
688 'ext': 'mp4',
689 'title': 'md5:5e3662a81a4014d24c250d76d41a08d5',
690 'description': 'md5:9b8e9542d6c3c5de42d6451b7d780cec',
691 'timestamp': 1265032391,
692 'upload_date': '20100201',
693 'uploader': 'Heather',
694 },
695 },
696 # NBC Sports vplayer embed
697 {
698 'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a',
699 'info_dict': {
700 'id': 'ln7x1qSThw4k',
701 'ext': 'flv',
702 'title': "PFT Live: New leader in the 'new-look' defense",
703 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e',
704 },
705 },
706 # UDN embed
707 {
708 'url': 'http://www.udn.com/news/story/7314/822787',
709 'md5': 'fd2060e988c326991037b9aff9df21a6',
710 'info_dict': {
711 'id': '300346',
712 'ext': 'mp4',
713 'title': '中一中男師變性 全校師生力挺',
714 'thumbnail': 're:^https?://.*\.jpg$',
715 }
716 },
717 # Ooyala embed
718 {
719 'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T',
720 'info_dict': {
721 'id': '50YnY4czr4ms1vJ7yz3xzq0excz_pUMs',
722 'ext': 'mp4',
723 'description': 'VIDEO: Index/Match versus VLOOKUP.',
724 'title': 'This is what separates the Excel masters from the wannabes',
725 },
726 'params': {
727 # m3u8 downloads
728 'skip_download': True,
729 }
730 },
731 # Contains a SMIL manifest
732 {
733 'url': 'http://www.telewebion.com/fa/1263668/%D9%82%D8%B1%D8%B9%D9%87%E2%80%8C%DA%A9%D8%B4%DB%8C-%D9%84%DB%8C%DA%AF-%D9%82%D9%87%D8%B1%D9%85%D8%A7%D9%86%D8%A7%D9%86-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7/%2B-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84.html',
734 'info_dict': {
735 'id': 'file',
736 'ext': 'flv',
737 'title': '+ Football: Lottery Champions League Europe',
738 'uploader': 'www.telewebion.com',
739 },
740 'params': {
741 # rtmpe downloads
742 'skip_download': True,
743 }
744 }
745 ]
746
def report_following_redirect(self, new_url):
    """Print a '[redirect]' notice naming the URL that is being followed.

    new_url is interpolated into the message, which is emitted through the
    downloader's to_screen().
    """
    message = '[redirect] Following redirect to %s' % new_url
    self._downloader.to_screen(message)
750
def _extract_rss(self, url, video_id, doc):
    """Build a playlist result from a parsed RSS document.

    doc is the root element of the feed.  Every ./channel/item contributes
    one 'url' entry that points at the item's <link> or, when that is
    missing or empty, at the first <enclosure> with a non-empty url
    attribute.  Items that offer neither are skipped.

    The feed URL itself is used as the playlist id; video_id is not used.

    Missing or empty <title> / <description> elements produce None rather
    than raising: RSS 2.0 does not require an item to carry a title, so a
    valid feed must not crash the extractor.
    """
    def text_or_none(node, path):
        # findtext() returns None for an absent element and '' for an
        # empty one; collapse both to None.
        return node.findtext(path) or None

    entries = []
    for item in doc.findall('./channel/item'):
        next_url = text_or_none(item, 'link')
        if not next_url:
            # Podcast-style feeds carry the media in <enclosure url="...">.
            next_url = next(
                (enclosure.attrib['url']
                 for enclosure in item.findall('./enclosure')
                 if enclosure.attrib.get('url')),
                None)

        if not next_url:
            continue

        entries.append({
            '_type': 'url',
            'url': next_url,
            'title': text_or_none(item, 'title'),
        })

    return {
        '_type': 'playlist',
        'id': url,
        'title': text_or_none(doc, './channel/title'),
        'description': text_or_none(doc, './channel/description'),
        'entries': entries,
    }
782
def _extract_camtasia(self, url, video_id, webpage):
    """Extract the videos of a Camtasia presentation as a playlist.

    The page is recognised by a
    fo.addVariable("csConfigFile", "<path>"); call.  The referenced XML
    configuration is downloaded (resolved relative to url) and every child
    of ./playlist/array/fileset that has a non-empty <uri> becomes one
    entry, titled '<DC.title> - <child tag>'.

    Returns None if no camtasia video can be found, i.e. when the page has
    no csConfigFile variable or the configuration has no fileset.  The
    'DC.title' meta lookup is fatal, so a page that matches but lacks that
    tag raises from _html_search_meta.
    """
    camtasia_cfg = self._search_regex(
        r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);',
        webpage, 'camtasia configuration file', default=None)
    if camtasia_cfg is None:
        return None

    title = self._html_search_meta('DC.title', webpage, fatal=True)

    camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg)
    config_doc = self._download_xml(
        camtasia_url, video_id,
        note='Downloading camtasia configuration',
        errnote='Failed to download camtasia configuration')
    fileset_node = config_doc.find('./playlist/array/fileset')
    if fileset_node is None:
        # Configuration without a fileset: nothing to extract.
        return None

    entries = []
    # Iterate the element directly; Element.getchildren() was removed in
    # Python 3.9.
    for n in fileset_node:
        url_n = n.find('./uri')
        if url_n is None or not url_n.text:
            continue

        duration_n = n.find('./duration')
        duration = None if duration_n is None else float_or_none(duration_n.text)

        entries.append({
            # File name of the URI without directory and extension.
            'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
            'title': '%s - %s' % (title, n.tag),
            'url': compat_urlparse.urljoin(url, url_n.text),
            'duration': duration,
        })

    return {
        '_type': 'playlist',
        'entries': entries,
        'title': title,
    }
819
820 def _real_extract(self, url):
821 if url.startswith('//'):
822 return {
823 '_type': 'url',
824 'url': self.http_scheme() + url,
825 }
826
827 parsed_url = compat_urlparse.urlparse(url)
828 if not parsed_url.scheme:
829 default_search = self._downloader.params.get('default_search')
830 if default_search is None:
831 default_search = 'fixup_error'
832
833 if default_search in ('auto', 'auto_warning', 'fixup_error'):
834 if '/' in url:
835 self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
836 return self.url_result('http://' + url)
837 elif default_search != 'fixup_error':
838 if default_search == 'auto_warning':
839 if re.match(r'^(?:url|URL)$', url):
840 raise ExtractorError(
841 'Invalid URL: %r . Call youtube-dl like this: youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc" ' % url,
842 expected=True)
843 else:
844 self._downloader.report_warning(
845 'Falling back to youtube search for %s . Set --default-search "auto" to suppress this warning.' % url)
846 return self.url_result('ytsearch:' + url)
847
848 if default_search in ('error', 'fixup_error'):
849 raise ExtractorError(
850 '%r is not a valid URL. '
851 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube'
852 % (url, url), expected=True)
853 else:
854 if ':' not in default_search:
855 default_search += ':'
856 return self.url_result(default_search + url)
857
858 url, smuggled_data = unsmuggle_url(url)
859 force_videoid = None
860 is_intentional = smuggled_data and smuggled_data.get('to_generic')
861 if smuggled_data and 'force_videoid' in smuggled_data:
862 force_videoid = smuggled_data['force_videoid']
863 video_id = force_videoid
864 else:
865 video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]
866
867 self.to_screen('%s: Requesting header' % video_id)
868
869 head_req = HEADRequest(url)
870 head_response = self._request_webpage(
871 head_req, video_id,
872 note=False, errnote='Could not send HEAD request to %s' % url,
873 fatal=False)
874
875 if head_response is not False:
876 # Check for redirect
877 new_url = head_response.geturl()
878 if url != new_url:
879 self.report_following_redirect(new_url)
880 if force_videoid:
881 new_url = smuggle_url(
882 new_url, {'force_videoid': force_videoid})
883 return self.url_result(new_url)
884
885 full_response = None
886 if head_response is False:
887 full_response = self._request_webpage(url, video_id)
888 head_response = full_response
889
890 # Check for direct link to a video
891 content_type = head_response.headers.get('Content-Type', '')
892 m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
893 if m:
894 upload_date = unified_strdate(
895 head_response.headers.get('Last-Modified'))
896 return {
897 'id': video_id,
898 'title': os.path.splitext(url_basename(url))[0],
899 'direct': True,
900 'formats': [{
901 'format_id': m.group('format_id'),
902 'url': url,
903 'vcodec': 'none' if m.group('type') == 'audio' else None
904 }],
905 'upload_date': upload_date,
906 }
907
908 if not self._downloader.params.get('test', False) and not is_intentional:
909 self._downloader.report_warning('Falling back on generic information extractor.')
910
911 if not full_response:
912 full_response = self._request_webpage(url, video_id)
913
914 # Maybe it's a direct link to a video?
915 # Be careful not to download the whole thing!
916 first_bytes = full_response.read(512)
917 if not is_html(first_bytes):
918 self._downloader.report_warning(
919 'URL could be a direct video link, returning it as such.')
920 upload_date = unified_strdate(
921 head_response.headers.get('Last-Modified'))
922 return {
923 'id': video_id,
924 'title': os.path.splitext(url_basename(url))[0],
925 'direct': True,
926 'url': url,
927 'upload_date': upload_date,
928 }
929
930 webpage = self._webpage_read_content(
931 full_response, url, video_id, prefix=first_bytes)
932
933 self.report_extraction(video_id)
934
935 # Is it an RSS feed?
936 try:
937 doc = parse_xml(webpage)
938 if doc.tag == 'rss':
939 return self._extract_rss(url, video_id, doc)
940 except compat_xml_parse_error:
941 pass
942
943 # Is it a Camtasia project?
944 camtasia_res = self._extract_camtasia(url, video_id, webpage)
945 if camtasia_res is not None:
946 return camtasia_res
947
948 # Sometimes embedded video player is hidden behind percent encoding
949 # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
950 # Unescaping the whole page allows to handle those cases in a generic way
951 webpage = compat_urllib_parse.unquote(webpage)
952
953 # it's tempting to parse this further, but you would
954 # have to take into account all the variations like
955 # Video Title - Site Name
956 # Site Name | Video Title
957 # Video Title - Tagline | Site Name
958 # and so on and so forth; it's just not practical
959 video_title = self._html_search_regex(
960 r'(?s)<title>(.*?)</title>', webpage, 'video title',
961 default='video')
962
963 # Try to detect age limit automatically
964 age_limit = self._rta_search(webpage)
965 # And then there are the jokers who advertise that they use RTA,
966 # but actually don't.
967 AGE_LIMIT_MARKERS = [
968 r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>',
969 ]
970 if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
971 age_limit = 18
972
973 # video uploader is domain name
974 video_uploader = self._search_regex(
975 r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
976
977 # Helper method
978 def _playlist_from_matches(matches, getter=None, ie=None):
979 urlrs = orderedSet(
980 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
981 for m in matches)
982 return self.playlist_result(
983 urlrs, playlist_id=video_id, playlist_title=video_title)
984
985 # Look for BrightCove:
986 bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
987 if bc_urls:
988 self.to_screen('Brightcove video detected.')
989 entries = [{
990 '_type': 'url',
991 'url': smuggle_url(bc_url, {'Referer': url}),
992 'ie_key': 'Brightcove'
993 } for bc_url in bc_urls]
994
995 return {
996 '_type': 'playlist',
997 'title': video_title,
998 'id': video_id,
999 'entries': entries,
1000 }
1001
1002 # Look for embedded rtl.nl player
1003 matches = re.findall(
1004 r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+video_embed[^"]+)"',
1005 webpage)
1006 if matches:
1007 return _playlist_from_matches(matches, ie='RtlNl')
1008
1009 # Look for embedded (iframe) Vimeo player
1010 mobj = re.search(
1011 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
1012 if mobj:
1013 player_url = unescapeHTML(mobj.group('url'))
1014 surl = smuggle_url(player_url, {'Referer': url})
1015 return self.url_result(surl)
1016 # Look for embedded (swf embed) Vimeo player
1017 mobj = re.search(
1018 r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
1019 if mobj:
1020 return self.url_result(mobj.group(1))
1021
1022 # Look for embedded YouTube player
1023 matches = re.findall(r'''(?x)
1024 (?:
1025 <iframe[^>]+?src=|
1026 data-video-url=|
1027 <embed[^>]+?src=|
1028 embedSWF\(?:\s*|
1029 new\s+SWFObject\(
1030 )
1031 (["\'])
1032 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
1033 (?:embed|v|p)/.+?)
1034 \1''', webpage)
1035 if matches:
1036 return _playlist_from_matches(
1037 matches, lambda m: unescapeHTML(m[1]))
1038
1039 # Look for lazyYT YouTube embed
1040 matches = re.findall(
1041 r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
1042 if matches:
1043 return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
1044
1045 # Look for embedded Dailymotion player
1046 matches = re.findall(
1047 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
1048 if matches:
1049 return _playlist_from_matches(
1050 matches, lambda m: unescapeHTML(m[1]))
1051
1052 # Look for embedded Dailymotion playlist player (#3822)
1053 m = re.search(
1054 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
1055 if m:
1056 playlists = re.findall(
1057 r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
1058 if playlists:
1059 return _playlist_from_matches(
1060 playlists, lambda p: '//dailymotion.com/playlist/%s' % p)
1061
1062 # Look for embedded Wistia player
1063 match = re.search(
1064 r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
1065 if match:
1066 embed_url = self._proto_relative_url(
1067 unescapeHTML(match.group('url')))
1068 return {
1069 '_type': 'url_transparent',
1070 'url': embed_url,
1071 'ie_key': 'Wistia',
1072 'uploader': video_uploader,
1073 'title': video_title,
1074 'id': video_id,
1075 }
1076
1077 match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
1078 if match:
1079 return {
1080 '_type': 'url_transparent',
1081 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')),
1082 'ie_key': 'Wistia',
1083 'uploader': video_uploader,
1084 'title': video_title,
1085 'id': match.group('id')
1086 }
1087
1088 # Look for embedded blip.tv player
1089 bliptv_url = BlipTVIE._extract_url(webpage)
1090 if bliptv_url:
1091 return self.url_result(bliptv_url, 'BlipTV')
1092
1093 # Look for SVT player
1094 svt_url = SVTIE._extract_url(webpage)
1095 if svt_url:
1096 return self.url_result(svt_url, 'SVT')
1097
1098 # Look for embedded condenast player
1099 matches = re.findall(
1100 r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
1101 webpage)
1102 if matches:
1103 return {
1104 '_type': 'playlist',
1105 'entries': [{
1106 '_type': 'url',
1107 'ie_key': 'CondeNast',
1108 'url': ma,
1109 } for ma in matches],
1110 'title': video_title,
1111 'id': video_id,
1112 }
1113
1114 # Look for Bandcamp pages with custom domain
1115 mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
1116 if mobj is not None:
1117 burl = unescapeHTML(mobj.group(1))
1118 # Don't set the extractor because it can be a track url or an album
1119 return self.url_result(burl)
1120
1121 # Look for embedded Vevo player
1122 mobj = re.search(
1123 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
1124 if mobj is not None:
1125 return self.url_result(mobj.group('url'))
1126
1127 # Look for embedded Viddler player
1128 mobj = re.search(
1129 r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1',
1130 webpage)
1131 if mobj is not None:
1132 return self.url_result(mobj.group('url'))
1133
1134 # Look for NYTimes player
1135 mobj = re.search(
1136 r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>',
1137 webpage)
1138 if mobj is not None:
1139 return self.url_result(mobj.group('url'))
1140
1141 # Look for Libsyn player
1142 mobj = re.search(
1143 r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage)
1144 if mobj is not None:
1145 return self.url_result(mobj.group('url'))
1146
1147 # Look for Ooyala videos
1148 mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
1149 re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
1150 re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or
1151 re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
1152 if mobj is not None:
1153 return OoyalaIE._build_url_result(mobj.group('ec'))
1154
1155 # Look for multiple Ooyala embeds on SBN network websites
1156 mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage)
1157 if mobj is not None:
1158 embeds = self._parse_json(mobj.group(1), video_id, fatal=False)
1159 if embeds:
1160 return _playlist_from_matches(
1161 embeds, getter=lambda v: OoyalaIE._url_for_embed_code(v['provider_video_id']), ie='Ooyala')
1162
1163 # Look for Aparat videos
1164 mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
1165 if mobj is not None:
1166 return self.url_result(mobj.group(1), 'Aparat')
1167
1168 # Look for MPORA videos
1169 mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)
1170 if mobj is not None:
1171 return self.url_result(mobj.group(1), 'Mpora')
1172
1173 # Look for embedded NovaMov-based player
1174 mobj = re.search(
1175 r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])
1176 (?P<url>http://(?:(?:embed|www)\.)?
1177 (?:novamov\.com|
1178 nowvideo\.(?:ch|sx|eu|at|ag|co)|
1179 videoweed\.(?:es|com)|
1180 movshare\.(?:net|sx|ag)|
1181 divxstage\.(?:eu|net|ch|co|at|ag))
1182 /embed\.php.+?)\1''', webpage)
1183 if mobj is not None:
1184 return self.url_result(mobj.group('url'))
1185
1186 # Look for embedded Facebook player
1187 mobj = re.search(
1188 r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
1189 if mobj is not None:
1190 return self.url_result(mobj.group('url'), 'Facebook')
1191
1192 # Look for embedded VK player
1193 mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
1194 if mobj is not None:
1195 return self.url_result(mobj.group('url'), 'VK')
1196
1197 # Look for embedded ivi player
1198 mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
1199 if mobj is not None:
1200 return self.url_result(mobj.group('url'), 'Ivi')
1201
1202 # Look for embedded Huffington Post player
1203 mobj = re.search(
1204 r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
1205 if mobj is not None:
1206 return self.url_result(mobj.group('url'), 'HuffPost')
1207
1208 # Look for embed.ly
1209 mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
1210 if mobj is not None:
1211 return self.url_result(mobj.group('url'))
1212 mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
1213 if mobj is not None:
1214 return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
1215
1216 # Look for funnyordie embed
1217 matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
1218 if matches:
1219 return _playlist_from_matches(
1220 matches, getter=unescapeHTML, ie='FunnyOrDie')
1221
1222 # Look for BBC iPlayer embed
1223 matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
1224 if matches:
1225 return _playlist_from_matches(matches, ie='BBCCoUk')
1226
1227 # Look for embedded RUTV player
1228 rutv_url = RUTVIE._extract_url(webpage)
1229 if rutv_url:
1230 return self.url_result(rutv_url, 'RUTV')
1231
1232 # Look for embedded TED player
1233 mobj = re.search(
1234 r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
1235 if mobj is not None:
1236 return self.url_result(mobj.group('url'), 'TED')
1237
1238 # Look for embedded Ustream videos
1239 mobj = re.search(
1240 r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
1241 if mobj is not None:
1242 return self.url_result(mobj.group('url'), 'Ustream')
1243
1244 # Look for embedded arte.tv player
1245 mobj = re.search(
1246 r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"',
1247 webpage)
1248 if mobj is not None:
1249 return self.url_result(mobj.group('url'), 'ArteTVEmbed')
1250
1251 # Look for embedded smotri.com player
1252 smotri_url = SmotriIE._extract_url(webpage)
1253 if smotri_url:
1254 return self.url_result(smotri_url, 'Smotri')
1255
1256 # Look for embeded soundcloud player
1257 mobj = re.search(
1258 r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
1259 webpage)
1260 if mobj is not None:
1261 url = unescapeHTML(mobj.group('url'))
1262 return self.url_result(url)
1263
1264 # Look for embedded vulture.com player
1265 mobj = re.search(
1266 r'<iframe src="(?P<url>https?://video\.vulture\.com/[^"]+)"',
1267 webpage)
1268 if mobj is not None:
1269 url = unescapeHTML(mobj.group('url'))
1270 return self.url_result(url, ie='Vulture')
1271
1272 # Look for embedded mtvservices player
1273 mobj = re.search(
1274 r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"',
1275 webpage)
1276 if mobj is not None:
1277 url = unescapeHTML(mobj.group('url'))
1278 return self.url_result(url, ie='MTVServicesEmbedded')
1279
1280 # Look for embedded yahoo player
1281 mobj = re.search(
1282 r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1',
1283 webpage)
1284 if mobj is not None:
1285 return self.url_result(mobj.group('url'), 'Yahoo')
1286
1287 # Look for embedded sbs.com.au player
1288 mobj = re.search(
1289 r'''(?x)
1290 (?:
1291 <meta\s+property="og:video"\s+content=|
1292 <iframe[^>]+?src=
1293 )
1294 (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''',
1295 webpage)
1296 if mobj is not None:
1297 return self.url_result(mobj.group('url'), 'SBS')
1298
1299 # Look for embedded Cinchcast player
1300 mobj = re.search(
1301 r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
1302 webpage)
1303 if mobj is not None:
1304 return self.url_result(mobj.group('url'), 'Cinchcast')
1305
1306 mobj = re.search(
1307 r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
1308 webpage)
1309 if not mobj:
1310 mobj = re.search(
1311 r'data-video-link=["\'](?P<url>http://m.mlb.com/video/[^"\']+)',
1312 webpage)
1313 if mobj is not None:
1314 return self.url_result(mobj.group('url'), 'MLB')
1315
1316 mobj = re.search(
1317 r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
1318 webpage)
1319 if mobj is not None:
1320 return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
1321
1322 mobj = re.search(
1323 r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"',
1324 webpage)
1325 if mobj is not None:
1326 return self.url_result(mobj.group('url'), 'Livestream')
1327
1328 # Look for Zapiks embed
1329 mobj = re.search(
1330 r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage)
1331 if mobj is not None:
1332 return self.url_result(mobj.group('url'), 'Zapiks')
1333
1334 # Look for Kaltura embeds
1335 mobj = re.search(
1336 r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage)
1337 if mobj is not None:
1338 return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura')
1339
1340 # Look for Eagle.Platform embeds
1341 mobj = re.search(
1342 r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage)
1343 if mobj is not None:
1344 return self.url_result(mobj.group('url'), 'EaglePlatform')
1345
1346 # Look for ClipYou (uses Eagle.Platform) embeds
1347 mobj = re.search(
1348 r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
1349 if mobj is not None:
1350 return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform')
1351
1352 # Look for Pladform embeds
1353 mobj = re.search(
1354 r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage)
1355 if mobj is not None:
1356 return self.url_result(mobj.group('url'), 'Pladform')
1357
1358 # Look for Playwire embeds
1359 mobj = re.search(
1360 r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1', webpage)
1361 if mobj is not None:
1362 return self.url_result(mobj.group('url'))
1363
1364 # Look for 5min embeds
1365 mobj = re.search(
1366 r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage)
1367 if mobj is not None:
1368 return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin')
1369
1370 # Look for Crooks and Liars embeds
1371 mobj = re.search(
1372 r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1', webpage)
1373 if mobj is not None:
1374 return self.url_result(mobj.group('url'))
1375
1376 # Look for NBC Sports VPlayer embeds
1377 nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
1378 if nbc_sports_url:
1379 return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
1380
1381 # Look for UDN embeds
1382 mobj = re.search(
1383 r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._VALID_URL, webpage)
1384 if mobj is not None:
1385 return self.url_result(
1386 compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed')
1387
1388 # Look for Senate ISVP iframe
1389 senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
1390 if senate_isvp_url:
1391 return self.url_result(surl, 'SenateISVP')
1392
def check_video(vurl):
    """Tell whether vurl is worth keeping as a candidate video URL.

    A URL accepted by YoutubeIE.suitable is always kept. Any other URL is
    kept only when its path component contains a dot and the extension
    reported by determine_ext is not one of the excluded extensions
    listed below.
    """
    if YoutubeIE.suitable(vurl):
        return True

    path = compat_urlparse.urlparse(vurl).path
    extension = determine_ext(path)
    if '.' not in path:
        return False

    excluded_exts = ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml')
    return extension not in excluded_exts
1399
def filter_video(urls):
    """Return a list of the items of urls accepted by check_video, in order."""
    return [candidate for candidate in urls if check_video(candidate)]
1402
1403 # Start with something easy: JW Player in SWFObject
1404 found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
1405 if not found:
1406 # Look for gorilla-vid style embedding
1407 found = filter_video(re.findall(r'''(?sx)
1408 (?:
1409 jw_plugins|
1410 JWPlayerOptions|
1411 jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
1412 )
1413 .*?
1414 ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage))
1415 if not found:
1416 # Broaden the search a little bit
1417 found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
1418 if not found:
1419 # Broaden the findall a little bit: JWPlayer JS loader
1420 found = filter_video(re.findall(
1421 r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
1422 if not found:
1423 # Flow player
1424 found = filter_video(re.findall(r'''(?xs)
1425 flowplayer\("[^"]+",\s*
1426 \{[^}]+?\}\s*,
1427 \s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s*
1428 ["']?url["']?\s*:\s*["']([^"']+)["']
1429 ''', webpage))
1430 if not found:
1431 # Cinerama player
1432 found = re.findall(
1433 r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage)
1434 if not found:
1435 # Try to find twitter cards info
1436 found = filter_video(re.findall(
1437 r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
1438 if not found:
1439 # We look for Open Graph info:
1440 # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
1441 m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
1442 # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
1443 if m_video_type is not None:
1444 found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
1445 if not found:
1446 # HTML5 video
1447 found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
1448 if not found:
1449 REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
1450 found = re.search(
1451 r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
1452 r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,
1453 webpage)
1454 if not found:
1455 # Look also in Refresh HTTP header
1456 refresh_header = head_response.headers.get('Refresh')
1457 if refresh_header:
1458 found = re.search(REDIRECT_REGEX, refresh_header)
1459 if found:
1460 new_url = compat_urlparse.urljoin(url, found.group(1))
1461 self.report_following_redirect(new_url)
1462 return {
1463 '_type': 'url',
1464 'url': new_url,
1465 }
1466 if not found:
1467 raise UnsupportedError(url)
1468
1469 entries = []
1470 for video_url in found:
1471 video_url = compat_urlparse.urljoin(url, video_url)
1472 video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
1473
1474 # Sometimes, jwplayer extraction will result in a YouTube URL
1475 if YoutubeIE.suitable(video_url):
1476 entries.append(self.url_result(video_url, 'Youtube'))
1477 continue
1478
1479 # here's a fun little line of code for you:
1480 video_id = os.path.splitext(video_id)[0]
1481
1482 if determine_ext(video_url) == 'smil':
1483 entries.append({
1484 'id': video_id,
1485 'formats': self._extract_smil_formats(video_url, video_id),
1486 'uploader': video_uploader,
1487 'title': video_title,
1488 'age_limit': age_limit,
1489 })
1490 else:
1491 entries.append({
1492 'id': video_id,
1493 'url': video_url,
1494 'uploader': video_uploader,
1495 'title': video_title,
1496 'age_limit': age_limit,
1497 })
1498
1499 if len(entries) == 1:
1500 return entries[0]
1501 else:
1502 for num, e in enumerate(entries, start=1):
1503 # 'url' results don't have a title
1504 if e.get('title') is not None:
1505 e['title'] = '%s (%d)' % (e['title'], num)
1506 return {
1507 '_type': 'playlist',
1508 'entries': entries,
1509 }