]> Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/generic.py
Prepare for upload.
[youtubedl] / youtube_dl / extractor / generic.py
1 # encoding: utf-8
2
3 from __future__ import unicode_literals
4
5 import os
6 import re
7
8 from .common import InfoExtractor
9 from .youtube import YoutubeIE
10 from ..compat import (
11 compat_urllib_parse,
12 compat_urllib_parse_unquote,
13 compat_urllib_request,
14 compat_urlparse,
15 compat_xml_parse_error,
16 )
17 from ..utils import (
18 determine_ext,
19 ExtractorError,
20 float_or_none,
21 HEADRequest,
22 is_html,
23 orderedSet,
24 parse_xml,
25 smuggle_url,
26 unescapeHTML,
27 unified_strdate,
28 unsmuggle_url,
29 UnsupportedError,
30 url_basename,
31 xpath_text,
32 )
33 from .brightcove import BrightcoveIE
34 from .nbc import NBCSportsVPlayerIE
35 from .ooyala import OoyalaIE
36 from .rutv import RUTVIE
37 from .sportbox import SportBoxEmbedIE
38 from .smotri import SmotriIE
39 from .condenast import CondeNastIE
40 from .udn import UDNEmbedIE
41 from .senateisvp import SenateISVPIE
42 from .bliptv import BlipTVIE
43 from .svt import SVTIE
44
45
46 class GenericIE(InfoExtractor):
47 IE_DESC = 'Generic downloader that works on some sites'
48 _VALID_URL = r'.*'
49 IE_NAME = 'generic'
50 _TESTS = [
51 # Direct link to a video
52 {
53 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
54 'md5': '67d406c2bcb6af27fa886f31aa934bbe',
55 'info_dict': {
56 'id': 'trailer',
57 'ext': 'mp4',
58 'title': 'trailer',
59 'upload_date': '20100513',
60 }
61 },
62 # Direct link to media delivered compressed (until Accept-Encoding is *)
63 {
64 'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac',
65 'md5': '128c42e68b13950268b648275386fc74',
66 'info_dict': {
67 'id': 'FictionJunction-Parallel_Hearts',
68 'ext': 'flac',
69 'title': 'FictionJunction-Parallel_Hearts',
70 'upload_date': '20140522',
71 },
72 'expected_warnings': [
73 'URL could be a direct video link, returning it as such.'
74 ]
75 },
76 # Direct download with broken HEAD
77 {
78 'url': 'http://ai-radio.org:8000/radio.opus',
79 'info_dict': {
80 'id': 'radio',
81 'ext': 'opus',
82 'title': 'radio',
83 },
84 'params': {
85 'skip_download': True, # infinite live stream
86 },
87 'expected_warnings': [
88 r'501.*Not Implemented'
89 ],
90 },
91 # Direct link with incorrect MIME type
92 {
93 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
94 'md5': '4ccbebe5f36706d85221f204d7eb5913',
95 'info_dict': {
96 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
97 'id': '5_Lennart_Poettering_-_Systemd',
98 'ext': 'webm',
99 'title': '5_Lennart_Poettering_-_Systemd',
100 'upload_date': '20141120',
101 },
102 'expected_warnings': [
103 'URL could be a direct video link, returning it as such.'
104 ]
105 },
106 # RSS feed
107 {
108 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
109 'info_dict': {
110 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
111 'title': 'Zero Punctuation',
112 'description': 're:.*groundbreaking video review series.*'
113 },
114 'playlist_mincount': 11,
115 },
116 # RSS feed with enclosure
117 {
118 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
119 'info_dict': {
120 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
121 'ext': 'm4v',
122 'upload_date': '20150228',
123 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
124 }
125 },
126 # google redirect
127 {
128 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
129 'info_dict': {
130 'id': 'cmQHVoWB5FY',
131 'ext': 'mp4',
132 'upload_date': '20130224',
133 'uploader_id': 'TheVerge',
134 'description': 're:^Chris Ziegler takes a look at the\.*',
135 'uploader': 'The Verge',
136 'title': 'First Firefox OS phones side-by-side',
137 },
138 'params': {
139 'skip_download': False,
140 }
141 },
142 {
143 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
144 'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
145 'info_dict': {
146 'id': '13601338388002',
147 'ext': 'mp4',
148 'uploader': 'www.hodiho.fr',
149 'title': 'R\u00e9gis plante sa Jeep',
150 }
151 },
152 # bandcamp page with custom domain
153 {
154 'add_ie': ['Bandcamp'],
155 'url': 'http://bronyrock.com/track/the-pony-mash',
156 'info_dict': {
157 'id': '3235767654',
158 'ext': 'mp3',
159 'title': 'The Pony Mash',
160 'uploader': 'M_Pallante',
161 },
162 'skip': 'There is a limit of 200 free downloads / month for the test song',
163 },
164 # embedded brightcove video
165 # it also tests brightcove videos that need to set the 'Referer' in the
166 # http requests
167 {
168 'add_ie': ['Brightcove'],
169 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
170 'info_dict': {
171 'id': '2765128793001',
172 'ext': 'mp4',
173 'title': 'Le cours de bourse : l’analyse technique',
174 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
175 'uploader': 'BFM BUSINESS',
176 },
177 'params': {
178 'skip_download': True,
179 },
180 },
181 {
182 # https://github.com/rg3/youtube-dl/issues/2253
183 'url': 'http://bcove.me/i6nfkrc3',
184 'md5': '0ba9446db037002366bab3b3eb30c88c',
185 'info_dict': {
186 'id': '3101154703001',
187 'ext': 'mp4',
188 'title': 'Still no power',
189 'uploader': 'thestar.com',
190 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
191 },
192 'add_ie': ['Brightcove'],
193 },
194 {
195 'url': 'http://www.championat.com/video/football/v/87/87499.html',
196 'md5': 'fb973ecf6e4a78a67453647444222983',
197 'info_dict': {
198 'id': '3414141473001',
199 'ext': 'mp4',
200 'title': 'Видео. Удаление Дзагоева (ЦСКА)',
201 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
202 'uploader': 'Championat',
203 },
204 },
205 {
206 # https://github.com/rg3/youtube-dl/issues/3541
207 'add_ie': ['Brightcove'],
208 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
209 'info_dict': {
210 'id': '3866516442001',
211 'ext': 'mp4',
212 'title': 'Leer mij vrouwen kennen: Aflevering 1',
213 'description': 'Leer mij vrouwen kennen: Aflevering 1',
214 'uploader': 'SBS Broadcasting',
215 },
216 'skip': 'Restricted to Netherlands',
217 'params': {
218 'skip_download': True, # m3u8 download
219 },
220 },
221 # ooyala video
222 {
223 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
224 'md5': '166dd577b433b4d4ebfee10b0824d8ff',
225 'info_dict': {
226 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
227 'ext': 'mp4',
228 'title': '2cc213299525360.mov', # that's what we get
229 },
230 'add_ie': ['Ooyala'],
231 },
232 # multiple ooyala embeds on SBN network websites
233 {
234 'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
235 'info_dict': {
236 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
237 'title': '25 lies you will tell yourself on National Signing Day - SBNation.com',
238 },
239 'playlist_mincount': 3,
240 'params': {
241 'skip_download': True,
242 },
243 'add_ie': ['Ooyala'],
244 },
245 # embed.ly video
246 {
247 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
248 'info_dict': {
249 'id': '9ODmcdjQcHQ',
250 'ext': 'mp4',
251 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
252 'upload_date': '20140225',
253 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
254 'uploader': 'Tested',
255 'uploader_id': 'testedcom',
256 },
257 # No need to test YoutubeIE here
258 'params': {
259 'skip_download': True,
260 },
261 },
262 # funnyordie embed
263 {
264 'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
265 'info_dict': {
266 'id': '18e820ec3f',
267 'ext': 'mp4',
268 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
269 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
270 },
271 },
272 # BBC iPlayer embeds
273 {
274 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER',
275 'info_dict': {
276 'title': 'BBC - Blogs - Adam Curtis - BUGGER',
277 },
278 'playlist_mincount': 18,
279 },
280 # RUTV embed
281 {
282 'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
283 'info_dict': {
284 'id': '776940',
285 'ext': 'mp4',
286 'title': 'Охотское море стало целиком российским',
287 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43',
288 },
289 'params': {
290 # m3u8 download
291 'skip_download': True,
292 },
293 },
294 # SportBox embed
295 {
296 'url': 'http://www.vestifinance.ru/articles/25753',
297 'info_dict': {
298 'id': '25753',
299 'title': 'Вести Экономика ― Прямые трансляции с Форума-выставки "Госзаказ-2013"',
300 },
301 'playlist': [{
302 'info_dict': {
303 'id': '370908',
304 'title': 'Госзаказ. День 3',
305 'ext': 'mp4',
306 }
307 }, {
308 'info_dict': {
309 'id': '370905',
310 'title': 'Госзаказ. День 2',
311 'ext': 'mp4',
312 }
313 }, {
314 'info_dict': {
315 'id': '370902',
316 'title': 'Госзаказ. День 1',
317 'ext': 'mp4',
318 }
319 }],
320 'params': {
321 # m3u8 download
322 'skip_download': True,
323 },
324 },
325 # Embedded TED video
326 {
327 'url': 'http://en.support.wordpress.com/videos/ted-talks/',
328 'md5': '65fdff94098e4a607385a60c5177c638',
329 'info_dict': {
330 'id': '1969',
331 'ext': 'mp4',
332 'title': 'Hidden miracles of the natural world',
333 'uploader': 'Louie Schwartzberg',
334 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
335 }
336 },
337 # Embedded Ustream video
338 {
339 'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm',
340 'md5': '27b99cdb639c9b12a79bca876a073417',
341 'info_dict': {
342 'id': '45734260',
343 'ext': 'flv',
344 'uploader': 'AU SPA: The NSA and Privacy',
345 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman'
346 }
347 },
348 # nowvideo embed hidden behind percent encoding
349 {
350 'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/',
351 'md5': '2baf4ddd70f697d94b1c18cf796d5107',
352 'info_dict': {
353 'id': '06e53103ca9aa',
354 'ext': 'flv',
355 'title': 'Macross Episode 001 Watch Macross Episode 001 onl',
356 'description': 'No description',
357 },
358 },
359 # arte embed
360 {
361 'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html',
362 'md5': '7653032cbb25bf6c80d80f217055fa43',
363 'info_dict': {
364 'id': '048195-004_PLUS7-F',
365 'ext': 'flv',
366 'title': 'X:enius',
367 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168',
368 'upload_date': '20140320',
369 },
370 'params': {
371 'skip_download': 'Requires rtmpdump'
372 }
373 },
374 # Condé Nast embed
375 {
376 'url': 'http://www.wired.com/2014/04/honda-asimo/',
377 'md5': 'ba0dfe966fa007657bd1443ee672db0f',
378 'info_dict': {
379 'id': '53501be369702d3275860000',
380 'ext': 'mp4',
381 'title': 'Honda’s New Asimo Robot Is More Human Than Ever',
382 }
383 },
384 # Dailymotion embed
385 {
386 'url': 'http://www.spi0n.com/zap-spi0n-com-n216/',
387 'md5': '441aeeb82eb72c422c7f14ec533999cd',
388 'info_dict': {
389 'id': 'k2mm4bCdJ6CQ2i7c8o2',
390 'ext': 'mp4',
391 'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
392 'uploader': 'Spi0n',
393 },
394 'add_ie': ['Dailymotion'],
395 },
396 # YouTube embed
397 {
398 'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
399 'info_dict': {
400 'id': 'FXRb4ykk4S0',
401 'ext': 'mp4',
402 'title': 'The NBL Auction 2014',
403 'uploader': 'BADMINTON England',
404 'uploader_id': 'BADMINTONEvents',
405 'upload_date': '20140603',
406 'description': 'md5:9ef128a69f1e262a700ed83edb163a73',
407 },
408 'add_ie': ['Youtube'],
409 'params': {
410 'skip_download': True,
411 }
412 },
413 # MTVServices embed
414 {
415 'url': 'http://www.gametrailers.com/news-post/76093/north-america-europe-is-getting-that-mario-kart-8-mercedes-dlc-too',
416 'md5': '35727f82f58c76d996fc188f9755b0d5',
417 'info_dict': {
418 'id': '0306a69b-8adf-4fb5-aace-75f8e8cbfca9',
419 'ext': 'mp4',
420 'title': 'Review',
421 'description': 'Mario\'s life in the fast lane has never looked so good.',
422 },
423 },
424 # YouTube embed via <data-embed-url="">
425 {
426 'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',
427 'info_dict': {
428 'id': '4vAffPZIT44',
429 'ext': 'mp4',
430 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!',
431 'uploader': 'Gameloft',
432 'uploader_id': 'gameloft',
433 'upload_date': '20140828',
434 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4',
435 },
436 'params': {
437 'skip_download': True,
438 }
439 },
440 # Camtasia studio
441 {
442 'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
443 'playlist': [{
444 'md5': '0c5e352edabf715d762b0ad4e6d9ee67',
445 'info_dict': {
446 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
447 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1',
448 'ext': 'flv',
449 'duration': 2235.90,
450 }
451 }, {
452 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63',
453 'info_dict': {
454 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP',
455 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip',
456 'ext': 'flv',
457 'duration': 2235.93,
458 }
459 }],
460 'info_dict': {
461 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
462 }
463 },
464 # Flowplayer
465 {
466 'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html',
467 'md5': '9d65602bf31c6e20014319c7d07fba27',
468 'info_dict': {
469 'id': '5123ea6d5e5a7',
470 'ext': 'mp4',
471 'age_limit': 18,
472 'uploader': 'www.handjobhub.com',
473 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
474 }
475 },
476 # Multiple brightcove videos
477 # https://github.com/rg3/youtube-dl/issues/2283
478 {
479 'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
480 'info_dict': {
481 'id': 'always-never',
482 'title': 'Always / Never - The New Yorker',
483 },
484 'playlist_count': 3,
485 'params': {
486 'extract_flat': False,
487 'skip_download': True,
488 }
489 },
490 # MLB embed
491 {
492 'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/',
493 'md5': '96f09a37e44da40dd083e12d9a683327',
494 'info_dict': {
495 'id': '33322633',
496 'ext': 'mp4',
497 'title': 'Ump changes call to ball',
498 'description': 'md5:71c11215384298a172a6dcb4c2e20685',
499 'duration': 48,
500 'timestamp': 1401537900,
501 'upload_date': '20140531',
502 'thumbnail': 're:^https?://.*\.jpg$',
503 },
504 },
505 # Wistia embed
506 {
507 'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
508 'md5': '8788b683c777a5cf25621eaf286d0c23',
509 'info_dict': {
510 'id': '1cfaf6b7ea',
511 'ext': 'mov',
512 'title': 'md5:51364a8d3d009997ba99656004b5e20d',
513 'duration': 643.0,
514 'filesize': 182808282,
515 'uploader': 'education-portal.com',
516 },
517 },
518 {
519 'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
520 'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
521 'info_dict': {
522 'id': 'uxjb0lwrcz',
523 'ext': 'mp4',
524 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
525 'duration': 1715.0,
526 'uploader': 'thoughtworks.wistia.com',
527 },
528 },
529 # Soundcloud embed
530 {
531 'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
532 'info_dict': {
533 'id': '174391317',
534 'ext': 'mp3',
535 'description': 'md5:ff867d6b555488ad3c52572bb33d432c',
536 'uploader': 'Sophos Security',
537 'title': 'Chet Chat 171 - Oct 29, 2014',
538 'upload_date': '20141029',
539 }
540 },
541 # Livestream embed
542 {
543 'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast',
544 'info_dict': {
545 'id': '67864563',
546 'ext': 'flv',
547 'upload_date': '20141112',
548 'title': 'Rosetta #CometLanding webcast HL 10',
549 }
550 },
551 # LazyYT
552 {
553 'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
554 'info_dict': {
555 'id': '1986',
556 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
557 },
558 'playlist_mincount': 2,
559 },
560 # Cinchcast embed
561 {
562 'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
563 'info_dict': {
564 'id': '7141703',
565 'ext': 'mp3',
566 'upload_date': '20141126',
567 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
568 }
569 },
570 # Cinerama player
571 {
572 'url': 'http://www.abc.net.au/7.30/content/2015/s4164797.htm',
573 'info_dict': {
574 'id': '730m_DandD_1901_512k',
575 'ext': 'mp4',
576 'uploader': 'www.abc.net.au',
577 'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015',
578 }
579 },
580 # embedded viddler video
581 {
582 'url': 'http://deadspin.com/i-cant-stop-watching-john-wall-chop-the-nuggets-with-th-1681801597',
583 'info_dict': {
584 'id': '4d03aad9',
585 'ext': 'mp4',
586 'uploader': 'deadspin',
587 'title': 'WALL-TO-GORTAT',
588 'timestamp': 1422285291,
589 'upload_date': '20150126',
590 },
591 'add_ie': ['Viddler'],
592 },
593 # Libsyn embed
594 {
595 'url': 'http://thedailyshow.cc.com/podcast/episodetwelve',
596 'info_dict': {
597 'id': '3377616',
598 'ext': 'mp3',
599 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart",
600 'description': 'md5:601cb790edd05908957dae8aaa866465',
601 'upload_date': '20150220',
602 },
603 },
604 # jwplayer YouTube
605 {
606 'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/',
607 'info_dict': {
608 'id': 'Mrj4DVp2zeA',
609 'ext': 'mp4',
610 'upload_date': '20150212',
611 'uploader': 'The National Archives UK',
612 'description': 'md5:a236581cd2449dd2df4f93412f3f01c6',
613 'uploader_id': 'NationalArchives08',
614 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue',
615 },
616 },
617 # rtl.nl embed
618 {
619 'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen',
620 'playlist_mincount': 5,
621 'info_dict': {
622 'id': 'aanslagen-kopenhagen',
623 'title': 'Aanslagen Kopenhagen | RTL Nieuws',
624 }
625 },
626 # Zapiks embed
627 {
628 'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html',
629 'info_dict': {
630 'id': '118046',
631 'ext': 'mp4',
632 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !',
633 }
634 },
635 # Kaltura embed
636 {
637 'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15',
638 'info_dict': {
639 'id': '1_eergr3h1',
640 'ext': 'mp4',
641 'upload_date': '20150226',
642 'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com',
643 'timestamp': int,
644 'title': 'John Carlson Postgame 2/25/15',
645 },
646 },
647 # Eagle.Platform embed (generic URL)
648 {
649 'url': 'http://lenta.ru/news/2015/03/06/navalny/',
650 'info_dict': {
651 'id': '227304',
652 'ext': 'mp4',
653 'title': 'Навальный вышел на свободу',
654 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
655 'thumbnail': 're:^https?://.*\.jpg$',
656 'duration': 87,
657 'view_count': int,
658 'age_limit': 0,
659 },
660 },
661 # ClipYou (Eagle.Platform) embed (custom URL)
662 {
663 'url': 'http://muz-tv.ru/play/7129/',
664 'info_dict': {
665 'id': '12820',
666 'ext': 'mp4',
667 'title': "'O Sole Mio",
668 'thumbnail': 're:^https?://.*\.jpg$',
669 'duration': 216,
670 'view_count': int,
671 },
672 },
673 # Pladform embed
674 {
675 'url': 'http://muz-tv.ru/kinozal/view/7400/',
676 'info_dict': {
677 'id': '100183293',
678 'ext': 'mp4',
679 'title': 'Тайны перевала Дятлова • 1 серия 2 часть',
680 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
681 'thumbnail': 're:^https?://.*\.jpg$',
682 'duration': 694,
683 'age_limit': 0,
684 },
685 },
686 # Playwire embed
687 {
688 'url': 'http://www.cinemablend.com/new/First-Joe-Dirt-2-Trailer-Teaser-Stupid-Greatness-70874.html',
689 'info_dict': {
690 'id': '3519514',
691 'ext': 'mp4',
692 'title': 'Joe Dirt 2 Beautiful Loser Teaser Trailer',
693 'thumbnail': 're:^https?://.*\.png$',
694 'duration': 45.115,
695 },
696 },
697 # 5min embed
698 {
699 'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/',
700 'md5': '4c6f127a30736b59b3e2c19234ee2bf7',
701 'info_dict': {
702 'id': '518726732',
703 'ext': 'mp4',
704 'title': 'Facebook Creates "On This Day" | Crunch Report',
705 },
706 },
707 # SVT embed
708 {
709 'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun',
710 'info_dict': {
711 'id': '2900353',
712 'ext': 'flv',
713 'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)',
714 'duration': 27,
715 'age_limit': 0,
716 },
717 },
718 # Crooks and Liars embed
719 {
720 'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists',
721 'info_dict': {
722 'id': '8RUoRhRi',
723 'ext': 'mp4',
724 'title': "Fox & Friends Says Protecting Atheists From Discrimination Is Anti-Christian!",
725 'description': 'md5:e1a46ad1650e3a5ec7196d432799127f',
726 'timestamp': 1428207000,
727 'upload_date': '20150405',
728 'uploader': 'Heather',
729 },
730 },
731 # Crooks and Liars external embed
732 {
733 'url': 'http://theothermccain.com/2010/02/02/video-proves-that-bill-kristol-has-been-watching-glenn-beck/comment-page-1/',
734 'info_dict': {
735 'id': 'MTE3MjUtMzQ2MzA',
736 'ext': 'mp4',
737 'title': 'md5:5e3662a81a4014d24c250d76d41a08d5',
738 'description': 'md5:9b8e9542d6c3c5de42d6451b7d780cec',
739 'timestamp': 1265032391,
740 'upload_date': '20100201',
741 'uploader': 'Heather',
742 },
743 },
744 # NBC Sports vplayer embed
745 {
746 'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a',
747 'info_dict': {
748 'id': 'ln7x1qSThw4k',
749 'ext': 'flv',
750 'title': "PFT Live: New leader in the 'new-look' defense",
751 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e',
752 },
753 },
754 # UDN embed
755 {
756 'url': 'http://www.udn.com/news/story/7314/822787',
757 'md5': 'fd2060e988c326991037b9aff9df21a6',
758 'info_dict': {
759 'id': '300346',
760 'ext': 'mp4',
761 'title': '中一中男師變性 全校師生力挺',
762 'thumbnail': 're:^https?://.*\.jpg$',
763 }
764 },
765 # Ooyala embed
766 {
767 'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T',
768 'info_dict': {
769 'id': '50YnY4czr4ms1vJ7yz3xzq0excz_pUMs',
770 'ext': 'mp4',
771 'description': 'VIDEO: Index/Match versus VLOOKUP.',
772 'title': 'This is what separates the Excel masters from the wannabes',
773 },
774 'params': {
775 # m3u8 downloads
776 'skip_download': True,
777 }
778 },
779 # Contains a SMIL manifest
780 {
781 'url': 'http://www.telewebion.com/fa/1263668/%D9%82%D8%B1%D8%B9%D9%87%E2%80%8C%DA%A9%D8%B4%DB%8C-%D9%84%DB%8C%DA%AF-%D9%82%D9%87%D8%B1%D9%85%D8%A7%D9%86%D8%A7%D9%86-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7/%2B-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84.html',
782 'info_dict': {
783 'id': 'file',
784 'ext': 'flv',
785 'title': '+ Football: Lottery Champions League Europe',
786 'uploader': 'www.telewebion.com',
787 },
788 'params': {
789 # rtmpe downloads
790 'skip_download': True,
791 }
792 }
793 ]
794
def report_following_redirect(self, new_url):
    """Print a '[redirect]' notice naming the URL that is being followed."""
    message = '[redirect] Following redirect to %s' % new_url
    self._downloader.to_screen(message)
798
799 def _extract_rss(self, url, video_id, doc):
800 playlist_title = doc.find('./channel/title').text
801 playlist_desc_el = doc.find('./channel/description')
802 playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
803
804 entries = []
805 for it in doc.findall('./channel/item'):
806 next_url = xpath_text(it, 'link', fatal=False)
807 if not next_url:
808 enclosure_nodes = it.findall('./enclosure')
809 for e in enclosure_nodes:
810 next_url = e.attrib.get('url')
811 if next_url:
812 break
813
814 if not next_url:
815 continue
816
817 entries.append({
818 '_type': 'url',
819 'url': next_url,
820 'title': it.find('title').text,
821 })
822
823 return {
824 '_type': 'playlist',
825 'id': url,
826 'title': playlist_title,
827 'description': playlist_desc,
828 'entries': entries,
829 }
830
831 def _extract_camtasia(self, url, video_id, webpage):
832 """ Returns None if no camtasia video can be found. """
833
834 camtasia_cfg = self._search_regex(
835 r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);',
836 webpage, 'camtasia configuration file', default=None)
837 if camtasia_cfg is None:
838 return None
839
840 title = self._html_search_meta('DC.title', webpage, fatal=True)
841
842 camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg)
843 camtasia_cfg = self._download_xml(
844 camtasia_url, video_id,
845 note='Downloading camtasia configuration',
846 errnote='Failed to download camtasia configuration')
847 fileset_node = camtasia_cfg.find('./playlist/array/fileset')
848
849 entries = []
850 for n in fileset_node.getchildren():
851 url_n = n.find('./uri')
852 if url_n is None:
853 continue
854
855 entries.append({
856 'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
857 'title': '%s - %s' % (title, n.tag),
858 'url': compat_urlparse.urljoin(url, url_n.text),
859 'duration': float_or_none(n.find('./duration').text),
860 })
861
862 return {
863 '_type': 'playlist',
864 'entries': entries,
865 'title': title,
866 }
867
868 def _real_extract(self, url):
869 if url.startswith('//'):
870 return {
871 '_type': 'url',
872 'url': self.http_scheme() + url,
873 }
874
875 parsed_url = compat_urlparse.urlparse(url)
876 if not parsed_url.scheme:
877 default_search = self._downloader.params.get('default_search')
878 if default_search is None:
879 default_search = 'fixup_error'
880
881 if default_search in ('auto', 'auto_warning', 'fixup_error'):
882 if '/' in url:
883 self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
884 return self.url_result('http://' + url)
885 elif default_search != 'fixup_error':
886 if default_search == 'auto_warning':
887 if re.match(r'^(?:url|URL)$', url):
888 raise ExtractorError(
889 'Invalid URL: %r . Call youtube-dl like this: youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc" ' % url,
890 expected=True)
891 else:
892 self._downloader.report_warning(
893 'Falling back to youtube search for %s . Set --default-search "auto" to suppress this warning.' % url)
894 return self.url_result('ytsearch:' + url)
895
896 if default_search in ('error', 'fixup_error'):
897 raise ExtractorError(
898 '%r is not a valid URL. '
899 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube'
900 % (url, url), expected=True)
901 else:
902 if ':' not in default_search:
903 default_search += ':'
904 return self.url_result(default_search + url)
905
906 url, smuggled_data = unsmuggle_url(url)
907 force_videoid = None
908 is_intentional = smuggled_data and smuggled_data.get('to_generic')
909 if smuggled_data and 'force_videoid' in smuggled_data:
910 force_videoid = smuggled_data['force_videoid']
911 video_id = force_videoid
912 else:
913 video_id = compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
914
915 self.to_screen('%s: Requesting header' % video_id)
916
917 head_req = HEADRequest(url)
918 head_response = self._request_webpage(
919 head_req, video_id,
920 note=False, errnote='Could not send HEAD request to %s' % url,
921 fatal=False)
922
923 if head_response is not False:
924 # Check for redirect
925 new_url = head_response.geturl()
926 if url != new_url:
927 self.report_following_redirect(new_url)
928 if force_videoid:
929 new_url = smuggle_url(
930 new_url, {'force_videoid': force_videoid})
931 return self.url_result(new_url)
932
933 full_response = None
934 if head_response is False:
935 request = compat_urllib_request.Request(url)
936 request.add_header('Accept-Encoding', '*')
937 full_response = self._request_webpage(request, video_id)
938 head_response = full_response
939
940 # Check for direct link to a video
941 content_type = head_response.headers.get('Content-Type', '')
942 m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
943 if m:
944 upload_date = unified_strdate(
945 head_response.headers.get('Last-Modified'))
946 return {
947 'id': video_id,
948 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
949 'direct': True,
950 'formats': [{
951 'format_id': m.group('format_id'),
952 'url': url,
953 'vcodec': 'none' if m.group('type') == 'audio' else None
954 }],
955 'upload_date': upload_date,
956 }
957
958 if not self._downloader.params.get('test', False) and not is_intentional:
959 self._downloader.report_warning('Falling back on generic information extractor.')
960
961 if not full_response:
962 request = compat_urllib_request.Request(url)
963 # Some webservers may serve compressed content of rather big size (e.g. gzipped flac)
964 # making it impossible to download only chunk of the file (yet we need only 512kB to
965 # test whether it's HTML or not). According to youtube-dl default Accept-Encoding
966 # that will always result in downloading the whole file that is not desirable.
967 # Therefore for extraction pass we have to override Accept-Encoding to any in order
968 # to accept raw bytes and being able to download only a chunk.
969 # It may probably better to solve this by checking Content-Type for application/octet-stream
970 # after HEAD request finishes, but not sure if we can rely on this.
971 request.add_header('Accept-Encoding', '*')
972 full_response = self._request_webpage(request, video_id)
973
974 # Maybe it's a direct link to a video?
975 # Be careful not to download the whole thing!
976 first_bytes = full_response.read(512)
977 if not is_html(first_bytes):
978 self._downloader.report_warning(
979 'URL could be a direct video link, returning it as such.')
980 upload_date = unified_strdate(
981 head_response.headers.get('Last-Modified'))
982 return {
983 'id': video_id,
984 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
985 'direct': True,
986 'url': url,
987 'upload_date': upload_date,
988 }
989
990 webpage = self._webpage_read_content(
991 full_response, url, video_id, prefix=first_bytes)
992
993 self.report_extraction(video_id)
994
995 # Is it an RSS feed?
996 try:
997 doc = parse_xml(webpage)
998 if doc.tag == 'rss':
999 return self._extract_rss(url, video_id, doc)
1000 except compat_xml_parse_error:
1001 pass
1002
1003 # Is it a Camtasia project?
1004 camtasia_res = self._extract_camtasia(url, video_id, webpage)
1005 if camtasia_res is not None:
1006 return camtasia_res
1007
1008 # Sometimes embedded video player is hidden behind percent encoding
1009 # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
1010 # Unescaping the whole page allows to handle those cases in a generic way
1011 webpage = compat_urllib_parse.unquote(webpage)
1012
1013 # it's tempting to parse this further, but you would
1014 # have to take into account all the variations like
1015 # Video Title - Site Name
1016 # Site Name | Video Title
1017 # Video Title - Tagline | Site Name
1018 # and so on and so forth; it's just not practical
1019 video_title = self._html_search_regex(
1020 r'(?s)<title>(.*?)</title>', webpage, 'video title',
1021 default='video')
1022
1023 # Try to detect age limit automatically
1024 age_limit = self._rta_search(webpage)
1025 # And then there are the jokers who advertise that they use RTA,
1026 # but actually don't.
1027 AGE_LIMIT_MARKERS = [
1028 r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>',
1029 ]
1030 if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
1031 age_limit = 18
1032
1033 # video uploader is domain name
1034 video_uploader = self._search_regex(
1035 r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
1036
1037 # Helper method
def _playlist_from_matches(matches, getter=None, ie=None):
    """Build a playlist result out of a sequence of regex matches.

    matches -- iterable of matched items (plain URLs, or anything
               ``getter`` knows how to turn into a URL).
    getter  -- optional callable mapping one match to its URL; when it
               is falsy the match itself is used as the URL.
    ie      -- optional extractor key handed to ``self.url_result``.

    The playlist id and title come from the enclosing scope's
    ``video_id`` and ``video_title``.
    """
    def _entry_for(match):
        # Resolve the match to a URL, expand protocol-relative URLs,
        # then wrap it as a url result for the requested extractor.
        raw_url = getter(match) if getter else match
        return self.url_result(self._proto_relative_url(raw_url), ie)

    # orderedSet presumably drops duplicate entries while keeping
    # first-seen order -- confirm against ..utils.
    unique_entries = orderedSet(_entry_for(match) for match in matches)
    return self.playlist_result(
        unique_entries, playlist_id=video_id, playlist_title=video_title)
1044
1045 # Look for BrightCove:
1046 bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
1047 if bc_urls:
1048 self.to_screen('Brightcove video detected.')
1049 entries = [{
1050 '_type': 'url',
1051 'url': smuggle_url(bc_url, {'Referer': url}),
1052 'ie_key': 'Brightcove'
1053 } for bc_url in bc_urls]
1054
1055 return {
1056 '_type': 'playlist',
1057 'title': video_title,
1058 'id': video_id,
1059 'entries': entries,
1060 }
1061
1062 # Look for embedded rtl.nl player
1063 matches = re.findall(
1064 r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+video_embed[^"]+)"',
1065 webpage)
1066 if matches:
1067 return _playlist_from_matches(matches, ie='RtlNl')
1068
1069 # Look for embedded (iframe) Vimeo player
1070 mobj = re.search(
1071 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
1072 if mobj:
1073 player_url = unescapeHTML(mobj.group('url'))
1074 surl = smuggle_url(player_url, {'Referer': url})
1075 return self.url_result(surl)
1076 # Look for embedded (swf embed) Vimeo player
1077 mobj = re.search(
1078 r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
1079 if mobj:
1080 return self.url_result(mobj.group(1))
1081
1082 # Look for embedded YouTube player
1083 matches = re.findall(r'''(?x)
1084 (?:
1085 <iframe[^>]+?src=|
1086 data-video-url=|
1087 <embed[^>]+?src=|
1088 embedSWF\(?:\s*|
1089 new\s+SWFObject\(
1090 )
1091 (["\'])
1092 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
1093 (?:embed|v|p)/.+?)
1094 \1''', webpage)
1095 if matches:
1096 return _playlist_from_matches(
1097 matches, lambda m: unescapeHTML(m[1]))
1098
1099 # Look for lazyYT YouTube embed
1100 matches = re.findall(
1101 r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
1102 if matches:
1103 return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
1104
1105 # Look for embedded Dailymotion player
1106 matches = re.findall(
1107 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
1108 if matches:
1109 return _playlist_from_matches(
1110 matches, lambda m: unescapeHTML(m[1]))
1111
1112 # Look for embedded Dailymotion playlist player (#3822)
1113 m = re.search(
1114 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
1115 if m:
1116 playlists = re.findall(
1117 r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
1118 if playlists:
1119 return _playlist_from_matches(
1120 playlists, lambda p: '//dailymotion.com/playlist/%s' % p)
1121
1122 # Look for embedded Wistia player
1123 match = re.search(
1124 r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
1125 if match:
1126 embed_url = self._proto_relative_url(
1127 unescapeHTML(match.group('url')))
1128 return {
1129 '_type': 'url_transparent',
1130 'url': embed_url,
1131 'ie_key': 'Wistia',
1132 'uploader': video_uploader,
1133 'title': video_title,
1134 'id': video_id,
1135 }
1136
1137 match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
1138 if match:
1139 return {
1140 '_type': 'url_transparent',
1141 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')),
1142 'ie_key': 'Wistia',
1143 'uploader': video_uploader,
1144 'title': video_title,
1145 'id': match.group('id')
1146 }
1147
1148 # Look for embedded blip.tv player
1149 bliptv_url = BlipTVIE._extract_url(webpage)
1150 if bliptv_url:
1151 return self.url_result(bliptv_url, 'BlipTV')
1152
1153 # Look for SVT player
1154 svt_url = SVTIE._extract_url(webpage)
1155 if svt_url:
1156 return self.url_result(svt_url, 'SVT')
1157
1158 # Look for embedded condenast player
1159 matches = re.findall(
1160 r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
1161 webpage)
1162 if matches:
1163 return {
1164 '_type': 'playlist',
1165 'entries': [{
1166 '_type': 'url',
1167 'ie_key': 'CondeNast',
1168 'url': ma,
1169 } for ma in matches],
1170 'title': video_title,
1171 'id': video_id,
1172 }
1173
1174 # Look for Bandcamp pages with custom domain
1175 mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
1176 if mobj is not None:
1177 burl = unescapeHTML(mobj.group(1))
1178 # Don't set the extractor because it can be a track url or an album
1179 return self.url_result(burl)
1180
1181 # Look for embedded Vevo player
1182 mobj = re.search(
1183 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
1184 if mobj is not None:
1185 return self.url_result(mobj.group('url'))
1186
1187 # Look for embedded Viddler player
1188 mobj = re.search(
1189 r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1',
1190 webpage)
1191 if mobj is not None:
1192 return self.url_result(mobj.group('url'))
1193
1194 # Look for NYTimes player
1195 mobj = re.search(
1196 r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>',
1197 webpage)
1198 if mobj is not None:
1199 return self.url_result(mobj.group('url'))
1200
1201 # Look for Libsyn player
1202 mobj = re.search(
1203 r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage)
1204 if mobj is not None:
1205 return self.url_result(mobj.group('url'))
1206
1207 # Look for Ooyala videos
1208 mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
1209 re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
1210 re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or
1211 re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
1212 if mobj is not None:
1213 return OoyalaIE._build_url_result(mobj.group('ec'))
1214
1215 # Look for multiple Ooyala embeds on SBN network websites
1216 mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage)
1217 if mobj is not None:
1218 embeds = self._parse_json(mobj.group(1), video_id, fatal=False)
1219 if embeds:
1220 return _playlist_from_matches(
1221 embeds, getter=lambda v: OoyalaIE._url_for_embed_code(v['provider_video_id']), ie='Ooyala')
1222
1223 # Look for Aparat videos
1224 mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
1225 if mobj is not None:
1226 return self.url_result(mobj.group(1), 'Aparat')
1227
1228 # Look for MPORA videos
1229 mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)
1230 if mobj is not None:
1231 return self.url_result(mobj.group(1), 'Mpora')
1232
1233 # Look for embedded NovaMov-based player
1234 mobj = re.search(
1235 r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])
1236 (?P<url>http://(?:(?:embed|www)\.)?
1237 (?:novamov\.com|
1238 nowvideo\.(?:ch|sx|eu|at|ag|co)|
1239 videoweed\.(?:es|com)|
1240 movshare\.(?:net|sx|ag)|
1241 divxstage\.(?:eu|net|ch|co|at|ag))
1242 /embed\.php.+?)\1''', webpage)
1243 if mobj is not None:
1244 return self.url_result(mobj.group('url'))
1245
1246 # Look for embedded Facebook player
1247 mobj = re.search(
1248 r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
1249 if mobj is not None:
1250 return self.url_result(mobj.group('url'), 'Facebook')
1251
1252 # Look for embedded VK player
1253 mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
1254 if mobj is not None:
1255 return self.url_result(mobj.group('url'), 'VK')
1256
1257 # Look for embedded ivi player
1258 mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
1259 if mobj is not None:
1260 return self.url_result(mobj.group('url'), 'Ivi')
1261
1262 # Look for embedded Huffington Post player
1263 mobj = re.search(
1264 r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
1265 if mobj is not None:
1266 return self.url_result(mobj.group('url'), 'HuffPost')
1267
1268 # Look for embed.ly
1269 mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
1270 if mobj is not None:
1271 return self.url_result(mobj.group('url'))
1272 mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
1273 if mobj is not None:
1274 return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
1275
1276 # Look for funnyordie embed
1277 matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
1278 if matches:
1279 return _playlist_from_matches(
1280 matches, getter=unescapeHTML, ie='FunnyOrDie')
1281
1282 # Look for BBC iPlayer embed
1283 matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
1284 if matches:
1285 return _playlist_from_matches(matches, ie='BBCCoUk')
1286
1287 # Look for embedded RUTV player
1288 rutv_url = RUTVIE._extract_url(webpage)
1289 if rutv_url:
1290 return self.url_result(rutv_url, 'RUTV')
1291
1292 # Look for embedded SportBox player
1293 sportbox_urls = SportBoxEmbedIE._extract_urls(webpage)
1294 if sportbox_urls:
1295 return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed')
1296
1297 # Look for embedded TED player
1298 mobj = re.search(
1299 r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
1300 if mobj is not None:
1301 return self.url_result(mobj.group('url'), 'TED')
1302
1303 # Look for embedded Ustream videos
1304 mobj = re.search(
1305 r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
1306 if mobj is not None:
1307 return self.url_result(mobj.group('url'), 'Ustream')
1308
1309 # Look for embedded arte.tv player
1310 mobj = re.search(
1311 r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"',
1312 webpage)
1313 if mobj is not None:
1314 return self.url_result(mobj.group('url'), 'ArteTVEmbed')
1315
1316 # Look for embedded smotri.com player
1317 smotri_url = SmotriIE._extract_url(webpage)
1318 if smotri_url:
1319 return self.url_result(smotri_url, 'Smotri')
1320
1321 # Look for embedded soundcloud player
1322 mobj = re.search(
1323 r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
1324 webpage)
1325 if mobj is not None:
1326 url = unescapeHTML(mobj.group('url'))
1327 return self.url_result(url)
1328
1329 # Look for embedded vulture.com player
1330 mobj = re.search(
1331 r'<iframe src="(?P<url>https?://video\.vulture\.com/[^"]+)"',
1332 webpage)
1333 if mobj is not None:
1334 url = unescapeHTML(mobj.group('url'))
1335 return self.url_result(url, ie='Vulture')
1336
1337 # Look for embedded mtvservices player
1338 mobj = re.search(
1339 r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"',
1340 webpage)
1341 if mobj is not None:
1342 url = unescapeHTML(mobj.group('url'))
1343 return self.url_result(url, ie='MTVServicesEmbedded')
1344
1345 # Look for embedded yahoo player
1346 mobj = re.search(
1347 r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1',
1348 webpage)
1349 if mobj is not None:
1350 return self.url_result(mobj.group('url'), 'Yahoo')
1351
1352 # Look for embedded sbs.com.au player
1353 mobj = re.search(
1354 r'''(?x)
1355 (?:
1356 <meta\s+property="og:video"\s+content=|
1357 <iframe[^>]+?src=
1358 )
1359 (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''',
1360 webpage)
1361 if mobj is not None:
1362 return self.url_result(mobj.group('url'), 'SBS')
1363
1364 # Look for embedded Cinchcast player
1365 mobj = re.search(
1366 r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
1367 webpage)
1368 if mobj is not None:
1369 return self.url_result(mobj.group('url'), 'Cinchcast')
1370
1371 mobj = re.search(
1372 r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
1373 webpage)
1374 if not mobj:
1375 mobj = re.search(
1376 r'data-video-link=["\'](?P<url>http://m.mlb.com/video/[^"\']+)',
1377 webpage)
1378 if mobj is not None:
1379 return self.url_result(mobj.group('url'), 'MLB')
1380
1381 mobj = re.search(
1382 r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
1383 webpage)
1384 if mobj is not None:
1385 return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
1386
1387 mobj = re.search(
1388 r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"',
1389 webpage)
1390 if mobj is not None:
1391 return self.url_result(mobj.group('url'), 'Livestream')
1392
1393 # Look for Zapiks embed
1394 mobj = re.search(
1395 r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage)
1396 if mobj is not None:
1397 return self.url_result(mobj.group('url'), 'Zapiks')
1398
1399 # Look for Kaltura embeds
1400 mobj = re.search(
1401 r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage)
1402 if mobj is not None:
1403 return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura')
1404
1405 # Look for Eagle.Platform embeds
1406 mobj = re.search(
1407 r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage)
1408 if mobj is not None:
1409 return self.url_result(mobj.group('url'), 'EaglePlatform')
1410
1411 # Look for ClipYou (uses Eagle.Platform) embeds
1412 mobj = re.search(
1413 r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
1414 if mobj is not None:
1415 return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform')
1416
1417 # Look for Pladform embeds
1418 mobj = re.search(
1419 r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage)
1420 if mobj is not None:
1421 return self.url_result(mobj.group('url'), 'Pladform')
1422
1423 # Look for Playwire embeds
1424 mobj = re.search(
1425 r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1', webpage)
1426 if mobj is not None:
1427 return self.url_result(mobj.group('url'))
1428
1429 # Look for 5min embeds
1430 mobj = re.search(
1431 r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage)
1432 if mobj is not None:
1433 return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin')
1434
1435 # Look for Crooks and Liars embeds
1436 mobj = re.search(
1437 r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1', webpage)
1438 if mobj is not None:
1439 return self.url_result(mobj.group('url'))
1440
1441 # Look for NBC Sports VPlayer embeds
1442 nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
1443 if nbc_sports_url:
1444 return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
1445
1446 # Look for UDN embeds
1447 mobj = re.search(
1448 r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._VALID_URL, webpage)
1449 if mobj is not None:
1450 return self.url_result(
1451 compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed')
1452
1453 # Look for Senate ISVP iframe
1454 senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
1455 if senate_isvp_url:
1456 return self.url_result(senate_isvp_url, 'SenateISVP')
1457
def check_video(vurl):
    """Return True when ``vurl`` looks like a playable video URL.

    A URL is accepted outright if the YouTube extractor considers it
    suitable.  Otherwise its path must contain a dot and its extension
    must not be one of the known non-video ones (flash, images,
    subtitle formats).
    """
    if YoutubeIE.suitable(vurl):
        return True

    url_path = compat_urlparse.urlparse(vurl).path
    extension = determine_ext(url_path)
    if '.' not in url_path:
        # No extension-like component in the path at all.
        return False
    return extension not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml')
1464
def filter_video(urls):
    """Return a list of the entries of ``urls`` accepted by check_video, in order."""
    return [candidate for candidate in urls if check_video(candidate)]
1467
1468 # Start with something easy: JW Player in SWFObject
1469 found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
1470 if not found:
1471 # Look for gorilla-vid style embedding
1472 found = filter_video(re.findall(r'''(?sx)
1473 (?:
1474 jw_plugins|
1475 JWPlayerOptions|
1476 jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
1477 )
1478 .*?
1479 ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage))
1480 if not found:
1481 # Broaden the search a little bit
1482 found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
1483 if not found:
1484 # Broaden the findall a little bit: JWPlayer JS loader
1485 found = filter_video(re.findall(
1486 r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
1487 if not found:
1488 # Flow player
1489 found = filter_video(re.findall(r'''(?xs)
1490 flowplayer\("[^"]+",\s*
1491 \{[^}]+?\}\s*,
1492 \s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s*
1493 ["']?url["']?\s*:\s*["']([^"']+)["']
1494 ''', webpage))
1495 if not found:
1496 # Cinerama player
1497 found = re.findall(
1498 r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage)
1499 if not found:
1500 # Try to find twitter cards info
1501 found = filter_video(re.findall(
1502 r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
1503 if not found:
1504 # We look for Open Graph info:
1505 # We have to match any number of spaces between elements; some sites try to align them (e.g. statigr.am)
1506 m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
1507 # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
1508 if m_video_type is not None:
1509 found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
1510 if not found:
1511 # HTML5 video
1512 found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
1513 if not found:
1514 REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
1515 found = re.search(
1516 r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
1517 r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,
1518 webpage)
1519 if not found:
1520 # Look also in Refresh HTTP header
1521 refresh_header = head_response.headers.get('Refresh')
1522 if refresh_header:
1523 found = re.search(REDIRECT_REGEX, refresh_header)
1524 if found:
1525 new_url = compat_urlparse.urljoin(url, found.group(1))
1526 self.report_following_redirect(new_url)
1527 return {
1528 '_type': 'url',
1529 'url': new_url,
1530 }
1531 if not found:
1532 raise UnsupportedError(url)
1533
1534 entries = []
1535 for video_url in found:
1536 video_url = compat_urlparse.urljoin(url, video_url)
1537 video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
1538
1539 # Sometimes, jwplayer extraction will result in a YouTube URL
1540 if YoutubeIE.suitable(video_url):
1541 entries.append(self.url_result(video_url, 'Youtube'))
1542 continue
1543
1544 # here's a fun little line of code for you:
1545 video_id = os.path.splitext(video_id)[0]
1546
1547 if determine_ext(video_url) == 'smil':
1548 entries.append({
1549 'id': video_id,
1550 'formats': self._extract_smil_formats(video_url, video_id),
1551 'uploader': video_uploader,
1552 'title': video_title,
1553 'age_limit': age_limit,
1554 })
1555 else:
1556 entries.append({
1557 'id': video_id,
1558 'url': video_url,
1559 'uploader': video_uploader,
1560 'title': video_title,
1561 'age_limit': age_limit,
1562 })
1563
1564 if len(entries) == 1:
1565 return entries[0]
1566 else:
1567 for num, e in enumerate(entries, start=1):
1568 # 'url' results don't have a title
1569 if e.get('title') is not None:
1570 e['title'] = '%s (%d)' % (e['title'], num)
1571 return {
1572 '_type': 'playlist',
1573 'entries': entries,
1574 }