# Exported from the youtube-dl Git repository (gitweb view of youtube_dl/InfoExtractors.py).
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor turns a URL into one or more dictionaries
    describing the video(s) behind it; those dictionaries are then handed
    to the FileDownloader, which may download the media, print metadata,
    and so on.

    Mandatory dictionary fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional fields:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses re-define _real_initialize() and _real_extract() and provide
    a _VALID_URL regexp; they should usually also be added to the list of
    extractors.  _real_extract() must return a *list* of the dictionaries
    described above.  Broken extractors set _WORKING to False so that users
    get a warning and the tests skip them.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Create the extractor, optionally attaching *downloader*."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Return True when *url* is handled by this extractor."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Return the _WORKING flag of this extractor class."""
        return cls._WORKING

    def initialize(self):
        """Run the one-time initialization (authentication, ...) if pending."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then return the list of info dictionaries."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach *downloader* as the FileDownloader used by this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # The class name minus the trailing "IE" suffix.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Open *url_or_request* and return the response handle.

        Raises ExtractorError (with the original traceback) on network
        failures.  note=False suppresses the progress message entirely.
        """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            message = u'Unable to download webpage' if errnote is None else errnote
            raise ExtractorError(u'%s: %s' % (message, compat_str(err)), sys.exc_info()[2])

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """Return a tuple (page content as a unicode string, URL handle)."""
        handle = self._request_webpage(url_or_request, video_id, note, errnote)
        # Honour the charset advertised in Content-Type, defaulting to UTF-8.
        charset_match = re.match(
            r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)',
            handle.headers.get('Content-Type', ''))
        encoding = charset_match.group(1) if charset_match else 'utf-8'
        raw_page = handle.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # A plain string URL was passed in rather than a Request.
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            self._downloader.to_screen(base64.b64encode(raw_page).decode('ascii'))
        return (raw_page.decode(encoding, 'replace'), handle)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Return just the page content as a string (drop the handle)."""
        content, _ = self._download_webpage_handle(url_or_request, video_id, note, errnote)
        return content

    def to_screen(self, msg):
        """Print *msg* to screen, prefixing it with '[ie_name]'."""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    # Helpers for #608: they tag the result dict with the correct '_type'.
    def video_result(self, video_info):
        """Tag *video_info* as a plain video result and return it."""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Return a reference to a page that should be processed later."""
        #TODO: ie should be the class used for getting the info
        return {'_type': 'url',
                'url': url,
                'ie_key': ie}

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Wrap *entries* in a playlist result dict, with optional id/title."""
        playlist = {'_type': 'playlist',
                    'entries': entries}
        if playlist_id:
            playlist['id'] = playlist_id
        if playlist_title:
            playlist['title'] = playlist_title
        return playlist
190
191
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regexp (must be matched with re.VERBOSE): accepts naked video
    # ids, youtu.be short links, embed/v/e paths and watch URLs in all their
    # anchor/#! variants.  Group 2 is the video id (see _extract_id).
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Forces the site language to English so the scraping regexps match.
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> file extension; anything missing here is assumed to be flv.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> 'heightxwidth' string, used for format listings.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL, so YoutubePlaylistIE takes
        # precedence.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report the check for available subtitle tracks."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download one subtitle track."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return a dict mapping lang_code -> track name.

        On failure returns a (error_message, None) tuple instead — callers
        distinguish the two cases with an isinstance(..., tuple) check.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the list of available subtitle languages for *video_id*."""
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Download one subtitle track.

        Return tuple:
        (error_message, sub_lang, sub)
        where error_message is None on success.
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Download the single requested (or default) subtitle track.

        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Language preference: user choice > English > first available.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track; see _request_subtitle."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print the itag, extension and dimensions of each format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the site language, then optionally log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Hidden form tokens required by the Google login form.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'dsh': dsh,
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        # NOTE(review): unlike login_data above, this POST body is not
        # .encode()d to bytes — verify this works on Python 3's urlopen.
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        """Return the video id from *url*, or raise ExtractorError."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # Group 2 of _VALID_URL is the video id (group 1 is the url prefix).
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Extract info dicts (one per requested format) for a watch URL."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try the 'el' parameter variants in turn until one
        # of them yields a response containing a 'token'.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
            else:
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            raise ExtractorError(u'Unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            # Fall back on the meta description tag.
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Build itag -> download url from the comma-separated stream map.
            url_map = {}
            for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    # NOTE(review): assumes 'sig' is always present alongside
                    # 'url' — a stream entry without it raises KeyError here;
                    # TODO confirm whether unsigned entries can occur.
                    url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
                    if not 'ratebypass' in url: url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
        else:
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
677
678
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com.

    Handles the family-filter disclaimer during initialization and extracts
    the media URL, title and uploader from a watch page.  YouTube-hosted
    videos (ids of the form "yt-<id>") are delegated to YoutubeIE via
    url_result().
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the disclaimer page and POST the family-filter form.

        Raises ExtractorError when either request fails.
        """
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Return a one-element list with the info dict for *url*."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Newer pages carry the media info in the flashvars form field.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                raise ExtractorError(u'Unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # BUGFIX: the webpage is already a unicode string (it comes from
        # _download_webpage), so the old .decode('utf-8') calls on the
        # extracted values would raise AttributeError on Python 3 (str has
        # no decode method).  The values are returned as-is instead.
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
774
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for *url*."""
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The id is the slug up to the first '_' suffix or query string.
        video_id = url_match.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page with the family filter disabled.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        flashvars_match = re.search(r'\s*var flashvars = (.*)', webpage)
        if flashvars_match is None:
            raise ExtractorError(u'Unable to extract media URL')
        flash_vars = compat_urllib_parse.unquote(flashvars_match.group(1))

        # Pick the best quality present, scanning from highest to lowest.
        max_quality = next(
            (key for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']
             if key in flash_vars),
            None)
        if max_quality is None:
            raise ExtractorError(u'Unable to extract video URL')
        self.to_screen(u'Using %s' % max_quality)

        url_match = re.search(r'"' + max_quality + r'":"(.+?)"', flash_vars)
        if url_match is None:
            raise ExtractorError(u'Unable to extract video URL')

        video_url = compat_urllib_parse.unquote(url_match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        title_match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(title_match.group('title'))

        # Uploader: prefer the regular owner span; fall back to official users.
        video_uploader = None
        owner_match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_match is not None:
            video_uploader = owner_match.group(1)
        else:
            # lookin for official user
            official_match = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_match is not None:
                video_uploader = official_match.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # Upload date: the page shows DD-MM-YYYY; normalize to YYYYMMDD.
        video_upload_date = None
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is not None:
            video_upload_date = ''.join((date_match.group(3), date_match.group(2), date_match.group(1)))

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
849
850
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extracion process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Extract video info from a photobucket.com URL.

        Tries the JSON media object embedded in the page javascript first,
        then falls back to scraping the HTML <link>/<title> tags.
        Raises ExtractorError when neither source yields a media URL.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
        if mobj is not None:
            info = json.loads(mobj.group('json'))
            return [{
                'id': video_id,
                'url': info[u'downloadUrl'],
                'uploader': info[u'username'],
                # creationDate is a Unix timestamp; normalize to YYYYMMDD
                'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
                'title': info[u'title'],
                'ext': video_extension,
                'thumbnail': info[u'thumbUrl'],
            }]

        # We try looking in other parts of the webpage
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # BUG FIX: the webpage (and values derived from it) is already text,
        # so the previous .decode('utf-8') calls raised AttributeError on
        # Python 3 and UnicodeDecodeError for non-ASCII titles on Python 2.
        video_title = mobj.group(1)
        video_uploader = mobj.group(2)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
912
913
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com."""
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        """Extract video info from a screen.yahoo.com page.

        Two paths: if the page declares a Media CONTENT_ID, the newer YQL
        JSON API is queried; otherwise the legacy cosmos.bcst.yahoo.com
        MRSS endpoints are scraped with regexes.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)

        if m_id is None:
            # TODO: Check which url parameters are required
            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                        <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                        <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                        <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
                        '''
            self.report_extraction(video_id)
            m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
            if m_info is None:
                raise ExtractorError(u'Unable to extract video info')
            video_title = m_info.group('title')
            video_description = m_info.group('description')
            video_thumb = m_info.group('thumb')
            video_date = m_info.group('date')
            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

            # TODO: Find a way to get mp4 videos
            rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
            # BUG FIX: the None check must come BEFORE .group() is used;
            # previously a failed match crashed with AttributeError instead
            # of raising the intended ExtractorError.
            if m_rest is None:
                raise ExtractorError(u'Unable to extract video url')
            video_url = m_rest.group('url')
            video_path = m_rest.group('path')

        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            # Response is JSONP; peel off the callback wrapper before parsing.
            json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
            info = json.loads(json_str)
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            meta = res[u'meta']
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it

        info_dict = {
            'id': video_id,
            'url': video_url,
            'play_path': video_path,
            'title':video_title,
            'description': video_description,
            'thumbnail': video_thumb,
            'upload_date': video_date,
            'ext': 'flv',
        }
        return info_dict
981
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract video info from a vimeo.com page.

        Parses the inline player config JSON for metadata, then picks the
        best available codec/quality pair and builds the play_redirect URL.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # BUG FIX: was a bare "except:", which also swallowed
            # KeyboardInterrupt/SystemExit. Only the slicing (IndexError)
            # and JSON parsing (ValueError) are expected to fail here.
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
1083
1084
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Handles http://videos.arte.tv/{fr,de}/videos/... pages.  Live pages
    (matching _LIVE_URL) and regular catch-up ("plus 7") pages are scraped
    with chained regex lookups via grep_webpage().
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live pages end with e.g. "index-123.html"
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return the raw body from urlopen().read().

        Bypasses InfoExtractor._download_webpage; network failures are
        wrapped in ExtractorError.
        """
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            # urlopen raises ValueError for malformed URLs
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, run *regex* over it, and collect groups into a dict.

        matchTuples is a list of (group_index, key, error_message) triples;
        a group that did not participate in the match raises ExtractorError
        with its associated message.  No match at all raises as well.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve the rtmp url/path for a live page.

        NOTE(review): the final video_url is built but never returned --
        this method falls off the end, so _real_extract produces no result
        for live streams; looks like dead/unfinished code -- confirm.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path', u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url', u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the videoref XML chain for a catch-up page and return an info dict."""
        video_lang = url.split('/')[-3]
        # Step 1: the player param points at a videoref XML file.
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        # Step 2: pick the language-specific <video> ref.
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Step 3: extract id, title, date and the HD stream url.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date', u'could not extract video date: %s' % url),
                (4, 'url', u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': unified_strdate(info.get('date')),
            # groups come from a bytes page here, hence the decode
            'title': info.get('title').decode('utf-8'),
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        }

    def _real_extract(self, url):
        """Dispatch between live pages and regular catch-up pages."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # NOTE(review): extractLiveStream returns None, so live pages
            # currently yield no downloadable result (see above).
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1204
1205
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    # Matches anything; this IE must be tried last.
    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn the user that we fell through to the generic extractor,
        # except in test mode where the warning is just noise.
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url.

        Issues a HEAD request (falling back to GET on 405) and compares the
        final URL with the original; returns False when they are identical.
        """
        class HeadRequest(compat_urllib_request.Request):
            # Force the HEAD verb so we only fetch headers.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: a HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        """Best-effort extraction: resolve redirects, then probe the page
        with progressively looser regexes for an embedded media URL."""
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
1337
1338
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        # NOTE(review): query is utf-8-encoded by _real_extract but decoded
        # with the locale's preferred encoding here -- confirm they match.
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix ('' = 1 result, 'all' = max, or a
        number) and return the corresponding playlist of results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal "ytsearch"
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            # BUG FIX: the result was computed but not returned, so
            # "ytsearchall:" queries silently produced nothing.
            return self._get_n_results(query, self._max_youtube_results)
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query))
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                return self._get_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                return self._get_n_results(query, 1)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # The jsonc API serves 50 results per page; keep paging until we
        # have enough ids or the API runs out of items.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never ask for more than the API says exists.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
1405
1406
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(?P<prefix>|\d+|all):(?P<query>[\s\S]+)'
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def _real_extract(self, query):
        """Parse the gvsearch prefix ('' = 1 result, 'all' = max, or a
        number) and return the corresponding playlist of results."""
        mobj = re.match(self._VALID_URL, query)
        # ROBUSTNESS FIX: previously a non-matching query crashed with
        # AttributeError on mobj.group(); raise ExtractorError instead,
        # consistent with the other search extractors.
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._max_google_results)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_google_results:
                self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }

        # Scrape the regular Google video-search result pages (10 hits each)
        # until we have n entries or the "next page" marker disappears.
        for pagenum in itertools.count(1):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                e = {
                    '_type': 'url',
                    'url': mobj.group(1)
                }
                res['entries'].append(e)

            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                return res
1455
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'

    _max_yahoo_results = 1000
    IE_NAME = u'screen.yahoo:search'

    def _real_extract(self, query):
        """Parse the yvsearch prefix and delegate to _get_n_results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')

        if prefix == '':
            # Bare "yvsearch:" means a single result.
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._max_yahoo_results)
        try:
            n = int(prefix)
        except ValueError: # parsing prefix as integer fails
            return self._get_n_results(query, 1)
        if n <= 0:
            raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query))
        if n > self._max_yahoo_results:
            self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        playlist = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        # Each JSON results page carries 30 hits plus paging metadata in 'm'.
        pagenum = 0
        while True:
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page ' + str(pagenum + 1))
            info = json.loads(webpage)
            meta = info[u'm']
            results = info[u'results']

            for (i, result) in enumerate(results):
                if (pagenum * 30) + i >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', result)
                entry = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                playlist['entries'].append(entry)
            if (pagenum * 30 + i >= n) or (meta[u'last'] >= (meta[u'total'] - 1)):
                break
            pagenum += 1

        return playlist
1514
1515
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                     (?:https?://)?
                     (?:\w+\.)?
                     youtube\.com/
                     (?:
                        (?:course|view_play_list|my_playlists|artist|playlist|watch)
                        \? (?:.*?&)*? (?:p|a|list)=
                        |    p/
                     )
                     ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                     .*
                     |
                     ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so it must be matched
        # with that flag here as well.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Page through the GData playlist feed and return one playlist result."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The playlist id is captured by one of the two pattern alternatives.
        playlist_id = mobj.group(1) or mobj.group(2)

        # Collect (position, watch-url) pairs page by page from the API.
        videos = []
        page_num = 0
        while True:
            page_num += 1
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            for entry in response['feed']['entry']:
                if 'content' in entry:
                    videos.append((entry['yt$position']['$t'], entry['content']['src']))

            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break

        # Order by playlist position, then keep just the watch URLs.
        ordered = [pair[1] for pair in sorted(videos)]

        url_results = [self.url_result(video_url, 'Youtube') for video_url in ordered]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1581
1582
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids found in *page*, keeping first-seen order."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            candidate = match.group(1)
            if candidate not in found:
                found.append(candidate)
        return found

    def _real_extract(self, url):
        """Collect every video id of a channel and return it as a playlist."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        channel_id = mobj.group(1)
        pagenum = 1

        # The first page is plain HTML.
        page = self._download_webpage(self._TEMPLATE_URL % (channel_id, pagenum),
                                      channel_id, u'Downloading page #%s' % pagenum)
        video_ids = self.extract_videos_from_page(page)

        # Further pages come from the JSON channel_ajax endpoint, requested
        # only while the "load more" widget is present.
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum = pagenum + 1
                page = self._download_webpage(self._MORE_PAGES_URL % (pagenum, channel_id),
                                              channel_id, u'Downloading page #%s' % pagenum)
                data = json.loads(page)
                video_ids.extend(self.extract_videos_from_page(data['content_html']))
                if self._MORE_PAGES_INDICATOR not in data['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        watch_urls = ['http://www.youtube.com/watch?v=%s' % vid for vid in video_ids]
        url_entries = [self.url_result(watch_url, 'Youtube') for watch_url in watch_urls]
        return [self.playlist_result(url_entries, channel_id)]
1640
1641
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        """Page through the user's GData uploads feed and return a playlist."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # The GData API caps each query at _GDATA_PAGE_SIZE results, so we
        # request page after page until one comes back less than full.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Extract the page's video ids, de-duplicated within the page.
            ids_in_page = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                if match.group(1) not in ids_in_page:
                    ids_in_page.append(match.group(1))
            video_ids.extend(ids_in_page)

            # A partial page means there is nothing left to fetch.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break
            pagenum += 1

        watch_urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(watch_url, 'Youtube') for watch_url in watch_urls]
        return [self.playlist_result(url_results, playlist_title = username)]
1698
1699
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Collect the full episode list of a blip.tv user as a playlist."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        # The numeric users_id needed by the Ajax endpoint is embedded in
        # the user's page markup.
        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)

        # The Ajax endpoint serves at most _PAGE_SIZE episodes per call, so
        # keep fetching pages until a partial page signals the end.
        video_ids = []
        pagenum = 1

        while True:
            page = self._download_webpage(page_base + "&page=" + str(pagenum), username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract this page's ids (dedup on the raw href, store unescaped).
            ids_in_page = []
            for match in re.finditer(r'href="/([^"]+)"', page):
                if match.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(match.group(1)))
            video_ids.extend(ids_in_page)

            if len(ids_in_page) < self._PAGE_SIZE:
                break
            pagenum += 1

        episode_urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(episode_url, 'BlipTV') for episode_url in episode_urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
1758
1759
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Extract the direct download URL and title of a depositfiles file.

        Raises ExtractorError when the page cannot be fetched, when the
        site reports a download restriction, or when the expected markup
        is missing.
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # urlopen().read() returns bytes on Python 3; decode once here
            # so the str regexes below work (the old code instead called
            # .decode('utf-8') on the extracted str values, which crashes
            # on Python 3).
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse the whitespace of the site's own message.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        file_title = mobj.group(1)

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension,
        }]
1807
1808
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in to Facebook with credentials from the command line or .netrc.

        Logging in is best-effort: every failure is reported as a warning
        and extraction proceeds anonymously.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            # Decode the response: urlopen().read() returns bytes on
            # Python 3 and the str regex below would raise a TypeError.
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            # A login form in the response means we are NOT logged in.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video URL, title and metadata from a Facebook
        video page; prefers the HD source when available."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters sit between these two JS fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source, fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
1905
1906
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract video information from a blip.tv URL.

        /play/ URLs are first resolved to their canonical page; regular
        pages are queried through blip.tv's JSON API (skin=json). A
        response with a video/* Content-Type is treated as a direct
        download instead.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # /play/ URLs redirect to a page whose fragment carries the file
        # id; rewrite to the canonical URL and restart extraction.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # The JSON API answers for this User-Agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # title is already text here; the old title.decode('UTF-8')
                # crashed on Python 3, where str has no decode method.
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
2000
2001
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def _real_extract(self,url):
        """Resolve a myvideo.de watch page to its direct flv URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Fetch the watch page for this video id.
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link embeds the media server base URL; the flv
        # lives next to it under the video id.
        thumb_match = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                                webpage)
        if thumb_match is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = thumb_match.group(1) + ('/%s.flv' % video_id)

        title_match = re.search('<title>([^<]+)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = title_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
2040
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                           (?P<clip>
                               (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                               |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrates, ordered from highest to lowest quality.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # bitrate -> container extension (used for --list-formats output).
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # bitrate -> frame dimensions (used for --list-formats output).
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the re.VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        # Print the bitrate/extension/dimension table for --list-formats.
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract one or more video parts for an episode or clip.

        Returns a list of info dicts, one per part of the episode.
        Raises ExtractorError on invalid URLs or missing player data.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Expand :tds / :colbert style abbreviations to the show's
        # full-episodes index URL and re-match.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        # Work out the episode/clip title and whether we must follow a
        # redirect to the newest episode (bare full-episodes index URL).
        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # The index URL redirects to the newest episode; re-match the
            # final URL to recover the episode title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        # Locate the mtvnservices player URL(s) embedded in the page.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        # The mgid URI identifies the episode; fetch its RSS index, which
        # lists one <item> per part of the episode.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            # Fetch the per-part configuration, which lists one
            # <rendition> per available bitrate.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP URL into the corresponding progressive
            # HTTP URL on the llnwd.net CDN (rtmpdump not needed).
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2207
2208
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Extract the video URL from the player configuration linked in
        the page's OpenGraph metadata.

        Raises ExtractorError instead of an AttributeError when any of
        the expected <meta> tags is missing.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = self._download_webpage(url, showName)

        # Each of these lookups previously dereferenced the match object
        # without a None check, so a layout change produced a raw
        # AttributeError; fail with a clear message instead.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        if descMatch is None:
            raise ExtractorError(u'Unable to extract description')
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        if imgMatch is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            raise ExtractorError(u'Unable to extract player URL')
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            raise ExtractorError(u'Unable to extract config URL')
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        configJSON = self._download_webpage(configUrl, showName,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2262
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Extract video info via the moogaloop metadata XML and the
        manifest it points to.

        Raises ExtractorError on invalid URLs, download failures or
        unexpected XML structure.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # Filled in incrementally below.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            # Any missing element means the metadata layout changed.
            raise ExtractorError(u'Invalid metadata XML file')

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        # The manifest uses the Adobe f4m namespace; pull the media
        # node's url attribute and the document id.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            raise ExtractorError(u'Invalid manifest file')

        # Compose the URL of the stream's first fragment from the
        # manifest location, the document id and the media url.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2324
2325
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Extract the flv URL, title and thumbnail from an xvideos page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Video URL is percent-encoded in the player flashvars.
        match = re.search(r'flv_url=(.+?)&', webpage)
        if match is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(match.group(1))

        # Title comes from the page <title>, minus the site suffix.
        match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if match is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = match.group(1)

        # Thumbnail: the whole matched URL is what we want (group 0).
        match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if match is None:
            raise ExtractorError(u'Unable to extract video thumbnail')
        video_thumbnail = match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2375
2376
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com

    Resolves the permalink into the track's JSON metadata through the
    public API, then asks the stream endpoint for the 128 kbit/s MP3 URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Uploader and the slug of the song title are both in the URL.
        uploader, slug_title = match.group(1), match.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the permalink into the track's JSON metadata.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # Ask for the stream list and pick the 128 kbit/s MP3 URL.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
2433
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets

    Resolves the set permalink into its JSON metadata through the public
    API, then fetches the 128 kbit/s MP3 stream URL for every track.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Uploader and the slug of the set title are both in the URL.
        uploader, slug_title = match.group(1), match.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the permalink into the set's JSON metadata.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            # Report every error the API returned, then give up.
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        for track in info['tracks']:
            video_id = track['id']

            # One stream-definition request per track in the set.
            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)

            videos.append({
                'id': video_id,
                'url': streams['http_mp3_128_url'],
                'uploader': track['user']['username'],
                'upload_date': unified_strdate(track['created_at']),
                'title': track['title'],
                'ext': u'mp3',
                'description': track['description'],
            })
        return videos
2496
2497
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Extract the rtmpe URL and title from an InfoQ presentation page."""
        if re.match(self._VALID_URL, url) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The page carries a base64-encoded, percent-escaped media path.
        match = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if match is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(match.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Title is assigned to a JS variable on the page.
        match = re.search(r'contentTitle = "(.*?)";', webpage)
        if match is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = match.group(1)

        # Description meta tag is optional.
        match = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        video_description = match.group(1) if match is not None else u'No description available.'

        # Derive id and extension from the media file name.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2544
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        When the format carries per-bitrate entries, 'best' (or an
        unknown bitrate) selects the highest one; formats without
        bitrate info are returned as-is.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none respond."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Dead link; try the next candidate.
                pass

        return None

    def _print_formats(self, formats):
        # Print the format/bitrate/extension table for --list-formats.
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Extract a cloudcast's metadata and the first working media URL.

        Note: the former .decode('utf-8') calls on regex groups and URLs
        were Python 2 relics that crash on Python 3 (str has no decode);
        these values are already text.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON (decode the raw bytes first for older Python 3)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe every format until one of its URLs is alive.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param if format_param is not None else u'NA'),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2649
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Dispatch on URL shape: a specific video, a course page, or the
        site root. The course and root branches build 'reference' entries
        and resolve each one recursively through self.extract."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Per-video metadata is published as an XML file next to the media
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # both elements are required; missing ones raise IndexError
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                    note='Downloading course info page',
                                    errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                # fall back to the course id when the page has no <h1>
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # every VideoPage link on the course page becomes a reference
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # every CoursePage link on the root page becomes a reference
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
2750
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # _download_webpage returns text, so the old .decode('iso-8859-1')
        # calls on the captured groups crashed on Python 3 (str has no
        # decode method); they have been removed.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract song name')
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract performer')
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # message previously read 'Unable to mtvn_uri'
            raise ExtractorError(u'Unable to extract mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract content id')
        content_id = mobj.group(1)

        # mediaGen returns an XML document listing the available renditions
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
2819
2820
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves videos in multiple segments; one info dict per segment
    ('<id>_partNN') is returned. File ids are obfuscated with a
    seed-driven alphabet shuffle that this class reimplements.
    """
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a pseudo-random session id: ms timestamp + two random ints."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Permute the fileid alphabet with a seeded LCG-style generator.

        The server-supplied 'seed' deterministically shuffles the character
        set; the resulting list is the lookup table used by _get_file_id.
        The exact constants (211, 30031, 65536) mirror the site's player.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode the '*'-separated obfuscated fileId via the shuffled alphabet."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the requested format onto Youku's stream names:
            # 'hd2' is the best quality when offered, 'flv' the default.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            # splice the segment number (two hex digits) into the fileid
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
2913
2914
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        def extract_field(pattern, description):
            # Search the page for pattern; raise with a consistent message
            # when the field cannot be found.
            match = re.search(pattern, webpage)
            if match is None:
                raise ExtractorError(u'Unable to extract %s' % description)
            return match.group(1)

        video_url = compat_urllib_parse.unquote(
            extract_field(self.VIDEO_URL_RE, u'video url'))
        video_title = extract_field(self.VIDEO_TITLE_RE, u'video title')
        video_thumbnail = extract_field(self.VIDEO_THUMB_RE, u'video thumbnail')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2958
2959
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report downloading entry."""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the entry's upload date."""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the entry's uploader."""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the entry's title."""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video page URL')

        video_page = mobj.group(1)
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Sort by resolution. The resolution group is captured as a string,
        # so the previous plain sort compared lexicographically ('1080' <
        # '360') and could pick a low-resolution rendition; sort numerically.
        links = sorted(mobj, key=lambda link: int(link[0]))

        # Last entry of the ascending sort is the highest resolution
        video_url = links[-1]
        # Only keep the url; the resolution part of the tuple is unused
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
3069
class NBAIE(InfoExtractor):
    """Information extractor for NBA.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The media url can be derived directly from the page path
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # Return the first (unescaped) group of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Bug fix: this key was misspelled 'uploader_date', which no
            # consumer reads; the documented field name is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3104
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    # Maximum number of items per API request (used for channel paging)
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one API JSON page and convert every clip that has a
        usable video_file_url into an info dict.

        Returns (raw item count, list of info dicts); the raw count lets
        the caller detect the last page (count < page limit).
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # the API signals errors with a dict instead of a list
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time is ISO 'YYYY-MM-DD...'; strip dashes -> YYYYMMDD
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Handle the three URL shapes: a whole channel (paged archive), a
        chapter (/c/), or a single archived broadcast (/b/)."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Channel: page through the archive API below
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            # Find the archive (broadcast) the chapter belongs to
            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # for/else: 'a' is the matching <archive> element on break
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            # Chapter title/description/uploader come from the kraken API
            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            # Single archived broadcast
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # a short page means we reached the end of the archive
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3237
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The media url sits in the second <source> of the player's <video>
        source_match = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if source_match is None:
            raise ExtractorError(u'Unable to find video information')
        video_url = unescapeHTML(source_match.group('url'))

        # Prefer the player headline; fall back to the page <title>
        title_match = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if title_match is None:
            title_match = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if title_match is None:
                raise ExtractorError(u'Cannot find video title')
        title = clean_html(title_match.group('title'))

        desc_match = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(desc_match.group('desc')) if desc_match else None

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }]
3275
class SteamIE(InfoExtractor):
    """Information extractor for trailers on store.steampowered.com game pages."""

    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        game_id = match.group('gameID')

        # Always request through the age gate with a fixed fake birth date
        age_gate_url = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % game_id
        self.report_age_confirmation()
        webpage = self._download_webpage(age_gate_url, game_id)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        # The three iterators run in lockstep: one movie entry per title/thumb
        movie_iter = re.finditer(r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},", webpage)
        title_iter = re.finditer(r'<span class="title">(?P<videoName>.+?)</span>', webpage)
        thumb_iter = re.finditer(r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">', webpage)

        videos = []
        for movie, name, thumb in zip(movie_iter, title_iter, thumb_iter):
            clip_id = movie.group('videoID')
            clip_url = movie.group('videoURL')
            if not clip_url:
                raise ExtractorError(u'Cannot find video url for %s' % clip_id)
            videos.append({
                'id': clip_id,
                'url': clip_url,
                'ext': 'flv',
                'title': unescapeHTML(name.group('videoName')),
                'thumbnail': thumb.group('thumbnail'),
            })
        return [self.playlist_result(videos, game_id, game_title)]
3320
class UstreamIE(InfoExtractor):
    """Information extractor for recorded videos on ustream.tv."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        webpage = self._download_webpage(url, video_id)

        # Title and uploader id are embedded as data attributes in the page
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')

        return [{
            'id': video_id,
            'url': u'http://tcdn.ustream.tv/video/%s' % video_id,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3342
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com video pages."""

    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        webpage_src = self._download_webpage(url, video_id)

        # The flash player receives the media url via so.addVariable("file", ...)
        src_match = re.search(r'so\.addVariable\("file","(.*?)"\)', webpage_src)
        if src_match is None:
            raise ExtractorError(u'Cannot find video url for %s' % video_id)
        video_url = src_match.group(1)
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        title_match = re.search(r"<title>(.*)</title>", webpage_src)
        if title_match is None:
            raise ExtractorError(u'Cannot determine title')
        title = title_match.group(1)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumb_match = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
        if thumb_match is not None:
            thumbnail = thumb_match.group(1)
        else:
            thumbnail = None
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if candy_match is not None:
                title = candy_match.group(1)

        return [{
            'id': video_id,
            'url' : video_url,
            'title' : title,
            'thumbnail' : thumbnail,
            'ext' : ext,
        }]
3391
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        # Show metadata is embedded as a JSON assignment in an inline script
        meta_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not meta_match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(meta_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        host = data.get('host', {})

        return [{
            'id': video_id,
            'url': video_url,
            'ext': compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2],
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3426
3427
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of formats whose 'format' equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # The site gates content behind an age-verification cookie
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = unified_strdate(result.group('date').strip())

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The path's 5th component encodes '<size>p_<bitrate>k_<id>'
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            # Bug fix: this branch used to test the stale 'result' match
            # object (always non-None here), so an unavailable format
            # silently produced [None] instead of raising.
            format = self._specific( req_format, formats )
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
3542
3543
3544
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date. A missing date is not fatal: warn and go on,
        # consistent with the other extractors in this file (the previous code
        # aborted here with a misleading 'Unable to extract video title' error)
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = unified_strdate(result.group('date'))

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3583
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Fetch the page that references the embedded player
        webpage = self._download_webpage(url, video_id)

        # The title is taken from the <title> element
        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        # Locate the embed page, which carries the actual media URL
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flash player receives the file URL through addVariable()
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = source_match.group('source')

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3628
class EightTracksIE(InfoExtractor):
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix metadata is embedded in the page as a JSON literal
        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # A random session id is all the play API needs
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        entries = []
        track_number = 0
        while True:
            track_number += 1
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            # Ask the API for the track following the one we just fetched
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
3672
class KeekIE(InfoExtractor):
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Both the media and the thumbnail URL follow directly from the id
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        title = unescapeHTML(title_match.group('title'))

        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(uploader_match.group('uploader'))

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }]
3696
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    # NOTE: this is a verbose pattern — it must always be matched with
    # re.VERBOSE, which is why suitable() is overridden below
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden so that _VALID_URL is matched with the re.VERBOSE flag
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on the named groups: a single talk or a whole playlist
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Each talk entry on the playlist page carries its ids and media slug
        # as data-* attributes (matched with re.VERBOSE below)
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        # Emit one url_result per talk; each will be re-extracted by this IE
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # The talkDetails JS object carries the numeric id and the media slug
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                    "id":(?P<videoID>[\d]+).*?
                    "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        # The download URL is derived from the slug rather than scraped
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
3775
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de (metadata comes from an XML API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUGFIX: this branch previously read the undefined name 'ext'
            # (NameError at runtime); fall back to the file extension instead
            format = extension
        else:
            format = format_id_el.text
        # Description and thumbnail are optional in the XML
        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None
        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
3829
class SpiegelIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # Stream information lives in a per-video XML document
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Use the last media entry listed in the XML document
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }]
3862
class LiveLeakIE(InfoExtractor):

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')
        webpage = self._download_webpage(url, video_id)

        # The player configuration holds the direct file URL
        url_match = re.search(r'file: "(.*?)",', webpage)
        if not url_match:
            raise ExtractorError(u'Unable to find video url')
        video_url = url_match.group(1)

        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find video title')
        title = unescapeHTML(title_match.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional
        desc_match = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(desc_match.group('desc')) if desc_match else None

        uploader_match = re.search(r'By:.*?(\w+)</a>', webpage)
        uploader = clean_html(uploader_match.group(1)) if uploader_match else None

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }]
3909
class ARDIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # Prefer the numeric documentId query parameter as the video id,
        # otherwise fall back to the last URL path component
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = re.match(self._VALID_URL, url).group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        default_streams = [s for s in streams if int(s["media_type"]) == 0]
        stream = max(default_streams, key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
3948
class TumblrIE(InfoExtractor):
    """Information extractor for video posts on tumblr.com blogs."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post URL before downloading
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            # Not every tumblr post contains a video; report and skip
            # (message grammar fixed: was "No video founded")
            self.to_screen("No video found")
            return []
        video_url = video.group('video_url')
        ext = video.group('ext')

        # We pick the first poster as the thumbnail; a missing poster is not
        # fatal (previously this crashed with a bare AttributeError)
        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'
        m_thumb = re.search(re_thumb, webpage)
        thumb = m_thumb.group('thumb').replace('\\', '') if m_thumb is not None else None

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        re_title = r'<title>(?P<title>.*?)</title>'
        m_title = re.search(re_title, webpage, re.DOTALL)
        if m_title is None:
            raise ExtractorError(u'Unable to extract title')
        title = unescapeHTML(m_title.group('title'))

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'thumbnail': thumb,
                 'ext': ext
                 }]
3982
class BandcampIE(InfoExtractor):
    """Information extractor for free tracks on bandcamp.com."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            # message grammar fixed: was "No free songs founded"
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # renamed from 'id' to avoid shadowing the builtin
        video_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, video_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': video_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]
4028
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The media URL sits in the HTML5 <source> element
        url_match = re.search(r'<source src="(.+)" type="video/mp4">', webpage)
        if url_match is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = url_match.group(1)

        title_match = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = title_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
4059
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # The MRSS feed carries both the media URL and the title
        mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        url_match = re.search(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', webpage)
        if url_match is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = url_match.group(1)

        title_match = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = title_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
4088
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Order is significant: more specific extractors come before more
    # generic ones, and GenericIE is always the last resort.
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        GenericIE,
    )
    return [klass() for klass in extractor_classes]
4148
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the '<Name>IE' naming convention
    return globals()['%sIE' % ie_name]