Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/InfoExtractors.py
Imported Upstream version 2013.06.21
[youtubedl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19 import hashlib
20 import binascii
21 import urllib
22
23 from .utils import *
24
25
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc).

        _real_initialize() is run at most once per instance.
        """
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        """Class name with its last two characters (the "IE" suffix) removed."""
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Open url_or_request and return the response handle.

        note:    progress message; None prints the default "Downloading
                 webpage" message, False prints nothing.
        errnote: prefix of the error message, defaults to
                 u'Unable to download webpage'.

        Raises ExtractorError when the request fails.
        """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """Returns a tuple (page content as string, URL handle).

        The content is decoded with the charset announced in the
        Content-Type header, falling back to UTF-8 when none is given or
        when the announced charset is unknown to Python.
        """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        encoding = m.group(1) if m else 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # A plain URL string was passed instead of a Request object
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # The server announced a charset Python does not know (or the
            # header carried trailing junk); do not abort the extraction.
            content = webpage_bytes.decode('utf-8', 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Returns the data of the page as a string."""
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    # Methods for following #608
    # They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video: marks video_info (in place) with '_type': 'video'."""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist; 'id' and 'title' are only set when truthy."""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        ExtractorError, depending on fatal, specifying the field name.
        """
        # Start from "no match" so an empty list of patterns is handled like
        # any other failed search instead of leaving mobj unbound.
        mobj = None
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Colorize the field name only on real terminals outside Windows
        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
            return default
        elif fatal:
            raise ExtractorError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on GitHub.' % _name)
            return None

    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res
234
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Build the regexp matching "<_SEARCH_KEY><prefix>:<query>"."""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Return True if url is a search query for this extractor."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the search query and fetch the requested number of results.

        An empty prefix asks for one result, 'all' for _MAX_RESULTS, and a
        number for that many results (capped at _MAX_RESULTS with a warning).
        Raises ExtractorError for a malformed query.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            # Not reachable with the default _make_valid_url() (it only
            # accepts positive numbers); kept as a guard for subclasses that
            # override the pattern.
            raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")
273
274
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # Formats missing from this table are reported with the 'flv' extension
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video',  # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL; leave them to the playlist IE
        if YoutubePlaylistIE.suitable(url):
            return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to look up the available subtitles."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download the subtitles in one language/format."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the given format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Fetch the list of subtitle tracks of a video.

        Returns a dict mapping language code -> track name on success, or a
        tuple (error_message, None) when the list could not be downloaded or
        is empty. Callers tell the two apart with isinstance(..., tuple).
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the subtitle languages available for video_id."""
        sub_lang_list = self._get_available_subtitles(video_id)
        if isinstance(sub_lang_list, tuple):
            # Error tuple (message, None): there is no dict to list, so
            # report the reason instead of calling .keys() on a tuple.
            self._downloader.report_warning(sub_lang_list[0])
            return
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _request_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
        argument to speed up the process.

        Returns a one-element list [(error_message, sub_lang, sub)].
        """
        sub_lang = self._downloader.params.get('subtitleslang')
        sub_format = self._downloader.params.get('subtitlesformat')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
        if mobj is None:
            return [(err_msg, None, None)]
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            params = compat_urllib_parse.urlencode({
                'lang': 'en',
                'tlang': sub_lang,
                'fmt': sub_format,
                'ts': timestamp,
                'kind': 'asr',
            })
            subtitles_url = caption_url + '&' + params
            sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
            return [(None, sub_lang, sub)]
        except KeyError:
            # The player config lacks one of the caption-related keys
            return [(err_msg, None, None)]

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple):  # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Language priority: the one requested, then English, then the first listed
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if sub_lang not in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Return a list of (error_message, sub_lang, sub) tuples, one per
        available language (or a single error tuple if listing failed)."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple):  # There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print one line per format code: code, extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' % (x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the site language and, if credentials are available, log in
        and confirm age.

        Failures up to and including login are reported as warnings and end
        the initialization early; a failing age confirmation raises
        ExtractorError.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Hidden form fields that have to be echoed back with the login POST
        galx = None
        dsh = None
        match = re.search(r'<input.+?name="GALX".+?value="(.+?)"', login_page, re.DOTALL)
        if match:
            galx = match.group(1)

        match = re.search(r'<input.+?name="dsh".+?value="(.+?)"', login_page, re.DOTALL)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'dsh': dsh,
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode. Fields whose value could not be determined
        # (None, e.g. GALX/dsh missing from the login page) are left out
        # instead of crashing on None.encode().
        login_form = dict(
            (k.encode('utf-8'), v.encode('utf-8'))
            for k, v in login_form_strs.items() if v is not None)
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # Being served the login form again means the login was rejected
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        # POST data has to be bytes on Python 3 (same as login_data above)
        age_data = compat_urllib_parse.urlencode(age_form).encode('ascii')
        request = compat_urllib_request.Request(self._AGE_URL, age_data)
        try:
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        """Return the video ID contained in url; raise ExtractorError if the
        URL does not match _VALID_URL."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        """Extract the information of a single video.

        Returns a list with one info dict per selected format. Returns None
        when the 'listsubtitles' or 'listformats' option is set, after the
        requested listing has been printed. Raises ExtractorError when
        required data cannot be downloaded or found.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Drop the backslashes escaping the URL inside the page's script
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try each "el" variant until one yields a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                              % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                                        note=False,
                                                        errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
            else:
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            raise ExtractorError(u'Unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Turn '/', ',' and '-' into spaces and collapse whitespace
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    # We try with the automatic captions
                    video_subtitles = self._request_automatic_caption(video_id, video_webpage)
                    (sub_error_auto, sub_lang, sub) = video_subtitles[0]
                    if sub is None:
                        # We report the original error
                        self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_map = {}
            for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0]
                    # Only append the signature when the stream entry carries
                    # one; indexing 'sig' blindly raised KeyError otherwise.
                    if 'sig' in url_data:
                        url += '&signature=' + url_data['sig'][0]
                    if 'ratebypass' not in url:
                        url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url

            format_limit = self._downloader.params.get('format_limit', None)
            if self._downloader.params.get('prefer_free_formats', False):
                available_formats = self._available_formats_prefer_free
            else:
                available_formats = self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Formats are sorted best first: keep the limit and everything worse
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])]  # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])]  # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats]  # All formats
            else:
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
        else:
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id': video_id,
                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
            })
        return results
795
796
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    @staticmethod
    def _as_text(value):
        """Return value as text, decoding it from UTF-8 only if it is bytes.

        Values taken from the downloaded page are already text; calling
        .decode() on them fails on Python 3 and, for non-ASCII content, on
        Python 2 as well.
        """
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the family-filter disclaimer and post the age confirmation.

        Raises ExtractorError if either request fails.
        """
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        # POST data has to be bytes on Python 3
        post_data = compat_urllib_parse.urlencode(disclaimer_form).encode('ascii')
        request = compat_urllib_request.Request(self._FILTER_POST, post_data)
        try:
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract the media URL, title and uploader of a Metacafe video.

        Returns a one-element list: the info dict of the video, or a url
        result pointing at YouTube when the video ID starts with "yt-".
        Raises ExtractorError when the URL is invalid or a required field
        cannot be found in the page.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            # The extension is taken to be the last three characters of the URL
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # No mediaURL parameter: fall back to the "flashvars" attribute
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                raise ExtractorError(u'Unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = self._as_text(mobj.group(1))

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        return [{
            'id': self._as_text(video_id),
            'url': self._as_text(video_url),
            'uploader': self._as_text(video_uploader),
            'upload_date': None,
            'title': video_title,
            'ext': self._as_text(video_extension),
        }]
892
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        """Return a one-element list holding the info dictionary for `url`.

        The page is requested with the family filter cookie switched off and
        the first quality key found in the flashvars is used.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The id is the part before the first '_' (and before any '?')
        video_id = url_match.group(1).split('_')[0].split('?')[0]

        # Download the video page with the family filter disabled
        page_request = compat_urllib_request.Request(url)
        page_request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(page_request, video_id)

        self.report_extraction(video_id)
        flashvars_match = re.search(r'\s*var flashvars = (.*)', webpage)
        if flashvars_match is None:
            raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(flashvars_match.group(1))

        # Keys are tried in this order; the first one present wins
        quality_keys = ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']
        present_keys = [key for key in quality_keys if key in flashvars]
        if not present_keys:
            raise ExtractorError(u'Unable to extract video URL')
        max_quality = present_keys[0]
        self.to_screen(u'Using %s' % max_quality)

        url_in_flashvars = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if url_in_flashvars is None:
            raise ExtractorError(u'Unable to extract video URL')
        video_url = compat_urllib_parse.unquote(url_in_flashvars.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        title_match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(title_match.group('title'))

        uploader_patterns = [
            r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
            # Looking for official user
            r'<(?:span|a) .*?rel="author".*?>([^<]+?)</',
        ]
        video_uploader = self._search_regex(uploader_patterns, webpage, 'video uploader')

        # The page shows the date as DD-MM-YYYY; reorder it to YYYYMMDD
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is None:
            video_upload_date = None
        else:
            video_upload_date = date_match.group(3) + date_match.group(2) + date_match.group(1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': 'mp4',
        }
        return [info]
961
962
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com.

    The media JSON embedded in the page's javascript is preferred; when it is
    missing, the ``video_src`` link and the page title are used instead.
    """

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extraction process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for `url`.

        Raises ExtractorError when the URL is invalid or when neither
        extraction method yields a video URL and title.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')

        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
        if mobj is not None:
            info = json.loads(mobj.group('json'))
            return [{
                'id': video_id,
                'url': info[u'downloadUrl'],
                'uploader': info[u'username'],
                'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
                'title': info[u'title'],
                'ext': video_extension,
                'thumbnail': info[u'thumbUrl'],
            }]

        # We try looking in other parts of the webpage
        video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
            webpage, u'video URL')

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # No .decode('utf-8') on these values: calling it on text strings
        # raises AttributeError on Python 3 and, for non-ASCII text,
        # UnicodeEncodeError on Python 2.
        video_title = mobj.group(1)
        video_uploader = mobj.group(2)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
1019
1020
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com.

    Two methods are used depending on whether the page defines a
    ``CONTENT_ID``: without it the cosmos.bcst.yahoo.com MRSS feeds are
    queried, with it the YQL JSONP endpoint is queried.
    """
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        """Return the info dictionary for `url` (a dict, not a list).

        Raises ExtractorError when the URL is invalid or when the video
        information or stream URL cannot be located in the responses.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)

        if m_id is None:
            # TODO: Check which url parameters are required
            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                        <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                        <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                        <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
                        '''
            self.report_extraction(video_id)
            m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
            if m_info is None:
                raise ExtractorError(u'Unable to extract video info')
            video_title = m_info.group('title')
            video_description = m_info.group('description')
            video_thumb = m_info.group('thumb')
            video_date = m_info.group('date')
            # The feed gives MM/DD/YYYY; convert to YYYYMMDD
            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

            # TODO: Find a way to get mp4 videos
            rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
            # The match must be checked before its groups are read, otherwise
            # a miss surfaces as AttributeError instead of ExtractorError.
            if m_rest is None:
                raise ExtractorError(u'Unable to extract video url')
            video_url = m_rest.group('url')
            video_path = m_rest.group('path')

        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            # Strip the JSONP callback wrapper to get at the JSON payload
            m_json = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage)
            if m_json is None:
                raise ExtractorError(u'Unable to extract video info')
            json_str = m_json.group(1)
            info = json.loads(json_str)
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            meta = res[u'meta']
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it

        info_dict = {
            'id': video_id,
            'url': video_url,
            'play_path': video_path,
            'title':video_title,
            'description': video_description,
            'thumbnail': video_thumb,
            'upload_date': video_date,
            'ext': 'flv',
        }
        return info_dict
1088
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Return a one-element list with the info dictionary for `url`.

        `new_video` is accepted for interface compatibility and is not used
        here.  Raises ExtractorError when the URL is invalid, when the config
        JSON cannot be read from the page, or when no known codec is listed.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        # Direct links and vimeopro URLs are replaced by the canonical page
        if mobj.group('direct_link') or mobj.group('pro'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and later we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: the config marker is absent from the page;
            # ValueError: the extracted text is not valid JSON.
            # A bare "except:" here would also swallow KeyboardInterrupt.
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first codec of the best non-empty quality bucket
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
1190
1191
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download `url` and return the raw response body."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            return compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Download `url`, search it with `regex` and collect named groups.

        `matchTuples` is a list of (group index, result key, error message)
        triples; the returned dict maps each key to its matched group.
        ExtractorError is raised if the regex does not match or if one of the
        requested groups is empty.
        """
        page = self.fetch_webpage(url)
        match = re.search(regex, page, regexFlags)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        collected = {}
        for group_index, key, error_message in matchTuples:
            value = match.group(group_index)
            if value is None:
                raise ExtractorError(error_message)
            collected[key] = value
        return collected

    def extractLiveStream(self, url):
        """Resolve the live stream location for `url`.

        NOTE(review): the assembled stream URL is not returned or stored;
        the method yields None.
        """
        url_parts = url.split('/')
        video_lang = url_parts[-4]
        script_groups = [
            (1, 'url', u'Invalid URL: %s' % url)
        ]
        info = self.grep_webpage(url, r'src="(.*?/videothek_js.*?\.js)', 0, script_groups)

        http_host = url_parts[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        stream_regex = (
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
            '(http://.*?\.swf).*?' +
            '(rtmp://.*?)\''
        )
        stream_groups = [
            (1, 'path', u'could not extract video path: %s' % url),
            (2, 'player', u'could not extract video player: %s' % url),
            (3, 'url', u'could not extract video url: %s' % url)
        ]
        info = self.grep_webpage(next_url, stream_regex, re.DOTALL, stream_groups)
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the three-step lookup for `url` and return its info dict."""
        video_lang = url.split('/')[-3]

        # Step 1: the page embeds the URL of a video reference file
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [(1, 'url', u'Invalid URL: %s' % url)])
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Step 2: the reference file points to a per-language description
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [(1, 'url', u'Could not find <video> tag: %s' % url)])
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Step 3: read id, title, date and the HD url from that description
        details_regex = (
            r'<video id="(.*?)".*?>.*?' +
            '<name>(.*?)</name>.*?' +
            '<dateVideo>(.*?)</dateVideo>.*?' +
            '<url quality="hd">(.*?)</url>'
        )
        details_groups = [
            (1, 'id', u'could not extract video id: %s' % url),
            (2, 'title', u'could not extract video title: %s' % url),
            (3, 'date', u'could not extract video date: %s' % url),
            (4, 'url', u'could not extract video url: %s' % url)
        ]
        info = self.grep_webpage(next_url, details_regex, re.DOTALL, details_groups)

        result = {
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': unified_strdate(info.get('date')),
            'title': info.get('title').decode('utf-8'),
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        }
        return result

    def _real_extract(self, url):
        """Dispatch to the live or the Plus7 extraction, based on the URL.

        Live URLs yield None; other URLs yield a one-element list.
        """
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        is_live = re.search(self._LIVE_URL, video_id) is not None
        if is_live:
            self.extractLiveStream(url)
            return

        return [self.extractPlus7Stream(url)]
1311
1312
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        in_test_mode = self._downloader.params.get('test', False)
        if not in_test_mode:
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        # Headers that must not be carried over to a body-less follow-up request
        body_headers = ("content-length", "content-type")

        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code not in (301, 302, 303, 307):
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
                newurl = newurl.replace(' ', '%20')
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in body_headers)
                return HeadRequest(newurl,
                                   headers=newheaders,
                                   origin_req_host=req.get_origin_req_host(),
                                   unverifiable=True)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in body_headers)
                get_request = compat_urllib_request.Request(req.get_full_url(),
                                                            headers=newheaders,
                                                            origin_req_host=req.get_origin_req_host(),
                                                            unverifiable=True)
                return self.parent.open(get_request)

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        handler_classes = [
            compat_urllib_request.HTTPHandler,
            compat_urllib_request.HTTPDefaultErrorHandler,
            HTTPMethodFallback,
            HEADRedirectHandler,
            compat_urllib_request.HTTPErrorProcessor,
            compat_urllib_request.HTTPSHandler,
        ]
        for handler_class in handler_classes:
            opener.add_handler(handler_class())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        if new_url == url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        """Follow a redirect if there is one, else scrape the page for a media URL."""
        new_url = self._test_redirect(url)
        if new_url:
            return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Patterns are tried in order, from the most specific to the broadest
        media_patterns = [
            # Start with something easy: JW Player in SWFObject
            r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)',
            # Broaden the search a little bit
            r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)',
            # Broaden the search a little bit: JWPlayer JS loader
            r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)',
            # Try to find twitter cards info
            r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"',
        ]
        mobj = None
        for pattern in media_patterns:
            mobj = re.search(pattern, webpage)
            if mobj is not None:
                break
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # The file name of the media URL supplies both the id and the extension
        basename, dotted_extension = os.path.splitext(os.path.basename(video_url))
        video_extension = dotted_extension[1:]
        video_id = basename

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        video_title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'video title')

        # video uploader is domain name
        video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
            url, u'video uploader')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }
        return [info]
1445
1446
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries."""
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        message = u'[youtube] query "%s": Downloading page %s' % (query, pagenum)
        self._downloader.to_screen(message)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        collected_ids = []
        # The API is paged 50 results at a time; `limit` shrinks once the
        # total number of available items is known.
        limit = n
        pagenum = 0

        while limit > (50 * pagenum):
            self.report_download_page(query, pagenum+1)
            start_index = (50*pagenum)+1
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), start_index)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            collected_ids.extend([video['id'] for video in api_response['items']])

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # Never hand back more than was asked for
        collected_ids = collected_ids[:n]
        videos = []
        for video_id in collected_ids:
            videos.append(self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube'))
        return self.playlist_result(videos, query)
1488
1489
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _MAX_RESULTS = 1000
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        Returns a playlist dictionary whose 'entries' list holds at most `n`
        url entries, collected page by page until enough results were found
        or the page no longer advertises a next page.
        """

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }

        # Pages are requested with start=0, 10, 20, ...; starting the count
        # at 1 would silently skip the first ten results.
        for pagenum in itertools.count(0):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum + 1))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                e = {
                    '_type': 'url',
                    'url': mobj.group(1)
                }
                res['entries'].append(e)

            if (len(res['entries']) >= n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                # The last page may overshoot; trim to the requested amount
                res['entries'] = res['entries'][:n]
                return res
1520
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        Returns a playlist dictionary whose 'entries' list holds at most `n`
        url results pointing at the Yahoo extractor.
        """

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        # Result pages are requested in steps of 30 (the b= offset)
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            m = info[u'm']
            results = info[u'results']

            for r in results:
                if len(res['entries']) >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                if mobj is None:
                    # Result without a screen.yahoo.com link: nothing to add
                    continue
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)

            # Stop on an empty page too: reading the loop index after an
            # empty loop would raise NameError, and without this check an
            # empty page could be requested forever.
            if not results or len(res['entries']) >= n or (m[u'last'] >= (m[u'total'] - 1)):
                break

        return res
1554
1555
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax, hence the explicit flag
        matched = re.match(cls._VALID_URL, url, re.VERBOSE)
        return matched is not None

    def _real_extract(self, url):
        """Page through the playlist feed and return a playlist result.

        The return value is a one-element list holding the playlist built
        from the feed entries, ordered by their playlist position.
        """
        id_match = re.match(self._VALID_URL, url, re.VERBOSE)
        if id_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Either alternative of the pattern may have captured the id
        playlist_id = id_match.group(1) or id_match.group(2)
        positioned_videos = []
        page_num = 1

        while True:
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            api_url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(api_url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            feed = response['feed']
            playlist_title = feed['title']['$t']
            if 'entry' not in feed:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            entries = feed['entry']
            for entry in entries:
                # Entries lacking 'content' are left out
                if 'content' in entry:
                    positioned_videos.append((entry['yt$position']['$t'], entry['content']['src']))

            # A short page is the last one
            if len(entries) < self._MAX_RESULTS:
                break
            page_num += 1

        ordered_urls = [pair[1] for pair in sorted(positioned_videos)]

        url_results = [self.url_result(video_url, 'Youtube') for video_url in ordered_urls]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1621
1622
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from `page`, each once, in page order."""
        unique_ids = []
        for link in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            found_id = link.group(1)
            if found_id in unique_ids:
                continue
            unique_ids.append(found_id)
        return unique_ids

    def _real_extract(self, url):
        """Collect the video ids of a channel and return them as a playlist.

        The first page is plain HTML; further pages come from the JSON
        channel_ajax endpoint for as long as a "load more" widget is present.
        """
        channel_match = re.match(self._VALID_URL, url)
        if channel_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        channel_id = channel_match.group(1)
        pagenum = 1

        # Download the first channel page
        first_page_url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(first_page_url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        video_ids = list(self.extract_videos_from_page(page))

        # Download any subsequent channel pages using the json-based channel_ajax query
        more_pages = self._MORE_PAGES_INDICATOR in page
        while more_pages:
            pagenum += 1

            ajax_url = self._MORE_PAGES_URL % (pagenum, channel_id)
            raw_page = self._download_webpage(ajax_url, channel_id,
                                              u'Downloading page #%s' % pagenum)
            ajax_page = json.loads(raw_page)

            video_ids.extend(self.extract_videos_from_page(ajax_page['content_html']))

            more_pages = self._MORE_PAGES_INDICATOR in ajax_page['load_more_widget_html']

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = []
        for video_id in video_ids:
            watch_url = 'http://www.youtube.com/watch?v=%s' % video_id
            url_entries.append(self.url_result(watch_url, 'Youtube'))
        return [self.playlist_result(url_entries, channel_id)]
1680
1681
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        """Collect a user's uploads and return them as a one-element playlist list."""
        user_match = re.match(self._VALID_URL, url)
        if user_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = user_match.group(1)

        # The uploads feed is read in pages of _GDATA_PAGE_SIZE entries.
        # Paging goes on until a page comes back that is not full.
        page_size = self._GDATA_PAGE_SIZE
        video_ids = []

        for pagenum in itertools.count(0):
            start_index = pagenum * page_size + 1

            gdata_url = self._GDATA_URL % (username, page_size, start_index)
            note = u'Downloading video ids from %d to %d' % (start_index, start_index + page_size)
            page = self._download_webpage(gdata_url, username, note)

            # Extract video identifiers, dropping duplicates within the page
            ids_in_page = []
            for id_match in re.finditer(self._VIDEO_INDICATOR, page):
                found_id = id_match.group(1)
                if found_id not in ids_in_page:
                    ids_in_page.append(found_id)

            video_ids.extend(ids_in_page)

            # A page that is not "full" is taken to be the last one, which
            # saves one extra query.
            if len(ids_in_page) < page_size:
                break

        url_results = []
        for video_id in video_ids:
            watch_url = 'http://www.youtube.com/watch?v=%s' % video_id
            url_results.append(self.url_result(watch_url, 'Youtube'))
        return [self.playlist_result(url_results, playlist_title = username)]
1738
1739
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves the numeric users id from the user page, then walks the paged
    episode list and returns all episodes as a playlist of ``BlipTV`` URL
    results.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Return a one-element list holding the playlist of the user's videos.

        Raises ExtractorError when *url* does not match ``_VALID_URL`` or
        when the user page carries no ``data-users-id`` attribute.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        users_id_mobj = re.search(r'data-users-id="([^"]+)"', page)
        if users_id_mobj is None:
            # Previously this crashed with AttributeError on ``None.group``
            raise ExtractorError(u'Unable to extract blip.tv users id')
        page_base = page_base % users_id_mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until a page is not full - it means we got
        # all of them.
        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(
                url, username,
                u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers.  Unescape *before* the duplicate
            # check so that the comparison is made against the same form
            # that is stored in the list.
            ids_in_page = []
            for href_mobj in re.finditer(r'href="/([^"]+)"', page):
                video_id = unescapeHTML(href_mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(video_url, 'BlipTV') for video_url in urls]
        return [self.playlist_result(url_entries, playlist_title=username)]
1798
1799
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Return a one-element list describing the file behind *url*.

        Raises ExtractorError when the page cannot be retrieved or when no
        download URL can be found in it (the site's own "Attention ..."
        message is used as the error text when present).
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed.
        # urlencode() output is pure ASCII; Python 3 requires POST data
        # to be bytes.
        free_download_indication = {'gateway_result': '1'}
        post_data = compat_urllib_parse.urlencode(free_download_indication).encode('ascii')
        request = compat_urllib_request.Request(url, post_data)
        try:
            self.report_download_webpage(file_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Work on text from here on: the str patterns below cannot be applied
        # to bytes on Python 3, and the result fields must be unicode.
        webpage = webpage_bytes.decode('utf-8', 'replace')

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        # The id comes from the caller's URL and may still be a byte string
        # on Python 2; text strings have no .decode() on Python 3.
        if isinstance(file_id, bytes):
            file_id = file_id.decode('utf-8')

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension,
        }]
1844
1845
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in to Facebook when credentials are available.

        Credentials come from the downloader's ``username``/``password``
        parameters or, with ``usenetrc``, from the ``facebook`` machine in
        .netrc.  Every failure is reported as a warning only; nothing is
        raised and nothing is returned.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
        }
        # urlencode() output is pure ASCII; Python 3 requires POST data to
        # be bytes.
        post_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, post_data)
        try:
            self.report_login()
            # Decode so that the str pattern below can be applied on
            # Python 3 as well (bytes + str pattern raises TypeError there).
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            # Being served the login form again means the login failed
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        Raises ExtractorError when *url* is invalid, when the embedded
        player data cannot be located, or when no video URL is present.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player variables are a JSON list of pairs sitting between
        # these two JavaScript fragments
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is itself URL-quoted JSON
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source, fall back to SD
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
            webpage, u'title')

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
1940
1941
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report that a direct download was detected."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        ``/play/`` and ``api.swf#`` URLs are first resolved to the regular
        ``/a/a-<file id>`` form.  When the server answers with video content
        the open handle is passed on as ``urlhandle`` (direct download);
        otherwise the JSON description of the post is parsed.

        Raises ExtractorError on an invalid URL, on download errors and on
        unparsable video information.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # See https://github.com/rg3/youtube-dl/issues/857
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # Follow the redirect; the file id is in the fragment of the
            # final URL
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'):  # Direct download
                basename = url.split('/')[-1]
                title, ext = os.path.splitext(basename)
                # Only byte strings (Python 2) need decoding; text strings
                # have no .decode() on Python 3.
                if isinstance(title, bytes):
                    title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None:  # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # The payload is either wrapped in a 'Post' key or bare
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError, KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
2039
2040
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self, data, key):
        """Apply the RC4 key stream derived from *key* to *data*.

        Both arguments are iterated element-wise and passed through
        ``compat_ord``; the result is returned as a text string built with
        ``chr``.
        """
        # Key scheduling
        x = 0
        box = list(range(256))
        for i in range(256):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
        # Key stream generation, XORed onto the data
        x = 0
        y = 0
        out = []
        for char in data:
            x = (x + 1) % 256
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out.append(chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256]))
        return ''.join(out)

    def __md5(self, s):
        """Return the hexadecimal MD5 digest of *s*, encoded to bytes."""
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        A plain ``<source src=...>`` on the watch page is used directly.
        Otherwise the encrypted player XML is downloaded, decrypted with RC4
        and searched for an RTMP connection URL or an HTTP path.

        Raises ExtractorError when the URL is invalid or no video location
        can be extracted.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Doubly base64-encoded secret that is mixed into the RC4 key below
        GK = (
            b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
            b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
            b'TnpsbA0KTVRkbU1tSTRNdz09'
        )

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            video_title = self._html_search_regex('<title>([^<]+)</title>',
                webpage, u'title')

            return [{
                'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': u'flv',
            }]

        # try encxml
        mobj = re.search('var flashvars={(.+?)}', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video')

        # Split the flashvars into the XML endpoint (_encxml) and the query
        # parameters to send to it
        params = {}
        encxml = ''
        sec = mobj.group(1)
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                params[a] = b
            else:
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            self._downloader.report_warning(u'avoiding MTV player')
            xmldata_url = (
                'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
            ) % video_id

        # get enc data: hex-encoded payload after the first '='
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        # RC4 key = md5(secret + md5(video id))
        sk = self.__md5(
            base64.b64decode(base64.b64decode(GK)) +
            self.__md5(
                str(video_id).encode('utf-8')
            )
        )
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        video_url = None
        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        if mobj:
            video_url = compat_urllib_parse.unquote(mobj.group(1))
            if 'myvideo2flash' in video_url:
                self._downloader.report_warning(u'forcing RTMPT ...')
                video_url = video_url.replace('rtmpe://', 'rtmpt://')

        if not video_url:
            # extract non rtmp videos
            mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
            if mobj is None:
                raise ExtractorError(u'unable to extract url')
            video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
        video_file = compat_urllib_parse.unquote(video_file)

        if not video_file.endswith('f4m'):
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
        else:
            video_playpath = ''
            # ``video_filepath`` was never defined here (NameError).
            # NOTE(review): presumably it is the unquoted ``path`` attribute
            # of the decrypted data, i.e. the same attribute the non-RTMP
            # branch above combines with ``source`` -- confirm against the
            # original plugin.
            video_filepath = compat_urllib_parse.unquote(
                self._search_regex('path=\'(.*?)\'', dec_data, u'video file path'))
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        video_swfobj = self._search_regex(r"swfobject.embedSWF\('(.+?)'", webpage, u'swfobj')
        video_swfobj = compat_urllib_parse.unquote(video_swfobj)

        video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'tc_url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
            'play_path': video_playpath,
            'video_file': video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url': video_swfobj,
        }]
2189
2190
class ComedyCentralIE(InfoExtractor):
    """Extractor for thedailyshow.com and colbertnation.com episodes and clips."""

    # Accepted inputs:
    #  - abbreviations like :thedailyshow or :colbert (newest full episode)
    #  - full episode URLs (.../full-episodes/...)
    #  - clip URLs like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #  - or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #  - or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    # The pattern is written for re.VERBOSE and must be matched with that flag.
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Every known bitrate is delivered as mp4
    _video_extensions = dict.fromkeys(_available_formats, 'mp4')
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        """Print one line per format: bitrate, extension and dimensions."""
        print('Available formats:')
        for bitrate in formats:
            extension = self._video_extensions.get(bitrate, 'mp4')
            dimensions = self._video_dimensions.get(bitrate, '???')
            print('%s\t:\t%s\t[%s]' % (bitrate, extension, dimensions))

    def _real_extract(self, url):
        """Return one info dictionary per part of the episode or clip.

        Returns None after printing the format list when the downloader's
        ``listformats`` parameter is set.  Raises ExtractorError for invalid
        (or insufficiently specific redirected) URLs, when no player
        reference is found in the page, or when an RTMP URL cannot be
        transformed.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Abbreviations map to the show's full-episodes landing page
        shortname = mobj.group('shortname')
        if shortname:
            if shortname in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            dlNewest = False
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
        else:
            # No episode in the URL means: take whatever the site redirects to
            dlNewest = not mobj.group('episode')
            epTitle = mobj.group('showname' if dlNewest else 'episode')

        self.report_extraction(epTitle)
        webpage, htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        movie_params = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
        if movie_params:
            uri = movie_params[0][1]
        else:
            # The Colbert Report embeds the information in a data-mgid
            # attribute without a URL prefix; that bare reference is
            # exactly the uri needed below.
            alt_params = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if not alt_params:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            uri = alt_params[0]

        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        for partNum, itemEl in enumerate(idoc.findall('.//item')):
            mediaId = itemEl.findall('./guid')[0].text
            id_fields = mediaId.split(':')
            shortMediaId = id_fields[-1]
            showId = id_fields[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                         compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            # (bitrate, rtmp url) pairs in document order
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = [
                (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                for rendition in cdoc.findall('.//rendition')
            ]

            if not turls:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([pair[0] for pair in turls])
                return

            # Use the requested format when it is offered; otherwise fall
            # back to the last listed rendition (taken to be the highest
            # bitrate)
            req_format = self._downloader.params.get('format', None)
            chosen_format, rtmp_video_url = next(
                (pair for pair in turls if pair[0] == req_format),
                turls[-1])

            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            results.append({
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': chosen_format,
                'thumbnail': None,
                'description': officialTitle,
            })

        return results
2357
2358
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        Description and thumbnail are optional; the player URL and the
        title are required.  Raises ExtractorError on an invalid URL, a
        missing required field or an unparsable player configuration.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(videoId)
        webpage = self._download_webpage(url, videoId)

        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)

        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)

        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
            webpage, u'player url')

        # The field label was a copy-paste of u'player url'; a missing title
        # must be reported as such.  Only the last ' : ' separated component
        # of the meta title is kept.
        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
            webpage, u'title').split(' : ')[-1]

        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
        configUrl = compat_urllib_parse.unquote(configUrl)

        configJSON = self._download_webpage(configUrl, videoId,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # The video URL is taken from the second playlist entry
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': title,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': videoDesc,
            'player_url': playerUrl,
        }

        return [info]
2418
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report that the XML manifest is being downloaded."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        The metadata XML supplies description, title, thumbnail and the
        manifest URL; the f4m manifest supplies the pieces of the final
        fragment URL.

        Raises ExtractorError on an invalid URL, on download errors and on
        metadata or manifest files lacking the expected nodes.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            # From here on video_id is the id declared by the manifest
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except (IndexError, KeyError):
            # IndexError: node missing; KeyError: media node without 'url'
            raise ExtractorError(u'Invalid manifest file')

        # Address the first fragment of the first segment on the manifest's host
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2480
2481
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        The thumbnail is optional (``fatal=False``); the video URL and the
        title are required.  Raises ExtractorError on an invalid URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL
        video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
            webpage, u'video URL'))

        # Extract title
        video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
            webpage, u'title')

        # Extract video thumbnail.  The capturing group spans the whole URL:
        # the 'thumbnail' field is documented as a full URL, but the pattern
        # used to capture only the trailing file name.
        video_thumbnail = self._search_regex(r'(http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }

        return [info]
2522
2523
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com tracks.

    The track URL is resolved to its metadata through the resolve API; the
    numeric track id found there is then used to query the stream
    definitions, from which the 128 kbit/s MP3 URL is taken.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report that the id is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the track.

        Raises ExtractorError when *url* does not match ``_VALID_URL``.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Uploader and song slug both come straight from the URL
        uploader, slug_title = mobj.group(1, 2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Normalised track URL, resolved to its metadata
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info = json.loads(
            self._download_webpage(resolv_url, full_title, u'Downloading info JSON'))
        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        streams = json.loads(
            self._download_webpage(streams_url, full_title,
                                   u'Downloading stream definitions',
                                   u'unable to download stream definitions'))

        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        track_info = {
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }
        return [track_info]
2580
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets.

    The set URL is resolved to its metadata through the resolve API; for
    each track listed there the stream definitions are queried and the
    128 kbit/s MP3 URL is taken.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report that the id is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Return one info dictionary per track of the set.

        Returns None after reporting them when the resolve answer carries
        errors.  Raises ExtractorError when *url* does not match
        ``_VALID_URL``.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Uploader and set slug both come straight from the URL
        uploader, slug_title = mobj.group(1, 2)
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info = json.loads(self._download_webpage(resolv_url, full_title))

        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)

        videos = []
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            mediaURL = json.loads(stream_json)['http_mp3_128_url']

            track_info = {
                'id': video_id,
                'url': mediaURL,
                'uploader': track['user']['username'],
                'upload_date': unified_strdate(track['created_at']),
                'title': track['title'],
                'ext': u'mp3',
                'description': track['description'],
            }
            videos.append(track_info)
        return videos
2643
2644
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the talk.

        Raises ExtractorError when the URL is invalid, when the page does
        not contain the encoded stream reference, or when the stream file
        name carries no extension.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the page stores it base64-encoded and
        # URL-quoted in the "jsclassref" JavaScript variable
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        video_title = self._search_regex(r'contentTitle = "(.*?)";',
            webpage, u'title')

        # Extract description
        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        video_filename = video_url.split('/')[-1]
        # Split on the LAST dot only, so that file names containing
        # additional dots do not break the id/extension unpacking
        video_id, dot, extension = video_filename.rpartition('.')
        if not dot:
            raise ExtractorError(u'Unable to extract video extension from %s' % video_filename)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2687
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json

        jsonData[fmt] is either a mapping bitrate -> list of URLs or, when
        there is no bitrate information, directly the list of URLs.
        If bitrate is None, 'best' or unknown, the highest key is used.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none can be opened"""
        for url in url_list:
            try:
                # Only reachability matters; close the probe connection
                # right away instead of leaking it
                compat_urllib_request.urlopen(url).close()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                continue
            return url

        return None

    def _print_formats(self, formats):
        """Print one line per format/bitrate pair ('??' when the bitrate is unknown)"""
        print('Available formats:')
        for fmt in formats:
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the cloudcast.

        Returns None after printing the formats when 'listformats' is set.
        Raises ExtractorError when the URL is invalid, the metadata cannot
        be downloaded, the requested format does not exist or none of the
        candidate media URLs can be opened.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1)
        slug = mobj.group(2)
        file_id = uploader + "-" + slug

        # construct API request from the matched groups, so that it does
        # not depend on the URL ending with a slash
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/%s/%s.json' % (uploader, slug)
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON (urlopen returns bytes, json.loads wants text)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        file_url = None
        format_param = None
        if req_format is None or req_format == 'best':
            # take the first format that has a reachable URL
            for candidate in formats:
                file_url = self.check_urls(self.get_urls(formats, candidate))
                if file_url is not None:
                    format_param = candidate
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            file_url = self.check_urls(self.get_urls(formats, req_format))
            format_param = req_format

        if file_url is None:
            raise ExtractorError(u'Unable to find a working media URL')

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': format_param,
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2792
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Extract a single video, all videos of a course, or all courses.

        Which of the three happens depends on the 'course' and 'video'
        query parameters matched by _VALID_URL. Course and root pages are
        expanded by calling self.extract() on every linked page and
        concatenating the returned lists.

        Raises ExtractorError on an invalid URL, on download errors and on
        a metadata XML file lacking the title or videoFile element.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            # NOTE: this playlist dictionary is filled in but not returned;
            # only the results of the referenced video pages are.
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            links = orderedSet(re.findall(r'<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            # NOTE: as for course pages, this dictionary is not returned.
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            # Use _download_webpage (as the course branch does) so that the
            # page is text: the regular expression below is a text pattern
            # and cannot be applied to the raw bytes urlopen().read() gives
            # on Python 3.
            rootpage = self._download_webpage(rootURL, info['id'],
                errnote=u'Unable to download course info page')

            info['title'] = info['id']

            links = orderedSet(re.findall(r'<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
2888
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        Raises ExtractorError when the URL is invalid, when a piece of
        page metadata needed to build the mediaGen request is missing,
        when the metadata cannot be downloaded, or when it contains no
        usable rendition.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # presumably mtv_vt holds the video (song) title and mtv_an the
        # artist name - TODO confirm against a live page
        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        performer = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'title')

        # The song name is optional; fall back to the performer alone
        if song_name:
            video_title = performer + u' - ' + song_name
        else:
            video_title = performer

        # Both values below are concatenated into the mediaGen URL, so
        # they are mandatory (a missing one used to end in a TypeError)
        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri')

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id')

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            raise ExtractorError(u'Unable to find any rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _, _, ext = rendition.attrib['type'].partition('/')
            video_format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except (KeyError, AttributeError):
            # KeyError: missing attribute; AttributeError: no <src> child
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': video_format,
        }

        return [info]
2949
2950
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com"""
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Return a session id: current time in ms followed by two random numbers."""
        millis = int(time.time() * 1000)
        first = random.randint(1000, 1998)
        second = random.randint(1000, 9999)
        return "%d%d%d" % (millis, first, second)

    def _get_file_ID_mix_string(self, seed):
        """Return the character table, shuffled deterministically from seed.

        A linear congruential sequence started at seed selects, one by
        one, which remaining character is moved to the result list.
        """
        remaining = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890")
        shuffled = []
        state = float(seed)
        while remaining:
            state = (state * 211 + 30031) % 65536
            position = int(math.floor(state / 65536 * len(remaining)))
            # every character occurs once, so popping by position removes
            # exactly the character that was selected
            shuffled.append(remaining.pop(position))
        return shuffled

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated list of table indices into the real file id."""
        table = self._get_file_ID_mix_string(seed)
        return ''.join(table[int(token)] for token in fileId.split('*') if token)

    def _real_extract(self, url):
        """Return one info dictionary per segment of the video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        jsondata = self._download_webpage(
            'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id, video_id)

        self.report_extraction(video_id)
        try:
            entry = json.loads(jsondata)['data'][0]

            video_title = entry['title']
            seed = entry['seed']

            requested = self._downloader.params.get('format', None)
            available = list(entry['streamfileids'].keys())

            # 'worst' maps to mp4; everything else is flv, upgraded to
            # hd2 (still saved as flv) for best/unspecified when offered
            stream, ext = 'flv', u'flv'
            if requested == 'worst':
                stream, ext = 'mp4', u'mp4'
            elif (requested is None or requested == 'best') and 'hd2' in available:
                stream = 'hd2'

            fileid = entry['streamfileids'][stream]
            keys = [seg['k'] for seg in entry['segs'][stream]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # The two characters at fileid[8:10] are replaced by the segment
        # number (two upper-case hex digits) for every segment
        head, tail = fileid[0:8], fileid[10:]
        files_info = []
        for index, key in enumerate(keys):
            segment_fileid = '%s%02X%s' % (head, index, tail)
            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, segment_fileid, key),
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
3043
3044
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        page = self._download_webpage(url, video_id)

        # The media URL is stored URL-quoted in the page
        quoted_url = self._search_regex(self.VIDEO_URL_RE, page, u'video URL')
        title = self._html_search_regex(self.VIDEO_TITLE_RE, page, u'title')
        thumb = self._search_regex(self.VIDEO_THUMB_RE, page, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': compat_urllib_parse.unquote(quoted_url),
            'uploader': None,
            'upload_date': None,
            'title': title,
            'ext': 'flv',
            'thumbnail': thumb,
            'description': None,
        }
        return [info]
3083
3084
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the post's video.

        Raises ExtractorError when the URL is invalid, when the video page
        link cannot be found in the post, or when the video page lists no
        video links.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        # Extract update date
        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)
        if upload_date:
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')

        # Extract uploader
        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # Extract title
        # Get the first line for title
        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
            webpage, 'title', default=u'NA')

        # Step 2, Simulate clicking the image box to launch video
        video_page = self._search_regex(r'"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Extract video links of all sizes as (resolution, url) tuples
        pattern = r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        links = re.findall(pattern, webpage)
        if len(links) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Choose the highest resolution. The resolution is captured as a
        # string and must be compared as a number: a plain string sort
        # would rank '720' above '1080'.
        video_url = max(links, key=lambda link: int(link[0]))[1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
3158
class NBAIE(InfoExtractor):
    """Information extractor for nba.com videos"""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The matched path (with its leading slash) is the long video id
        video_id = match.group(1)
        page = self._download_webpage(url, video_id)

        # Only the last path component is used as the public id and as
        # the fallback title
        short_id = video_id.rpartition('/')[2]

        og_title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
            page, 'title', default=short_id)

        # The upload date is not extracted: it isn't there in the HTML
        # that is returned to us.

        description = self._html_search_regex(
            r'<meta name="description" (?:content|value)="(.*?)" />',
            page, 'description', fatal=False)

        return [{
            'id': short_id,
            'url': u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4',
            'ext': 'mp4',
            'title': og_title.replace('NBA.com: ', ''),
            'description': description,
        }]
3192
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        last = offset + self._JUSTIN_PAGE_LIMIT
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, last))

    def _parse_page(self, url, video_id):
        """Download one API page.

        Returns a tuple (number of clips in the response, list of info
        dictionaries for the clips that have a video file URL).
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if not isinstance(response, list):
            # anything but a list is an error object from the API
            raise ExtractorError(u'Justin.tv API: %s' % response.get('error', 'unknown error'))

        entries = []
        for clip in response:
            clip_url = clip['video_file_url']
            if not clip_url:
                continue
            extension = os.path.splitext(clip_url)[1][1:]
            # 'YYYY-MM-DD...' -> 'YYYYMMDD'
            date = clip['start_time'][:10].replace('-', '')
            uploader_id = clip.get('user_id', clip.get('channel_id'))
            clip_id = clip['id']
            entries.append({
                'id': clip_id,
                'url': clip_url,
                'title': clip.get('title', clip_id),
                'uploader': clip.get('channel_name', uploader_id),
                'uploader_id': uploader_id,
                'upload_date': date,
                'ext': extension,
            })
        return (len(response), entries)

    def _extract_chapter(self, url, chapter_id):
        """Return the one-element result list for a chapter URL."""
        webpage = self._download_webpage(url, chapter_id)
        m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
        if not m:
            raise ExtractorError(u'Cannot find archive of a chapter')
        archive_id = m.group(1)

        api = 'http://api.justin.tv' + '/broadcast/by_chapter/%s.xml' % chapter_id
        chapter_info_xml = self._download_webpage(api, chapter_id,
                                         note=u'Downloading chapter information',
                                         errnote=u'Chapter information download failed')
        doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
        # look for the archive the page told us about
        for archive in doc.findall('.//archive'):
            if archive_id == archive.find('./id').text:
                break
        else:
            raise ExtractorError(u'Could not find chapter in chapter information')

        video_url = archive.find('./video_file_url').text
        video_ext = video_url.rpartition('.')[2] or u'flv'

        chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
        chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
        chapter_info = json.loads(chapter_info_json)

        bracket_start = int(doc.find('.//bracket_start').text)
        bracket_end = int(doc.find('.//bracket_end').text)

        # TODO determine start (and probably fix up file)
        #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
        #video_url += u'?start=' + TODO:start_timestamp
        # bracket_start is 13290, but we want 51670615
        self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                        u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

        channel = chapter_info['channel']
        return [{
            'id': u'c' + chapter_id,
            'url': video_url,
            'ext': video_ext,
            'title': chapter_info['title'],
            'thumbnail': chapter_info['preview'],
            'description': chapter_info['description'],
            'uploader': channel['display_name'],
            'uploader_id': channel['name'],
        }]

    def _real_extract(self, url):
        """Extract a whole channel archive, a single broadcast or a chapter."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        # The three named groups are alternatives: exactly one is set
        if mobj.group('chapterid'):
            return self._extract_chapter(url, mobj.group('chapterid'))

        api_base = 'http://api.justin.tv'
        paged = bool(mobj.group('channelid'))
        if paged:
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        results = []
        limit = self._JUSTIN_PAGE_LIMIT
        offset = 0
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            results.extend(page_info)
            # a channel goes on as long as the pages come back full
            if paged and page_count == limit:
                offset += limit
                continue
            break
        return results
3325
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = match.group('id')
        page = self._download_webpage(url, video_id)

        # The src of the second <source> element inside <video> is used
        media_url = self._html_search_regex(
            r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            page, u'video URL', flags=re.DOTALL)

        # Page heading first, <title> element as fallback
        title_patterns = (
            r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>',
        )
        title = self._html_search_regex(title_patterns, page, 'title', flags=re.DOTALL)

        description = self._html_search_regex(
            r'<meta property="og:description" content="(?P<desc>.*?)"',
            page, u'description', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
        }]
3354
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game videos"""
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax, hence the explicit flag
        matched = re.match(cls._VALID_URL, url, re.VERBOSE)
        return matched is not None

    def _real_extract(self, url):
        """Return a one-element list holding the playlist of the game's videos."""
        gameID = re.match(self._VALID_URL, url, re.VERBOSE).group('gameID')

        page = self._download_webpage(self._VIDEO_PAGE_TEMPLATE % gameID, gameID)

        age_gate = re.search('<h2>Please enter your birth date to continue:</h2>', page)
        if age_gate is not None:
            # Ask again through the age check URL, which carries a birth date
            self.report_age_confirmation()
            page = self._download_webpage(self._AGECHECK_TEMPLATE % gameID, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             page, 'game title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'

        # Movies, titles and thumbnails are matched independently and
        # paired up by their order of appearance in the page
        movies = re.finditer(urlRE, page)
        names = re.finditer(namesRE, page)
        thumbs = re.finditer(thumbsRE, page)

        entries = []
        for movie, name, thumb in zip(movies, names, thumbs):
            movie_id = movie.group('videoID')
            movie_title = name.group('videoName')
            movie_url = movie.group('videoURL')
            movie_thumb = thumb.group('thumbnail')
            if not movie_url:
                raise ExtractorError(u'Cannot find video url for %s' % movie_id)
            entries.append({
                'id': movie_id,
                'url': movie_url,
                'ext': 'flv',
                'title': unescapeHTML(movie_title),
                'thumbnail': movie_thumb,
            })
        return [self.playlist_result(entries, gameID, game_title)]
3409
class UstreamIE(InfoExtractor):
    """Information extractor for recorded videos on ustream.tv"""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the recording.

        Raises ExtractorError when the URL does not match _VALID_URL or
        when the title cannot be found in the page.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoID')

        # The media URL is built from the id alone; the page is only
        # downloaded for the metadata
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
        # Wrapped in a list, like the result of every other extractor here
        return [info]
3441
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com and worldstarcandy.com"""
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video."""
        video_id = re.match(self._VALID_URL, url).group('id')

        page = self._download_webpage(url, video_id)

        media_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            page, u'video URL')

        # The container is guessed from the media URL
        extension = 'mp4' if 'mp4' in media_url else 'flv'

        title = self._html_search_regex(r"<title>(.*)</title>",
            page, u'title')

        thumb = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            page, u'thumbnail', fatal=False)

        # Without a thumbnail, take the title from the "candytitles"
        # element instead (WSHH candy videos), if it is present
        if not thumb:
            candy = re.search(r"""candytitles.*>(.*)</span>""", page)
            if candy is not None:
                title = candy.group(1)

        return [{
            'id': video_id,
            'url' : media_url,
            'title' : title,
            'thumbnail' : thumb,
            'ext' : extension,
        }]
3481
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows"""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the show."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        page = self._download_webpage(url, video_id)

        # The show metadata is embedded in the page as a JSON object
        # assigned to gon.show
        raw_json = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            page, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(raw_json)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        media_url = data['akamai_url'] + '&cbr=256'
        # The extension comes from the URL path, ignoring the query string
        media_path = compat_urllib_parse_urlparse(media_url).path
        host = data.get('host', {})
        image = data.get('image', {})

        return [{
            'id': video_id,
            'url': media_url,
            'ext': media_path.rpartition('.')[2],
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3515
3516
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for fmt in formats:
            print(u'%s\t\t%s' % (fmt['ext'], fmt['format']))

    def _specific(self, req_format, formats):
        """Return the first entry whose 'format' equals req_format, or None."""
        for candidate in formats:
            if candidate['format'] == req_format:
                return candidate
        return None

    def _real_extract(self, url):
        """Extract the downloadable formats of the video at url.

        Returns a list of info dictionaries selected according to the
        downloader's 'format' parameter, or None after printing the format
        table when 'listformats' is set.

        Raises ExtractorError for an invalid URL, unparsable or incomplete
        JSON parameters, a page without download links, or a requested
        format that is not available.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The age-verification cookie is sent with the page request.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            # json.loads signals malformed input with ValueError; anything
            # else (e.g. KeyboardInterrupt) must not be swallowed here.
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError as err:
            # The exception has to be converted to text before concatenation.
            raise ExtractorError('Missing JSON parameter: ' + str(err))

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The format name is "<size>-<bitrate>", taken from the fourth
            # directory of the path (e.g. "480p_370k_8004515" -> "480p-370k").
            format_name = "-".join(path.split('/')[4].split('_')[:2])

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format_name,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        # Formats are kept in page order; the first link is treated as the
        # best one and the last as the worst.
        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            selected = self._specific(req_format, formats)
            if selected is None:
                raise ExtractorError(u'Requested format not available')
            return [selected]
3621
3622
3623
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        The id and title come from the URL itself; the media address and the
        upload date are scraped from the page. Raises ExtractorError when the
        URL does not match _VALID_URL.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id, title = url_match.group('videoid', 'title')

        page = self._download_webpage(url, video_id)

        # The address found in the page is URL-unquoted before it is used.
        media_url = compat_urllib_parse.unquote(self._search_regex(
            r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            page, u'video url'))

        # The upload date is optional; it is only normalised when present.
        added_on = self._html_search_regex(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            page, u'upload date', fatal=False)
        upload_date = unified_strdate(added_on) if added_on else added_on

        return [{
            'id': video_id,
            'url': media_url,
            'uploader': None,
            'upload_date': upload_date,
            'title': title,
            'ext': 'flv',
            'format': 'flv',
        }]
3658
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        The watch page supplies the title and the address of the embed page;
        the embed page supplies the media URL. Raises ExtractorError when the
        URL is invalid or no embed page is referenced.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        watch_page = self._download_webpage(url, url_match.group('videoid'))
        title = self._html_search_regex(
            r'<title>(?P<title>.*)</title>', watch_page, u'title').strip()

        embed_match = re.search(
            r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)',
            watch_page)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_url = embed_match.group(0).strip()
        # From here on the numeric id of the embed page replaces the URL slug.
        video_id = embed_match.group('videoid')

        embed_page = self._download_webpage(embed_url, video_id)
        media_url = self._search_regex(
            r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            embed_page, u'video URL')

        return [{
            'id': video_id,
            'url': media_url,
            'title': title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_url,
        }]
3699
class EightTracksIE(InfoExtractor):
    """Information extractor for mixes on 8tracks.com."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Return one info dictionary per track of the mix at url.

        The mix description is read from the page; the tracks are then
        fetched one by one from the "sets" API until it reports the last
        track. Raises ExtractorError when the URL is invalid.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = url_match.group('id')

        page = self._download_webpage(url, playlist_id)
        mix = json.loads(self._search_regex(
            r"PAGE.mix = (.*?);\n", page, u'trax information', flags=re.DOTALL))

        # A random number serves as the play-session identifier.
        session = str(random.randint(0, 1000000000))
        mix_id = mix['id']
        total = mix['tracks_count']
        api_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        tracks = []
        position = 0
        while True:
            position += 1
            reply = json.loads(self._download_webpage(
                api_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(position), total),
                errnote=u'Failed to download song information'))
            track = reply[u'set']['track']
            tracks.append({
                'id': track['id'],
                'url': track['track_file_stream_url'],
                'title': track['performer'] + u' - ' + track['name'],
                'raw_title': track['name'],
                'uploader_id': mix['user']['login'],
                'ext': 'm4a',
            })
            if reply['set']['at_last_track']:
                break
            # The follow-up request names the track that was just received.
            api_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track['id'])
        return tracks
3740
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        The media and thumbnail addresses are built from the video id alone;
        the page is downloaded only for the title and the uploader.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')
        page = self._download_webpage(url, video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            page, u'title')
        uploader = self._html_search_regex(
            r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            page, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': u'http://cdn.keek.com/keek/video/%s' % video_id,
            'ext': 'mp4',
            'title': title,
            'thumbnail': u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id,
            'uploader': uploader
        }]
3768
class TEDIE(InfoExtractor):
    """Information extractor for talks and playlists on www.ted.com."""
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax, hence the re.VERBOSE flag.
        matched = re.match(cls._VALID_URL, url, re.VERBOSE)
        return matched is not None

    def _real_extract(self, url):
        """Return a one-element list holding either a talk or a playlist result."""
        parts = re.match(self._VALID_URL, url, re.VERBOSE)
        if parts.group('type_talk'):
            return [self._talk_info(url)]

        playlist_id = parts.group('playlist_id')
        name = parts.group('name')
        self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
        return [self._playlist_videos_info(url, name, playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        item_pattern = r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        link_pattern = r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'

        page = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        item_matches = re.finditer(item_pattern, page, re.VERBOSE)
        link_matches = re.finditer(link_pattern, page)

        playlist_title = self._html_search_regex(
            r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
            page, 'playlist title')

        # Items and links are walked in lockstep, so the shorter of the two
        # sequences decides how many entries are produced.
        entries = [
            self.url_result('http://www.ted.com%s' % link.group('talk_url'), 'TED')
            for _item, link in zip(item_matches, link_matches)
        ]
        return self.playlist_result(entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        talk_name = re.match(self._VALID_URL, url, re.VERBOSE).group('name')
        page = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % talk_name)
        self.report_extraction(talk_name)

        # If the url includes the language we get the title translated
        title = self._html_search_regex(
            r'<span id="altHeadline" >(?P<title>.*)</span>', page, 'title')
        details = json.loads(self._search_regex(
            r'<script.*?>var talkDetails = ({.*?})</script>', page, 'json data'))
        description = self._html_search_regex(
            r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
            page, 'description', flags = re.DOTALL)
        thumbnail = self._search_regex(
            r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"', page, 'thumbnail')

        return {
            'id': details['id'],
            # The last entry of the stream list is the one that gets used.
            'url': details['htmlStreams'][-1]['file'],
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }
3843
class MySpassIE(InfoExtractor):
    """Information extractor for www.myspass.de."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        The video id is taken from the URL path and used to fetch an XML
        metadata document, from which all other fields are read.

        Raises ExtractorError when the metadata has no 'url_flv' or no
        'title' element.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata; url and title are mandatory
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text

        # Without an explicit format id the file extension doubles as the
        # format name.
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            video_format = extension
        else:
            video_format = format_id_el.text

        # description and thumbnail are optional
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None

        image_preview_el = metadata.find('imagePreview')
        if image_preview_el is not None:
            thumbnail = image_preview_el.text
        else:
            thumbnail = None

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
3897
class SpiegelIE(InfoExtractor):
    """Information extractor for videos on spiegel.de."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        The title is scraped from the page; file name and duration come from
        a per-video XML document.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')
        page = self._download_webpage(url, video_id)

        title = self._html_search_regex(
            r'<div class="module-title">(.*?)</div>', page, u'title')

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_text = self._download_webpage(
            xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        # Only the last child of the document root is considered.
        variant = xml.etree.ElementTree.fromstring(xml_text)[-1]
        filename = variant.findall('./filename')[0].text
        duration = float(variant.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': title,
            'duration': duration,
        }]
3929
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    # The scheme is optional; both http and https are accepted.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        Raises ExtractorError when the URL does not match _VALID_URL.
        Description and uploader are optional and may be None.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # The site prefix is removed from the og:title value.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }

        return [info]
3966
class ARDIE(InfoExtractor):
    """Information extractor for ardmediathek.de and mediathek.daserste.de."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        Picks the highest-quality stream of media type 0 and describes it
        either as an RTMP stream (with 'play_path') or as a plain HTTP
        download.

        Raises ExtractorError when the URL is invalid, the title or a usable
        stream cannot be found, or the stream data has an unexpected form.
        """
        # determine video id from url; a numeric documentId takes precedence
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            url_match = re.match(self._VALID_URL, url)
            if url_match is None:
                raise ExtractorError(u'Invalid URL: %s' % url)
            video_id = url_match.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title_match = re.search(self._TITLE, html)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_match.group('title')

        streams = [found.groupdict() for found in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # A page mentioning "fsk" but offering no streams is reported as
            # time-restricted content.
            if '"fsk"' in html:
                raise ExtractorError(u'This video is only available after 8:00 pm')
            raise ExtractorError(u'No media streams found')

        # choose default media type and highest quality for now
        default_streams = [s for s in streams if int(s["media_type"]) == 0]
        if not default_streams:
            raise ExtractorError(u'No stream of the default media type found')
        stream = max(default_streams, key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            if not stream['video_url'].startswith('mp4:'):
                raise ExtractorError(u'Unexpected RTMP play path: %s' % stream['video_url'])
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            if not stream["video_url"].endswith('.mp4'):
                raise ExtractorError(u'Unexpected video URL: %s' % stream['video_url'])
            info["url"] = stream["video_url"]
        return [info]
4005
class ZDFIE(InfoExtractor):
    """Information extractor for the ZDFmediathek on www.zdf.de."""
    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
    _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
    _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        Raises ExtractorError when the URL is invalid, the page lists no
        media streams, no 'wstreaming' stream of quality 'veryhigh' or '300'
        exists, or the title, stream address or extension cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        html = self._download_webpage(url, video_id)
        streams = [found.groupdict() for found in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            raise ExtractorError(u'No media url found.')

        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
        # Only 'wstreaming' is considered ('hstreaming' - rtsp is not working).
        # 'veryhigh' (dsl2000mbit) is preferred over '300' (dsl1000mbit).
        stream_ = None
        for quality in ('veryhigh', '300'):
            for s in streams:
                if s['quality'] == quality and s['media_type'] == 'wstreaming':
                    stream_ = s
                    break
            if stream_ is not None:
                break
        if stream_ is None:
            raise ExtractorError(u'No stream found.')

        # The stream link points to a document that contains the real address.
        media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')

        self.report_extraction(video_id)
        mobj = re.search(self._TITLE, html)
        if mobj is None:
            raise ExtractorError(u'Cannot extract title')
        title = unescapeHTML(mobj.group('title'))

        # Prefer an mms:// address and fall back to rtsp://.
        mobj = re.search(self._MMS_STREAM, media_link)
        if mobj is None:
            mobj = re.search(self._RTSP_STREAM, media_link)
            if mobj is None:
                raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
        mms_url = mobj.group('video_url')

        mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
        if mobj is None:
            raise ExtractorError(u'Cannot extract extention')
        ext = mobj.group('ext')

        return [{'id': video_id,
                 'url': mms_url,
                 'title': title,
                 'ext': ext
                 }]
4063
class TumblrIE(InfoExtractor):
    """Information extractor for video posts on *.tumblr.com."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        Raises ExtractorError when the post page contains no video source.
        The thumbnail is optional and may be None.
        """
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Whatever form the URL had, the canonical post page is fetched.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The blog name is text from the URL, not a pattern, so it is escaped
        # before being embedded in the regular expression.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (re.escape(blog), video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        # We pick the first poster. The text between "posters" and the list
        # is matched without capturing, leaving the poster URL as the only
        # capture group.
        # NOTE(review): _search_regex is understood to return the first
        # captured group - confirm against its definition.
        video_thumbnail = self._search_regex(r'posters(?:.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext
                 }]
4097
class BandcampIE(InfoExtractor):
    """Information extractor for freely downloadable tracks on *.bandcamp.com."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        Raises ExtractorError when the track page offers no free download.
        """
        url_title = re.match(self._VALID_URL, url).group('title')
        track_page = self._download_webpage(url, url_title)

        # The track page links to a separate page for free downloads.
        free_page = re.search(r'freeDownloadPage: "(.*?)"', track_page)
        if free_page is None:
            raise ExtractorError(u'No free songs found')
        free_page_url = free_page.group(1)

        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             track_page, re.MULTILINE|re.DOTALL).group('id')

        downloads_page = self._download_webpage(free_page_url, track_id,
                                                'Downloading free downloads page')
        # The track description is a JSON list embedded in the page script.
        items_json = re.search(r'items: (.*?),$',
                               downloads_page, re.MULTILINE).group(1)
        track = json.loads(items_json)[0]

        # We pick mp3-320 for now, until format selection can be easily implemented.
        # Using this URL directly reports that the link has expired, so its
        # parts are reused to build a "statdownload" request instead.
        expiring_url = track[u'downloads'][u'mp3-320'][u'url']
        url_parts = re.match(
            r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$',
            expiring_url)
        # Bandcamp builds this URL in the script download_bunde_*.js
        stat_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (url_parts.group('server'), url_parts.group('fsig'), track_id, url_parts.group('ts'))
        stat_page = self._download_webpage(stat_url, track_id, 'Requesting download url')
        # With a correctly generated .rand field the address would be in the
        # "download_url" key; with the fixed value it is in "retry_url".
        final_url = re.search(r'"retry_url":"(.*?)"', stat_page).group(1)

        return [{
            'id': track_id,
            'title': track[u'title'],
            'ext': 'mp3',
            'url': final_url,
            'thumbnail': track[u'thumb_url'],
            'uploader': track[u'artist'],
        }]
4143
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self,url):
        """Return a one-element list with the info dictionary for url.

        Raises ExtractorError when the URL does not match _VALID_URL.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group('id')

        page = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        media_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">', page, u'video URL')
        title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            page, u'title')

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
        }]
4171
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        """Return a one-element list with the info dictionary for url.

        All data is read from the MRSS document of the video, not from the
        page at url.
        """
        video_id = re.match(self._VALID_URL, url).group('id')
        feed = self._download_webpage(
            'http://player.ina.fr/notices/%s.mrss' % video_id, video_id)

        self.report_extraction(video_id)

        media_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            feed, u'video URL')
        title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            feed, u'title')

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
        }]
4198
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        The page is always requested from its canonical address, which is
        rebuilt from the video id. Description and thumbnail are optional.
        """
        video_id = re.match(self._VALID_URL, url).group('id')
        page = self._download_webpage(
            'http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        media_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            page, u'video URL')
        # The meta values may be quoted with either quote character, hence
        # the two alternative capture groups.
        title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            page, u'title')
        description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            page, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            page, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
        }]
4232
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        The page is requested over https from an address rebuilt from the
        video id. Thumbnail and uploader are optional.
        """
        video_id = re.match(self._VALID_URL, url).group('id')
        page = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        media_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            page, u'video URL')
        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')
        # The optional second group keeps a query string out of the image URL.
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            page, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            page, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
4266
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        Three documents are fetched in turn: the photo page (for the secret
        and the metadata), a first XML document (for the node id) and a
        second XML document (for the stream address). Raises ExtractorError
        when the second document holds no stream entry.
        """
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        uploader_id = url_match.group('uploader_id')

        page = self._download_webpage(
            'http://www.flickr.com/photos/' + uploader_id + '/' + video_id,
            video_id)
        secret = self._search_regex(r"photo_secret: '(\w+)'", page, u'secret')

        node_lookup_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        node_xml = self._download_webpage(node_lookup_url, video_id, 'Downloading first data webpage')
        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>', node_xml, u'node_id')

        playlist_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        playlist_xml = self._download_webpage(playlist_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', playlist_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        # The address is the application part followed by the unescaped path.
        media_url = stream.group(1) + unescapeHTML(stream.group(2))

        title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'video title')
        description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'uploader_id': uploader_id,
        }]
4315
class TeamcocoIE(InfoExtractor):
    """Information extractor for teamcoco.com."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        The numeric video id is scraped from the page and then used to fetch
        an XML document holding the media address. Raises ExtractorError
        when the URL does not match _VALID_URL.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # Until the real id is known the URL title identifies the download.
        url_title = url_match.group('url_title')
        page = self._download_webpage(url, url_title)

        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"', page, u'video id')

        self.report_extraction(video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', page, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)
        description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        data_xml = self._download_webpage(
            'http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')
        media_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>', data_xml, u'video URL')

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }]
4354
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        """Return a one-element list with the info dictionary for url.

        Raises ExtractorError when the page contains no media description.
        A missing upload date only produces a warning.
        """
        video_id = re.match(self._VALID_URL, url).group('id')
        page = self._download_webpage(
            'http://xhamster.com/movies/%s/.html' % video_id, video_id)

        media = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', page)
        if media is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(media.group('server')) == 0:
            # Without a server the file entry itself is the (quoted) address.
            media_url = compat_urllib_parse.unquote(media.group('file'))
        else:
            media_url = media.group('server')+'/key='+media.group('file')
        extension = media_url.rpartition('.')[2]

        title = self._html_search_regex(
            r'<title>(?P<title>.+?) - xHamster\.com</title>', page, u'title')

        # No description is extracted: it can't be seen anywhere in the UI.

        stamp = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', page)
        if stamp:
            # Year, month and day are joined into YYYYMMDD form.
            upload_date = stamp.group('upload_date_Y')+stamp.group('upload_date_m')+stamp.group('upload_date_d')
        else:
            upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        uploader_id = self._html_search_regex(
            r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            page, u'uploader id', default=u'anonymous')

        thumbnail = self._search_regex(
            r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            page, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': media_url,
            'ext': extension,
            'title': title,
            'upload_date': upload_date,
            'uploader_id': uploader_id,
            'thumbnail': thumbnail
        }]
4406
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        The track page is requested with 'ax' and 'ts' query parameters and
        its embedded "displayList-data" JSON supplies the key, id, artist
        and song title of the first track. A second request to
        hypem.com/serve/source/<id>/<key>, sent with the cookie received on
        the first response, returns JSON whose "url" entry is the media URL.

        Raises ExtractorError when url does not match _VALID_URL, when either
        JSON document cannot be parsed, or when the page lists no track.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        data = {'ax': 1, 'ts': time.time()}
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # The cookie set here is replayed on the metadata request below.
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        # Well-formed JSON may still lack the expected structure; report that
        # as an extraction error rather than a raw KeyError/IndexError.
        try:
            track = track_list[u'tracks'][0]
        except (KeyError, IndexError, TypeError):
            raise ExtractorError(u'Hypemachine contained no track information.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        # An empty body is supplied so the request carries data; it is a
        # bytes literal because Python 3's urllib rejects text as request
        # data (on Python 2 b"" and "" are the same object type).
        request = compat_urllib_request.Request(serve_url, b"", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
4456
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        The first page only contains a JavaScript redirect; the target it
        names is appended to the URL that was actually fetched and the
        resulting page supplies the title. The media and thumbnail URLs come
        from a form-encoded POST to vbox7.com/play/magare.do, whose response
        is an '&'-separated list of key=value fields.

        Raises ExtractorError when url does not match _VALID_URL or when the
        info response does not contain the two expected values.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # Only the part of the <title> before the first '/' is kept.
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        # urlencode() returns text; Python 3's urllib requires request data
        # to be bytes, and the encoded form is pure ASCII.
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id}).encode('ascii')
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if not info_response:
            raise ExtractorError(u'Unable to extract the media url')
        # Split each field on the first '=' only, so values that themselves
        # contain '=' (e.g. a URL with a query string) are kept intact.
        values = [field.split('=', 1)[1]
                  for field in info_response.split('&') if '=' in field]
        if len(values) < 2:
            raise ExtractorError(u'Unable to extract the media url')
        # The first value is the media URL, the second the thumbnail URL.
        final_url, thumbnail_url = values[:2]

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       ext,
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
4492
class GametrailersIE(InfoExtractor):
    """Information Extractor for gametrailers.com"""
    # Dots in the host name are escaped so they only match literal dots.
    _VALID_URL = r'http://www\.gametrailers\.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'

    def _real_extract(self, url):
        """Return the info dictionary for the video at url.

        The page yields an "mgid" identifier (read from a different
        attribute for full episodes than for videos and reviews). That
        identifier is sent to the site's mrss feed for title, description
        and thumbnail, and to the mediagen feed for the list of media URLs,
        of which the last one is used.

        Raises ExtractorError when url does not match _VALID_URL, when the
        mrss feed lacks the expected elements, or when the mediagen feed
        lists no media URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        video_type = mobj.group('type')
        webpage = self._download_webpage(url, video_id)
        if video_type == 'full-episodes':
            mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
        else:
            mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
        mgid = self._search_regex(mgid_re, webpage, u'mgid')
        data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})

        info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
                                           video_id, u'Downloading video info')
        links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
                                               video_id, u'Downloading video urls info')

        self.report_extraction(video_id)
        # Compiled with re.VERBOSE, so the line breaks and indentation inside
        # the pattern are ignored; re.DOTALL lets '.*' span feed lines.
        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                      <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                      <image>.*
                        <url>(?P<thumb>.*?)</url>.*
                      </image>'''

        m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
        if m_info is None:
            raise ExtractorError(u'Unable to extract video info')
        video_title = m_info.group('title')
        video_description = m_info.group('description')
        video_thumb = m_info.group('thumb')

        m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
        if not m_urls:
            raise ExtractorError(u'Unable to extract video url')
        # They are sorted from worst to best quality
        video_url = m_urls[-1].group('url')

        return {'url':         video_url,
                'id':          video_id,
                'title':       video_title,
                # Videos are actually flv not mp4
                'ext':         'flv',
                'thumbnail':   video_thumb,
                'description': video_description,
                }
4543
def gen_extractors():
    """Build one instance of each supported extractor, in matching order.

    The position in the returned list is significant: the first extractor
    that matches a URL is the one that handles it.
    """
    ordered_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        # Listed last, after every site-specific extractor.
        GenericIE,
    )
    return [extractor_class() for extractor_class in ordered_classes]
4612
def get_info_extractor(ie_name):
    """Look up an extractor class by its name without the 'IE' suffix.

    The name is completed with 'IE' and resolved among this module's
    global names; an unknown name raises KeyError.
    """
    class_name = ie_name + 'IE'
    module_names = globals()
    return module_names[class_name]