]> Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/InfoExtractors.py
Merge tag 'upstream/2013.05.23'
[youtubedl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19 import hashlib
20 import binascii
21 import urllib
22
23 from .utils import *
24
25
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # _ready flips to True once _real_initialize() has run (lazy init);
    # _downloader is injected via set_downloader(); _WORKING marks broken IEs.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Idempotent: _real_initialize() runs at most once per instance.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Default IE name: class name minus the trailing "IE" suffix
        # (subclasses may shadow this with a plain class attribute).
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # note=None -> default progress message; note=False -> silent.
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            # compat_urllib_request / compat_urllib_error / compat_http_client
            # and (presumably) sys come in through `from .utils import *` at
            # the top of the file -- TODO confirm sys is re-exported there.
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # Preserve the original traceback for debugging.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        # Sniff the charset from the Content-Type header; fall back to UTF-8.
        content_type = urlh.headers.get('Content-Type', '')
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                # Request objects expose the URL; plain strings are the URL.
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            # Base64 keeps binary-ish pages safe to print on any terminal.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # 'replace' keeps extraction going even on mis-declared charsets.
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    # Methods for following #608.
    # They set the correct value of the '_type' key so the downloader can
    # distinguish plain videos from URL redirects and playlists.
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info
    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info
    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info
193
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Prefix is empty (first result only), a positive integer, or 'all'.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the search pseudo-URL and delegate to _get_n_results().

        Raises ExtractorError when the query does not match this IE's
        search syntax or requests a non-positive number of results.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            # No prefix: fetch only the first result.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                # Defensive: the regex only admits positive integers.
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp to the service maximum, warning the user.
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # Fixed typo in the error message ("sublclasses" -> "subclasses").
        raise NotImplementedError("This method must be implemented by subclasses")
232
233
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regex (compiled with re.VERBOSE by suitable()/_extract_id()).
    # Group 1 is the whole optional scheme/host/path prefix; group 2 is the
    # video ID itself.
    _VALID_URL = r"""^
                 (
                     (?:https?://)?                                       # http(s):// (optional)
                     (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                        tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                     (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                     (?:                                                  # the various things that can precede the ID:
                         (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                         |(?:                                             # or the v= param in all its forms
                             (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                             (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                             (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                             v=
                         )
                     )?                                                   # optional -> youtube.com/xxxx is OK
                 )?                                                       # all until now is optional -> you can pass the naked ID
                 ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                 (?(1).+)?                                                # if we found the ID, everything can follow
                 $"""
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension (anything absent defaults to 'flv').
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HEIGHTxWIDTH" string, used only for display in _print_formats.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL; defer them to YoutubePlaylistIE.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check for available subtitles."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download a subtitle track."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return {lang_code: track_name} of available subtitle tracks.

        NOTE: on failure this returns a (error_message, None) *tuple*
        instead of a dict; callers distinguish by isinstance(..., tuple).
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        # Each <track> element carries name="..." and lang_code="..." attrs.
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        # Print-only helper; returns None.
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Language preference: explicit option > English > first available.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        # Like _extract_subtitle(), but fetches every available language.
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        # Plain print (not to_screen): output is meant for --list-formats.
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set interface language, log in, and confirm age (all best-effort
        except age confirmation, which raises on failure)."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language (forces English pages so the regexes below match).
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the anti-forgery tokens out of the login form.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in -- field names/values mirror Google's ServiceLogin form;
        # '_utf8' is presumably a UTF-8 canary character the form expects.
        login_form_strs = {
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present, authentication failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url': '/',
                'action_confirm': 'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            # Response body is ignored; the useful effect is the cookie set
            # as a side effect of the POST.
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        """Return the 11-char-style video ID (regex group 2) from any
        supported YouTube URL shape, or raise ExtractorError."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL (needed by rtmpdump downloads).
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Unescape the JS-escaped slashes ("\/").
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several 'el' variants, since some videos only
        # answer on particular ones; stop at the first that yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
            else:
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            raise ExtractorError(u'Unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators to spaces, then parse into YYYYMMDD.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            # Fall back to the <meta name="description"> tag.
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            # List-only mode: print and bail out without extracting.
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token (guaranteed present by the loop above; currently unused below)
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Build itag -> URL map from the comma-separated stream list.
            url_map = {}
            for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    # NOTE(review): assumes a 'sig' field is always present
                    # alongside itag/url -- would KeyError otherwise; confirm.
                    url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
                    if not 'ratebypass' in url: url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            # Keep quality ordering: filter the preference list by what the
            # server actually offered.
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
        else:
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # One result dict per selected format.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
719
720
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the disclaimer page and POST the family-filter form.

        The useful effect is the cookies these two requests set; without
        them age-restricted videos are not served. Raises ExtractorError
        on network failure.
        """
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract the media URL, title and uploader for a Metacafe video.

        BUG FIX: removed the spurious .decode('utf-8') calls on video_id,
        video_url, video_uploader, video_title and video_extension. These
        values are text already (_download_webpage returns decoded unicode,
        and re groups of unicode are unicode), so .decode() crashed with
        AttributeError on Python 3 and performed a bogus implicit
        ASCII-encode round-trip on Python 2 for non-ASCII titles.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube; if so, delegate to YoutubeIE.
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            # Old-style page: media URL directly in the markup.
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # New-style page: media data inside the flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                raise ExtractorError(u'Unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            # Unescape JSON-escaped slashes ("\/").
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
816
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        """Extract the highest-quality stream for a Dailymotion video URL."""
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The path component looks like "<id>_<slug>?params"; keep the id only.
        video_id = url_match.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Fetch the page with the family filter disabled so age-gated
        # videos are served.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        flashvars_match = re.search(r'\s*var flashvars = (.*)', webpage)
        if flashvars_match is None:
            raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(flashvars_match.group(1))

        # Quality keys in descending order of preference; take the first
        # that appears in the flashvars blob.
        max_quality = None
        for candidate in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if candidate in flashvars:
                max_quality = candidate
                self.to_screen(u'Using %s' % candidate)
                break
        if max_quality is None:
            raise ExtractorError(u'Unable to extract video URL')

        stream_match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if stream_match is None:
            raise ExtractorError(u'Unable to extract video URL')

        # Unescape JSON-escaped slashes ("\/") after URL-decoding.
        video_url = compat_urllib_parse.unquote(stream_match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        title_match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(title_match.group('title'))

        # Uploader: prefer the regular owner byline, fall back to the
        # "official user" markup, and warn if neither matches.
        video_uploader = None
        owner_match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if owner_match is not None:
            video_uploader = owner_match.group(1)
        else:
            official_match = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official_match is None:
                self._downloader.report_warning(u'unable to extract uploader nickname')
            else:
                video_uploader = official_match.group(1)

        # Upload date appears as DD-MM-YYYY; normalise to YYYYMMDD.
        video_upload_date = None
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is not None:
            video_upload_date = date_match.group(3) + date_match.group(2) + date_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
891
892
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extracion process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Extract a single Photobucket video.

        Tries the JSON blob embedded in the page's javascript first and
        falls back to scraping <link rel="video_src"> and <title> markup.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')

        # Extension is captured from the URL itself (flv or mp4).
        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
        if mobj is not None:
            info = json.loads(mobj.group('json'))
            # NOTE(review): creationDate is assumed to be a unix timestamp
            # in seconds -- confirm it is not milliseconds.
            return [{
                'id': video_id,
                'url': info[u'downloadUrl'],
                'uploader': info[u'username'],
                'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
                'title': info[u'title'],
                'ext': video_extension,
                'thumbnail': info[u'thumbUrl'],
            }]

        # We try looking in other parts of the webpage
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # The .decode('utf-8') calls below are Python 2 str->unicode
        # conversions; the page bytes are assumed UTF-8 encoded.
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
        }]
954
955
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com."""
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        """Extract a video from screen.yahoo.com.

        If the page exposes a YUI Media CONTENT_ID, the newer YQL JSON API
        is used; otherwise the older cosmos.bcst.yahoo.com mRSS endpoints
        are scraped.  Returns the standard info dictionary.
        Raises ExtractorError when the URL or the remote data is unusable.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)

        if m_id is None:
            # TODO: Check which url parameters are required
            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                        <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                        <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                        <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
                        '''
            self.report_extraction(video_id)
            m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
            if m_info is None:
                raise ExtractorError(u'Unable to extract video info')
            video_title = m_info.group('title')
            video_description = m_info.group('description')
            video_thumb = m_info.group('thumb')
            video_date = m_info.group('date')
            # The feed reports "MM/DD/YYYY ..."; normalize to YYYYMMDD.
            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

            # TODO: Find a way to get mp4 videos
            rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
            # BUGFIX: test for a failed match *before* touching the groups.
            # The previous code called m_rest.group(...) first and therefore
            # crashed with AttributeError instead of raising ExtractorError.
            if m_rest is None:
                raise ExtractorError(u'Unable to extract video url')
            video_url = m_rest.group('url')
            video_path = m_rest.group('path')

        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            # Strip the JSONP callback wrapper to get at the JSON payload.
            json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
            info = json.loads(json_str)
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            meta = res[u'meta']
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it

        info_dict = {
            'id': video_id,
            'url': video_url,
            'play_path': video_path,
            'title':video_title,
            'description': video_description,
            'thumbnail': video_thumb,
            'upload_date': video_date,
            'ext': 'flv',
        }
        return info_dict
1023
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract a Vimeo video from its page's embedded config JSON.

        Normalizes the URL (adds https://, unwraps play_redirect_hls
        links), downloads the page, parses the flash config and picks the
        best available codec/quality combination.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        # BUGFIX: was a bare "except:", which also swallowed
        # KeyboardInterrupt/SystemExit.  Only IndexError (split found no
        # config marker) and ValueError (invalid JSON) can occur here.
        except (IndexError, ValueError):
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best bucket that has at least one entry.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
1125
1126
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return the raw response body.

        Raises ExtractorError on network failure or an invalid URL.
        """
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            # urlopen raises ValueError for unknown URL types.
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex*, and map match groups to a dict.

        matchTuples is a list of (group_index, key, error_message)
        triples; a missing group raises ExtractorError(error_message).
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the videothek javascript to locate the rtmp live stream.

        NOTE(review): video_url is computed but never returned, and
        _real_extract discards this method's result entirely -- live
        streams appear to be effectively unsupported; confirm intent
        before relying on this path.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path', u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url', u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an arte+7 page through its chain of XML descriptors.

        Returns a complete info dictionary for the HD quality stream.
        """
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date', u'could not extract video date: %s' % url),
                (4, 'url', u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': unified_strdate(info.get('date')),
            'title': info.get('title').decode('utf-8'),
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live pages (index-N.html) take a separate path; see the NOTE on
        # extractLiveStream -- this branch currently returns None.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1246
1247
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn on real downloads only; test mode stays quiet.
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url.

        Issues a HEAD request (falling back to GET on a 405 response) and
        returns the final URL when it differs from *url*, else False.
        """
        class HeadRequest(compat_urllib_request.Request):
            # Force the HEAD verb so only headers are transferred.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: a HEAD request has no payload.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying as GET.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        """Best-effort extraction: follow redirects, then scrape common
        flash-player embed patterns (JW Player et al.) from the page."""
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
1381
1382
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries."""
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        collected_ids = []
        page_idx = 0
        # The effective limit shrinks to the API-reported total once the
        # first response arrives.
        limit = n

        while 50 * page_idx < limit:
            self.report_download_page(query, page_idx + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), 50 * page_idx + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            collected_ids.extend(video['id'] for video in api_response['items'])

            limit = min(n, api_response['totalItems'])
            page_idx += 1

        # Trim any overshoot from the last page.
        del collected_ids[n:]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % vid, 'Youtube') for vid in collected_ids]
        return self.playlist_result(videos, query)
1425
1426
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _MAX_RESULTS = 1000
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        playlist = {
            '_type': 'playlist',
            'id': query,
            'entries': [],
        }

        for page_no in itertools.count(1):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), page_no * 10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(page_no))

            # Every result heading links straight to the video page.
            playlist['entries'].extend(
                {'_type': 'url', 'url': hit.group(1)}
                for hit in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage))

            # Stop once enough results were gathered or no next page exists.
            if page_no * 10 > n or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                return playlist
1457
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the Yahoo video search JSON endpoint (30 results
        per page) and returns a playlist dict of url results.
        """
        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            # 'm' holds pagination metadata; 'results' the HTML snippets.
            m = info[u'm']
            results = info[u'results']

            for (i, r) in enumerate(results):
                if (pagenum * 30) + i >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                # BUGFIX: a result without a recognizable video link used to
                # crash with AttributeError; raise ExtractorError instead,
                # consistent with the other extractors.
                if mobj is None:
                    raise ExtractorError(u'Unable to extract video url')
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            # Stop when enough entries were collected or the last page was
            # reached (m['last'] is the index of the final result shown).
            if (pagenum * 30 + i >= n) or (m[u'last'] >= (m[u'total'] - 1)):
                break

        return res
1491
1492
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                           |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax, so the default
        # suitable() (which compiles without re.VERBOSE) cannot be used.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Fetch the playlist from the GData API page by page and return a
        single playlist result with the videos in playlist order."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Keep (position, url) pairs so the final list can be sorted
            # into the playlist's intended order after all pages arrive.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            # A short page means there is nothing left to fetch.
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1558
1559
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the unique video ids linked from *page* markup, in order."""
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        return ids_in_page

    def _real_extract(self, url):
        """Collect every video id of a channel and return them as a playlist.

        The first page is plain HTML; subsequent pages come from the
        channel_ajax endpoint as JSON wrapping HTML fragments.
        """
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum = pagenum + 1

                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                # AJAX responses carry the markup under 'content_html'.
                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # No load-more button in the widget html means last page.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
1617
1618
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        """Resolve a user URL into a playlist of that user's uploads."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # The GData API caps each response at _GDATA_PAGE_SIZE entries, so
        # keep requesting pages until one comes back short.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Collect this page's ids, dropping duplicates while keeping
            # first-seen order.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                vid = match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)

            video_ids.extend(page_ids)

            # A page shorter than the page size is necessarily the last
            # one, so no further request is needed.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        watch_urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(watch_url, 'Youtube') for watch_url in watch_urls]
        return [self.playlist_result(url_results, playlist_title = username)]
1675
1676
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Resolve a blip.tv user URL into a playlist of their episodes.

        Looks up the numeric user id on the user's page, then pages
        through the mobile episode-list AJAX endpoint (_PAGE_SIZE
        episodes per request) until a short page signals the end.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # BUGFIX: deduplicate on the unescaped id.  The old code
                # tested the RAW href against a list of UNESCAPED entries,
                # so ids containing HTML entities were never deduplicated.
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
1735
1736
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Extract the direct file URL from a depositfiles.com page.

        POSTs the 'Free download' form and scrapes the resulting page for
        the fileshare form action URL and the file title.
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            # The site shows an <strong>Attention...</strong> notice when the
            # download is restricted (rate limit, paid-only file, etc.).
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # NOTE(review): .decode('utf-8') assumes Python 2 byte strings here
        # (webpage was read raw, without decoding) — these calls would fail
        # on Python 3 str objects; verify against the compat layer.
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension.decode('utf-8'),
        }]
1784
1785
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in to Facebook if credentials were supplied.

        Credentials come either from --username/--password or, with
        --netrc, from the 'facebook' machine entry in ~/.netrc. Any
        failure is reported as a warning and extraction proceeds
        anonymously.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # If the response still contains a login form, the login failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                # Fix: corrected misspelled 'exceded' in the warning message.
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video URL, title, duration and thumbnail from the
        Flash player parameters embedded in the video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters are the JSON between these two JS fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source, fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
1882
1883
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract video info from a blip.tv URL.

        /play/ URLs are first resolved (via redirect) to a canonical
        /a/a-<id> URL and re-extracted recursively. Otherwise the page is
        queried with skin=json; a 'video/*' Content-Type means a direct
        file download, anything else is parsed as JSON metadata.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # Follow the redirect; the target URL carries the file id in
            # its fragment part.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different (simpler) data to the iTunes user agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # NOTE(review): .decode('UTF-8') assumes a Python 2 byte
                # string; on Python 3 str this would raise — verify against
                # the compat layer.
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Reuse the already-open handle so the body is not
                    # fetched twice (see 'urlhandle' in the IE docs above).
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                # urlh is still open from the request above.
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the metadata in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    # ValueError is caught below and re-raised as ExtractorError.
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
1977
1978
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self,data, key):
        """RC4-decrypt ``data`` (bytes) with ``key`` and return a str."""
        x = 0
        box = list(range(256))
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
        x = 0
        y = 0
        out = ''
        for char in data:
            x = (x + 1) % 256
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
        return out

    def __md5(self,s):
        """Return the hex MD5 digest of ``s`` as bytes."""
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self,url):
        """Extract video info from a myvideo.de watch page.

        First tries a plain <source> tag; otherwise downloads the
        encrypted player XML (_encxml), RC4-decrypts it with a key derived
        from the static GK constant and the video id, and pulls the RTMP
        (or HTTP/HLS) URL out of the decrypted data.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Static, doubly base64-encoded key material for the RC4 key.
        GK = (
          b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
          b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
          b'TnpsbA0KTVRkbU1tSTRNdz09'
        )

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Simple case: the page exposes a direct <source> URL.
        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            mobj = re.search('<title>([^<]+)</title>', webpage)
            if mobj is None:
                raise ExtractorError(u'Unable to extract title')
            video_title = mobj.group(1)

            mobj = re.search('[.](.+?)$', video_url)
            if mobj is None:
                # Fix: corrected misspelled 'extention' in the error message.
                raise ExtractorError(u'Unable to extract extension')
            video_ext = mobj.group(1)

            return [{
                'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': u'flv',
            }]

        # try encxml
        mobj = re.search('var flashvars={(.+?)}', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video')

        params = {}
        encxml = ''
        sec = mobj.group(1)
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                params[a] = b
            else:
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            self._downloader.report_warning(u'avoiding MTV player')
            xmldata_url = (
                'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
            ) % video_id

        # get enc data
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        # RC4 key = md5(b64decode(b64decode(GK)) + md5(video_id))
        sk = self.__md5(
            base64.b64decode(base64.b64decode(GK)) +
            self.__md5(
                str(video_id).encode('utf-8')
            )
        )
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        if mobj is None:
            raise ExtractorError(u'unable to extract rtmpurl')
        video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1))
        if 'myvideo2flash' in video_rtmpurl:
            self._downloader.report_warning(u'forcing RTMPT ...')
            video_rtmpurl = video_rtmpurl.replace('rtmpe://', 'rtmpt://')

        # extract non rtmp videos
        if (video_rtmpurl is None) or (video_rtmpurl == ''):
            mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
            if mobj is None:
                raise ExtractorError(u'unable to extract url')
            video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        mobj = re.search('source=\'(.*?)\'', dec_data)
        if mobj is None:
            raise ExtractorError(u'unable to extract swfobj')
        video_file = compat_urllib_parse.unquote(mobj.group(1))

        if not video_file.endswith('f4m'):
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
        else:
            # Fix: video_filepath was referenced here without ever being
            # defined, raising a NameError for every f4m (HLS) video.
            # Extract it from the decrypted player data instead.
            mobj = re.search('filepath=\'(.*?)\'', dec_data)
            if mobj is None:
                raise ExtractorError(u'unable to extract filepath')
            video_filepath = mobj.group(1)
            video_playpath = ''
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        mobj = re.search('swfobject.embedSWF\(\'(.+?)\'', webpage)
        if mobj is None:
            raise ExtractorError(u'unable to extract swfobj')
        video_swfobj = compat_urllib_parse.unquote(mobj.group(1))

        mobj = re.search("<h1(?: class='globalHd')?>(.*?)</h1>", webpage)
        if mobj is None:
            raise ExtractorError(u'unable to extract title')
        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_rtmpurl,
            'tc_url': video_rtmpurl,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
            'play_path': video_playpath,
            'video_file': video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url': video_swfobj,
        }]
2138
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                         (?P<showname>thedailyshow|colbertnation)\.com/
                        (full-episodes/(?P<episode>.*)|
                         (?P<clip>
                             (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                             |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrates, lowest-to-highest preference order is implicit in
    # turls below (last entry wins as the default).
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the re.VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        """Print the available format codes with extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract one info dict per video part of the episode/clip.

        Resolves shortnames/newest-episode URLs first, then walks the MRSS
        show index and each part's mediagen config to pick a bitrate and
        rewrite the RTMP URL into a plain HTTP one.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('shortname'):
            # Abbreviated form (:tds, :colbert, ...): rewrite to the
            # full-episodes page and re-match.
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # Empty episode part means "download the newest episode";
            # the site then redirects to the specific episode URL.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        # One <item> per video part of the episode.
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            # Collect (bitrate, rtmp_url) pairs for every rendition.
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP URL into a plain HTTP download URL on the
            # Limelight CDN (the RTMP servers require special handling).
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2305
2306
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Extract the video via the OpenGraph metadata on the page and the
        player configuration JSON it references."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = self._download_webpage(url, showName)

        # Fix: each search below previously dereferenced .group(1) without a
        # None check, crashing with AttributeError when the page layout
        # changed; raise a proper ExtractorError instead.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        if descMatch is None:
            raise ExtractorError(u'Unable to extract description')
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        if imgMatch is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            raise ExtractorError(u'Unable to extract player URL')
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config URL in its query string.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            raise ExtractorError(u'Unable to extract config URL')
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        configJSON = self._download_webpage(configUrl, showName,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2360
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Extract an f4f fragment URL via the moogaloop metadata XML and
        the Adobe HDS (f4m) manifest it references."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # Filled in incrementally below.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # Elements live in the Adobe f4m XML namespace.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            # NOTE: shadows the page video_id with the manifest's id on purpose;
            # the URL built below needs the manifest id.
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            raise ExtractorError(u'Invalid manifest file')

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2422
2423
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Scrape video URL, title and thumbnail from an xvideos page."""
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The video URL is URL-encoded inside the flashvars.
        m = re.search(r'flv_url=(.+?)&', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(m.group(1))

        # Title comes from the <title> tag, minus the site suffix.
        m = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = m.group(1)

        # Thumbnail: the whole matched image URL is used (group 0).
        m = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract video thumbnail')
        video_thumbnail = m.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2473
2474
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com.

    Resolves the track page to its numeric id via the public resolve.json
    API, then fetches the stream definitions and returns the 128kbps MP3
    stream URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report that the track id is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Uploader and song slug both come straight from the URL.
        uploader, slug_title = m.group(1), m.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
        info = json.loads(info_json)

        track_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(track_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')
        streams = json.loads(stream_json)

        return [{
            'id': info['id'],
            'url': streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date': unified_strdate(info['created_at']),
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
2531
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
       """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve the set URL via resolve.json and return one info dict
        per track in the set."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            # API-level errors: report each and abort (returns None).
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id': video_id,
                'url': mediaURL,
                'uploader': track['user']['username'],
                'upload_date': unified_strdate(track['created_at']),
                'title': track['title'],
                'ext': u'mp3',
                'description': track['description'],
            })
        return videos
2594
2595
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Scrape the RTMP video URL, title and description from an InfoQ
        presentation page."""
        if re.match(self._VALID_URL, url) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The RTMP path is stored base64-encoded in the page's JavaScript.
        m = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(m.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # The title is assigned to a JS variable on the page.
        m = re.search(r'contentTitle = "(.*?)";', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = m.group(1)

        # The description is optional; fall back to a placeholder.
        m = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        video_description = m.group(1) if m is not None else u'No description available.'

        # Derive id and extension from the last component of the video URL.
        video_id, extension = video_url.split('/')[-1].split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2642
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        # NOTE(review): file_url is assigned but never used in this method.
        file_url = None
        try:
            bitrate_list = jsonData[fmt]
            # 'best' (or an unknown bitrate) falls back to the highest one.
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            # jsonData[fmt] is already the flat URL list in that case.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                # Probe the URL; the first one that opens without error wins.
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        # No URL in the list was reachable.
        return None

    def _print_formats(self, formats):
        # Print one "format<TAB>bitrate<TAB>[ext]" line per available variant.
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        # NOTE(review): the .decode('utf-8') calls in this method assume
        # Python 2 byte strings; on Python 3 regex groups are already str
        # and this would raise AttributeError. _WORKING is False -- confirm
        # before re-enabling this extractor.
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Try every format until one of its URLs is actually reachable.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
2747
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Matches three page kinds: a specific video (course + video query args),
    # a course page (course arg only), and the site root (neither).
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        # Dispatch on which query parameters matched: a video page yields a
        # single info dict; course and root pages recurse via self.extract().
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # Title and video file name live in the per-video metadata XML.
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                note='Downloading course info page',
                errnote='Unable to download course info page')

            # Title: first <h1>, falling back to the course id.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Every linked VideoPage is re-dispatched through self.extract().
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Recurse into every linked CoursePage found on the home page.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
2848
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract the highest-quality rendition of an MTV video.

        Reads song name, performer and playlist ids from <meta> tags on the
        page, then queries the mediaGen service for the rendition list and
        picks the last (highest quality) entry.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # NOTE(review): the .decode('iso-8859-1') calls below assume Python 2
        # byte strings; on Python 3 regex groups are already str and would
        # raise AttributeError -- confirm before relying on py3 support here.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract song name')
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract performer')
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # BUG FIX: message previously read u'Unable to mtvn_uri',
            # missing the word "extract".
            raise ExtractorError(u'Unable to extract mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract content id')
        content_id = mobj.group(1)

        # Ask the mediaGen endpoint for the available renditions (XML).
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            # MIME type like "video/mp4" -> extension after the slash.
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
2917
2918
class YoukuIE(InfoExtractor):
    # Information extractor for v.youku.com.
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a pseudo-random session id from the clock and two random ints."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Shuffle the fixed alphabet with Youku's seeded generator: advance a
        # linear-congruential sequence from `seed` and draw characters from
        # `source` without replacement. Exact arithmetic must not change.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # Decode the '*'-separated index string into the real file id by
        # looking each index up in the seed-shuffled alphabet.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the requested format onto Youku stream names; anything other
            # than best/worst falls back to flv.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            # One downloadable URL per segment; the segment number is spliced
            # into the fileid as two hex digits.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3011
3012
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Extract the flv URL, title and thumbnail from a video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Everything we need is embedded in the page markup.
        webpage = self._download_webpage(url, video_id)

        def _first_group(pattern, what):
            # Search the page; fail loudly when the pattern is missing.
            found = re.search(pattern, webpage)
            if found is None:
                raise ExtractorError(u'Unable to extract %s' % what)
            return found.group(1)

        video_url = compat_urllib_parse.unquote(_first_group(self.VIDEO_URL_RE, u'video url'))
        video_title = _first_group(self.VIDEO_TITLE_RE, u'video title')
        video_thumbnail = _first_group(self.VIDEO_THUMB_RE, u'video thumbnail')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3056
3057
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report downloading the post entry page."""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the extracted entry date."""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the extracted uploader."""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the extracted title."""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction from the video page."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        # Extract update date (optional; stays None when not found)
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader (optional; stays None when not found)
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video page URL')

        video_page = mobj.group(1)
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
        self.report_extract_vid_page(video_page)


        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
3167
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Derive the CDN mp4 URL from the page path and scrape metadata."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The mp4 URL can be built directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # First (HTML-unescaped) group of `rexp` in the page, or `default`.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUG FIX: this key was misspelled 'uploader_date'; the standard
            # metadata field consumed downstream is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3202
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Three URL shapes: a channel (paginated archive), /b/<id> (a single
    # broadcast), and /c/<id> (a chapter within a broadcast).
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # The API returns a dict (not a list) on error.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            # Clips without a file URL are skipped (hence "*valid* items").
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Channel archives are paginated; handled by the loop below.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            # Chapter: cross-reference the archive XML with the twitch
            # kraken metadata, then return early with a single info dict.
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # for/else: the else fires only when no archive matched.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        # Page through the API; single-broadcast URLs do exactly one pass.
        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means we reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3335
class FunnyOrDieIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Extract the mp4 URL, title and description from a video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        source_match = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not source_match:
            raise ExtractorError(u'Unable to find video information')
        video_url = unescapeHTML(source_match.group('url'))

        # Prefer the player headline; fall back to the page <title>.
        title_match = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if title_match is None:
            title_match = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if title_match is None:
                raise ExtractorError(u'Cannot find video title')
        title = clean_html(title_match.group('title'))

        # Description is optional.
        desc_match = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(desc_match.group('desc')) if desc_match else None

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }]
3373
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers."""
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL uses verbose regex syntax, so re.VERBOSE is required.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return a playlist of every trailer on a game's video page."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')
        # Request the age-gated video page with a fixed old-enough birthday.
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'

        videos = []
        # Walk movie entries, display titles and thumbnails in lockstep.
        for vid, vtitle, thumb in zip(re.finditer(urlRE, webpage),
                                      re.finditer(namesRE, webpage),
                                      re.finditer(thumbsRE, webpage)):
            video_id = vid.group('videoID')
            video_url = vid.group('videoURL')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(vtitle.group('videoName')),
                'thumbnail': thumb.group('thumbnail'),
            })
        return [self.playlist_result(videos, gameID, game_title)]
3418
class UstreamIE(InfoExtractor):
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Extract title, uploader and thumbnail of a recorded stream."""
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The flv can be fetched straight from the CDN given the id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)
        try:
            # Each .group() call raises AttributeError when its pattern is
            # missing from the page; all three are mapped to one error below.
            title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
            uploader_match = re.search(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
                                       webpage, re.DOTALL)
            uploader = unescapeHTML(uploader_match.group('uploader').strip())
            thumb = re.search(r'<link rel="image_src" href="(?P<thumb>.*?)"', webpage).group('thumb')
        except AttributeError:
            raise ExtractorError(u'Unable to extract info')
        return {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
            'thumbnail': thumb,
        }
3448
class WorldStarHipHopIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        # The flash player receives the file URL via so.addVariable().
        _src_url = r'so\.addVariable\("file","(.*?)"\)'

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        webpage_src = self._download_webpage(url, video_id)

        mobj = re.search(_src_url, webpage_src)

        if mobj is not None:
            video_url = mobj.group(1)
            # Extension is guessed from the URL; everything non-mp4 is flv.
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            raise ExtractorError(u'Cannot find video url for %s' % video_id)

        mobj = re.search(r"<title>(.*)</title>", webpage_src)

        if mobj is None:
            raise ExtractorError(u'Cannot determine title')
        title = mobj.group(1)

        mobj = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            # WSHH "candy" pages carry the title in a candytitles span and
            # have no image_src thumbnail.
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
                    'id': video_id,
                    'url' : video_url,
                    'title' : title,
                    'thumbnail' : thumbnail,
                    'ext' : ext,
                    }]
        return results
3497
class RBMARadioIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Extract show metadata from the inline gon.show JSON blob."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        blob_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not blob_match:
            raise ExtractorError(u'Cannot find metadata')
        json_data = blob_match.group(1)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbit/s variant; the extension comes from the path.
        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]

        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3532
3533
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the format dict whose 'format' equals req_format, or None."""
        for x in formats:
            if x["format"] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # The site gates content behind an age check; pre-set the cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date (optional: warn instead of failing)
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = unified_strdate(result.group('date').strip())

        # Get the video uploader (optional: warn instead of failing)
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html(video_uploader)

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if len(links) == 0:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 5th path component encodes "<size>_<bitrate>_<id>".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUGFIX: this previously tested the stale ``result`` variable, so
            # an unknown requested format silently returned [None] instead of
            # raising the intended error.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
3648
3649
3650
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The title is taken from the URL itself, not the page.
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUGFIX: the message previously read 'Unable to extract video
            # title', which misreported what actually failed here.
            raise ExtractorError(u'Unable to extract video upload date')
        upload_date = unified_strdate(result.group('date'))

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3689
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The landing page only provides the title.
        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        # The actual stream location lives on a separate embed page.
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{'id': video_id,
                 'url': source_match.group('source'),
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3734
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix description is embedded in the page as a JSON object.
        mobj = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mobj:
            raise ExtractorError(u'Cannot find trax information')
        mix = json.loads(mobj.group(1))

        # A random session id lets us walk the playlist via the JSON API,
        # one track per request.
        session = str(random.randint(0, 1000000000))
        mix_id = mix['id']
        track_count = mix['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        entries = []
        for index in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(index+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': mix['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
3778
class KeekIE(InfoExtractor):
    """Information extractor for keek.com clips."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Media and thumbnail URLs are derived directly from the id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        title = unescapeHTML(title_match.group('title'))

        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(uploader_match.group('uploader'))

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }]
3802
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is a verbose pattern and must be
        # matched with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on which alternative of _VALID_URL matched: a single
        # talk or a playlist of talks.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Matches one <li> entry per talk; only video_id and mediaSlug are
        # named because the other groups are not used.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        # Matches the talk's page URL and display name; iterated in lockstep
        # with video_RE below, so both are assumed to appear in the same order.
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        # Each talk is handed back to the TED extractor as its own url_result.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # The talkDetails script block carries the numeric id and the media
        # slug used to build the download URL.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
3881
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUGFIX: this branch referenced the undefined name ``ext``
            # (NameError at runtime); fall back to the file extension.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
3935
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # A per-video XML document lists the renditions; the last entry
        # carries the filename and duration we use.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
3968
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        url_match = re.search(r'file: "(.*?)",', webpage)
        if not url_match:
            raise ExtractorError(u'Unable to find video url')
        video_url = url_match.group(1)

        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find video title')
        # Strip the site branding from the OpenGraph title.
        title = unescapeHTML(title_match.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional.
        desc_match = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(desc_match.group('desc')) if desc_match else None

        uploader_match = re.search(r'By:.*?(\w+)</a>', webpage)
        uploader = clean_html(uploader_match.group(1)) if uploader_match else None

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }]
4015
class ARDIE(InfoExtractor):
    """Information extractor for the ARD Mediathek."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url: a numeric documentId query parameter
        # takes precedence over the id embedded in the path
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = re.match(self._VALID_URL, url).group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [match.groupdict() for match in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        stream = max((s for s in streams if int(s["media_type"]) == 0),
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            # rtmpdump needs both the application URL and the play path
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
4054
class TumblrIE(InfoExtractor):
    """Information extractor for Tumblr video posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        video_id = match.group('id')
        blog = match.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            # Not every post contains a video; report and return no results.
            self.to_screen("No video found")
            return []
        video_url = video.group('video_url')
        ext = video.group('ext')

        # We pick the first poster as the thumbnail.
        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'
        thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        title = unescapeHTML(re.search(r'<title>(?P<title>.*?)</title>', webpage, re.DOTALL).group('title'))

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'thumbnail': thumb,
                 'ext': ext
                 }]
4088
class BandcampIE(InfoExtractor):
    """Information extractor for free Bandcamp track downloads."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            # BUGFIX: error message previously read 'founded'.
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # Renamed from ``id`` to avoid shadowing the builtin.
        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': track_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]
4134
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        url_match = re.search(r'<source src="(.+)" type="video/mp4">', webpage)
        if url_match is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = url_match.group(1)

        title_match = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title_match.group(1),
        }]
4165
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        # All metadata is served from a per-video MRSS document.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        url_match = re.search(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', webpage)
        if url_match is None:
            raise ExtractorError(u'Unable to extract media URL')

        title_match = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')

        return [{
            'id': video_id,
            'url': url_match.group(1),
            'ext': 'mp4',
            'title': title_match.group(1),
        }]
4194
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        # Always fetch the canonical page for the id.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        match = re.search(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)"', webpage)
        if match is None:
            raise ExtractorError(u'Unable to extract video URL')
        video_url = match.group(1)

        # The meta content values may be single- or double-quoted.
        match = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'', webpage)
        if match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = match.group(1) or match.group(2)

        # Description is optional: warn instead of failing.
        match = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'', webpage)
        if match is None:
            self._downloader.report_warning(u'unable to extract description')
            video_description = None
        else:
            video_description = match.group(1) or match.group(2)

        match = re.search(r'<meta content=\'(.+?)\' property=\'og:image\'', webpage)
        if match is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        thumbnail = match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
4238
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        # Always fetch the canonical HTTPS page for the id.
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        match = re.search(r'<meta property="twitter:player:stream" content="(.+?)"', webpage)
        if match is None:
            raise ExtractorError(u'Unable to extract video URL')
        video_url = match.group(1)

        match = re.search(r'<meta property="og:title" content="(.+?)"', webpage)
        if match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = match.group(1)

        # Any query string is split off of the thumbnail URL.
        match = re.search(r'<meta property="og:image" content="(.+?)(\?.*?)?"', webpage)
        if match is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        thumbnail = match.group(1)

        match = re.search(r'<div class="user">.*?<h2>(.+?)</h2>', webpage, re.DOTALL)
        if match is None:
            raise ExtractorError(u'Unable to extract uploader')
        uploader = match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
4281
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        video_id = match.group('id')
        video_uploader_id = match.group('uploader_id')

        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The per-video "secret" is required by the playlist API calls below.
        secret_match = re.search(r"photo_secret: '(\w+)'", webpage)
        if secret_match is None:
            raise ExtractorError(u'Unable to extract video secret')
        secret = secret_match.group(1)

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_match = re.search(r'<Item id="id">(\d+-\d+)</Item>', first_xml)
        if node_match is None:
            raise ExtractorError(u'Unable to extract node_id')
        node_id = node_match.group(1)

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream_match = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream_match is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = stream_match.group(1) + unescapeHTML(stream_match.group(2))

        # The og: meta values may be single- or double-quoted.
        title_match = re.search(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = title_match.group(1) or title_match.group(2)

        # Description is optional: warn instead of failing.
        desc_match = re.search(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
        if desc_match is None:
            self._downloader.report_warning(u'unable to extract description')
            video_description = None
        else:
            video_description = desc_match.group(1) or desc_match.group(2)

        thumb_match = re.search(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
        if thumb_match is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        thumbnail = thumb_match.group(1) or thumb_match.group(2)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
        }]
4343
class TeamcocoIE(InfoExtractor):
    """Information extractor for teamcoco.com."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric video id is embedded in the <article> tag.
        video_id = re.search(r'<article class="video" data-id="(\d+?)"', webpage).group(1)

        self.report_extraction(video_id)

        title_match = re.search(r'<meta property="og:title" content="(.+?)"', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = title_match.group(1)

        thumb_match = re.search(r'<meta property="og:image" content="(.+?)"', webpage)
        if thumb_match is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        thumbnail = thumb_match.group(1)

        desc_match = re.search(r'<meta property="og:description" content="(.*?)"', webpage)
        if desc_match is None:
            raise ExtractorError(u'Unable to extract description')
        description = desc_match.group(1)

        # A separate XML document lists the media files; take the "high" one.
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
        file_match = re.search(r'<file type="high".*?>(.*?)</file>', data)
        if file_match is None:
            raise ExtractorError(u'Unable to extract video url')

        return [{
            'id': video_id,
            'url': file_match.group(1),
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'description': description,
        }]
4389
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    return [
        # The specialized YouTube extractors must precede YoutubeIE, which
        # would otherwise claim their URLs first.
        YoutubePlaylistIE(),
        YoutubeChannelIE(),
        YoutubeUserIE(),
        YoutubeSearchIE(),
        YoutubeIE(),
        MetacafeIE(),
        DailymotionIE(),
        GoogleSearchIE(),
        PhotobucketIE(),
        YahooIE(),
        YahooSearchIE(),
        DepositFilesIE(),
        FacebookIE(),
        BlipTVUserIE(),
        BlipTVIE(),
        VimeoIE(),
        MyVideoIE(),
        ComedyCentralIE(),
        EscapistIE(),
        CollegeHumorIE(),
        XVideosIE(),
        # Set extractor before track extractor, so set URLs match first.
        SoundcloudSetIE(),
        SoundcloudIE(),
        InfoQIE(),
        MixcloudIE(),
        StanfordOpenClassroomIE(),
        MTVIE(),
        YoukuIE(),
        XNXXIE(),
        YouJizzIE(),
        PornotubeIE(),
        YouPornIE(),
        GooglePlusIE(),
        ArteTvIE(),
        NBAIE(),
        WorldStarHipHopIE(),
        JustinTVIE(),
        FunnyOrDieIE(),
        SteamIE(),
        UstreamIE(),
        RBMARadioIE(),
        EightTracksIE(),
        KeekIE(),
        TEDIE(),
        MySpassIE(),
        SpiegelIE(),
        LiveLeakIE(),
        ARDIE(),
        TumblrIE(),
        BandcampIE(),
        RedTubeIE(),
        InaIE(),
        HowcastIE(),
        VineIE(),
        FlickrIE(),
        TeamcocoIE(),
        # GenericIE matches nearly any URL, so it must stay last.
        GenericIE()
    ]
4453
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the "<name>IE" naming convention; an unknown
    # name propagates as a KeyError.
    return globals()['%sIE' % ie_name]