2 # -*- coding: utf-8 -*-
4 from __future__
import absolute_import
15 import xml
.etree
.ElementTree
26 class InfoExtractor(object):
27 """Information Extractor class.
29 Information extractors are the classes that, given a URL, extract
30 information about the video (or videos) the URL refers to. This
31 information includes the real video URL, the video title, author and
32 others. The information is stored in a dictionary which is then
33 passed to the FileDownloader. The FileDownloader processes this
34 information possibly downloading the video to the file system, among
35 other possible outcomes.
37 The dictionaries must include the following fields:
41 title: Video title, unescaped.
42 ext: Video filename extension.
44 The following fields are optional:
46 format: The video format, defaults to ext (used for --get-format)
47 thumbnail: Full URL to a video thumbnail image.
48 description: One-line video description.
49 uploader: Full name of the video uploader.
50 upload_date: Video upload date (YYYYMMDD).
51 uploader_id: Nickname or id of the video uploader.
52 location: Physical location of the video.
53 player_url: SWF Player URL (used for rtmpdump).
54 subtitles: The subtitle file contents.
55 urlhandle: [internal] The urlHandle to be used to download the file,
56 like returned by urllib.request.urlopen
58 The fields should all be Unicode strings.
60 Subclasses of this one should re-define the _real_initialize() and
61 _real_extract() methods and define a _VALID_URL regexp.
62 Probably, they should also be added to the list of extractors.
64 _real_extract() must return a *list* of information dictionaries as
67 Finally, the _WORKING attribute should be set to False for broken IEs
68 in order to warn the users and skip the tests.
75 def __init__(self
, downloader
=None):
76 """Constructor. Receives an optional downloader."""
78 self
.set_downloader(downloader
)
def suitable(cls, url):
    """Return True when this extractor can handle the given URL."""
    match = re.match(cls._VALID_URL, url)
    return match is not None
87 """Getter method for _WORKING."""
91 """Initializes an instance (authentication, etc)."""
93 self
._real
_initialize
()
96 def extract(self
, url
):
97 """Extracts URL information and returns it in list of dicts."""
99 return self
._real
_extract
(url
)
def set_downloader(self, downloader):
    """Attach the downloader instance this extractor reports through."""
    self._downloader = downloader
105 def _real_initialize(self
):
106 """Real initialization process. Redefine in subclasses."""
109 def _real_extract(self
, url
):
110 """Real extraction process. Redefine in subclasses."""
115 return type(self
).__name
__[:-2]
117 def _request_webpage(self
, url_or_request
, video_id
, note
=None, errnote
=None):
118 """ Returns the response handle """
120 self
.report_download_webpage(video_id
)
121 elif note
is not False:
122 self
.to_screen(u
'%s: %s' % (video_id
, note
))
124 return compat_urllib_request
.urlopen(url_or_request
)
125 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
127 errnote
= u
'Unable to download webpage'
128 raise ExtractorError(u
'%s: %s' % (errnote
, compat_str(err
)), sys
.exc_info()[2])
130 def _download_webpage_handle(self
, url_or_request
, video_id
, note
=None, errnote
=None):
131 """ Returns a tuple (page content as string, URL handle) """
132 urlh
= self
._request
_webpage
(url_or_request
, video_id
, note
, errnote
)
133 content_type
= urlh
.headers
.get('Content-Type', '')
134 m
= re
.match(r
'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type
)
136 encoding
= m
.group(1)
139 webpage_bytes
= urlh
.read()
140 if self
._downloader
.params
.get('dump_intermediate_pages', False):
142 url
= url_or_request
.get_full_url()
143 except AttributeError:
145 self
.to_screen(u
'Dumping request to ' + url
)
146 dump
= base64
.b64encode(webpage_bytes
).decode('ascii')
147 self
._downloader
.to_screen(dump
)
148 content
= webpage_bytes
.decode(encoding
, 'replace')
149 return (content
, urlh
)
151 def _download_webpage(self
, url_or_request
, video_id
, note
=None, errnote
=None):
152 """ Returns the data of the page as a string """
153 return self
._download
_webpage
_handle
(url_or_request
, video_id
, note
, errnote
)[0]
def to_screen(self, msg):
    """Print msg to screen, prefixing it with '[ie_name]'"""
    prefixed = u'[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(prefixed)
def report_extraction(self, id_or_name):
    """Announce that information extraction has started."""
    message = u'%s: Extracting information' % id_or_name
    self.to_screen(message)
def report_download_webpage(self, video_id):
    """Announce that the video webpage is being downloaded."""
    message = u'%s: Downloading webpage' % video_id
    self.to_screen(message)
def report_age_confirmation(self):
    """Announce an attempt to confirm the user's age."""
    message = u'Confirming age'
    self.to_screen(message)
171 #Methods for following #608
172 #They set the correct value of the '_type' key
173 def video_result(self
, video_info
):
174 """Returns a video"""
175 video_info
['_type'] = 'video'
177 def url_result(self
, url
, ie
=None):
178 """Returns a url that points to a page that should be processed"""
179 #TODO: ie should be the class used for getting the info
180 video_info
= {'_type': 'url',
184 def playlist_result(self
, entries
, playlist_id
=None, playlist_title
=None):
185 """Returns a playlist"""
186 video_info
= {'_type': 'playlist',
189 video_info
['id'] = playlist_id
191 video_info
['title'] = playlist_title
194 def _search_regex(self
, pattern
, string
, name
, default
=None, fatal
=True, flags
=0):
196 Perform a regex search on the given string, using a single or a list of
197 patterns returning the first matching group.
198 In case of failure return a default value or raise a WARNING or a
199 ExtractorError, depending on fatal, specifying the field name.
201 if isinstance(pattern
, (str, compat_str
, compiled_regex_type
)):
202 mobj
= re
.search(pattern
, string
, flags
)
205 mobj
= re
.search(p
, string
, flags
)
208 if sys
.stderr
.isatty() and os
.name
!= 'nt':
209 _name
= u
'\033[0;34m%s\033[0m' % name
214 # return the first matching group
215 return next(g
for g
in mobj
.groups() if g
is not None)
216 elif default
is not None:
219 raise ExtractorError(u
'Unable to extract %s' % _name
)
221 self
._downloader
.report_warning(u
'unable to extract %s; '
222 u
'please report this issue on GitHub.' % _name
)
225 def _html_search_regex(self
, pattern
, string
, name
, default
=None, fatal
=True, flags
=0):
227 Like _search_regex, but strips HTML tags and unescapes entities.
229 res
= self
._search
_regex
(pattern
, string
, name
, default
, fatal
, flags
)
231 return clean_html(res
).strip()
235 class SearchInfoExtractor(InfoExtractor
):
237 Base class for paged search queries extractors.
238 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
239 Instances should define _SEARCH_KEY and _MAX_RESULTS.
243 def _make_valid_url(cls
):
244 return r
'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls
._SEARCH
_KEY
def suitable(cls, url):
    """Return True when url matches this extractor's search-query pattern."""
    pattern = cls._make_valid_url()
    return re.match(pattern, url) is not None
250 def _real_extract(self
, query
):
251 mobj
= re
.match(self
._make
_valid
_url
(), query
)
253 raise ExtractorError(u
'Invalid search query "%s"' % query
)
255 prefix
= mobj
.group('prefix')
256 query
= mobj
.group('query')
258 return self
._get
_n
_results
(query
, 1)
259 elif prefix
== 'all':
260 return self
._get
_n
_results
(query
, self
._MAX
_RESULTS
)
264 raise ExtractorError(u
'invalid download number %s for query "%s"' % (n
, query
))
265 elif n
> self
._MAX
_RESULTS
:
266 self
._downloader
.report_warning(u
'%s returns max %i results (you requested %i)' % (self
._SEARCH
_KEY
, self
._MAX
_RESULTS
, n
))
267 n
= self
._MAX
_RESULTS
268 return self
._get
_n
_results
(query
, n
)
270 def _get_n_results(self
, query
, n
):
271 """Get a specified number of results for a query"""
272 raise NotImplementedError("This method must be implemented by sublclasses")
275 class YoutubeIE(InfoExtractor
):
276 """Information extractor for youtube.com."""
280 (?:https?://)? # http(s):// (optional)
281 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
282 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
283 (?:.*?\#/)? # handle anchor (#/) redirect urls
284 (?: # the various things that can precede the ID:
285 (?:(?:v|embed|e)/) # v/ or embed/ or e/
286 |(?: # or the v= param in all its forms
287 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
288 (?:\?|\#!?) # the params delimiter ? or # or #!
289 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
292 )? # optional -> youtube.com/xxxx is OK
293 )? # all until now is optional -> you can pass the naked ID
294 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
295 (?(1).+)? # if we found the ID, everything can follow
297 _LANG_URL
= r
'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
298 _LOGIN_URL
= 'https://accounts.google.com/ServiceLogin'
299 _AGE_URL
= 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
300 _NEXT_URL_RE
= r
'[\?&]next_url=([^&]+)'
301 _NETRC_MACHINE
= 'youtube'
302 # Listed in order of quality
303 _available_formats
= ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
304 _available_formats_prefer_free
= ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
305 _video_extensions
= {
311 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
317 _video_dimensions
= {
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    # Playlist URLs belong to YoutubePlaylistIE, never to this extractor.
    if YoutubePlaylistIE.suitable(url):
        return False
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    return match is not None
def report_lang(self):
    """Announce an attempt to set the interface language."""
    message = u'Setting language'
    self.to_screen(message)
def report_login(self):
    """Announce an attempt to log in."""
    message = u'Logging in'
    self.to_screen(message)
def report_video_webpage_download(self, video_id):
    """Announce that the video webpage is being downloaded."""
    message = u'%s: Downloading video webpage' % video_id
    self.to_screen(message)
def report_video_info_webpage_download(self, video_id):
    """Announce that the video info webpage is being downloaded."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
def report_video_subtitles_download(self, video_id):
    """Report that the subtitle availability check has started.

    The original docstring ("Report attempt to download video info
    webpage.") was copy-pasted from a sibling method and was wrong:
    this method announces a subtitle check, as the message shows.
    """
    self.to_screen(u'%s: Checking available subtitles' % video_id)
def report_video_subtitles_request(self, video_id, sub_lang, format):
    """Report that subtitles in the given language/format are being fetched.

    The original docstring ("Report attempt to download video info
    webpage.") was copy-pasted from a sibling method and was wrong.
    """
    self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
def report_video_subtitles_available(self, video_id, sub_lang_list):
    """Report which subtitle languages are available for the video."""
    langs = ",".join(sub_lang_list.keys())
    self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, langs))
def report_information_extraction(self, video_id):
    """Announce that video information extraction has started."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
def report_unavailable_format(self, video_id, format):
    """Report that the requested format is not available.

    The original docstring ("Report extracted video URL.") did not
    describe this method; the emitted message shows it reports an
    unavailable format.
    """
    self.to_screen(u'%s: Format %s not available' % (video_id, format))
def report_rtmp_download(self):
    """Indicate the download will use the RTMP protocol."""
    message = u'RTMP download detected'
    self.to_screen(message)
382 def _get_available_subtitles(self
, video_id
):
383 self
.report_video_subtitles_download(video_id
)
384 request
= compat_urllib_request
.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
)
386 sub_list
= compat_urllib_request
.urlopen(request
).read().decode('utf-8')
387 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
388 return (u
'unable to download video subtitles: %s' % compat_str(err
), None)
389 sub_lang_list
= re
.findall(r
'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list
)
390 sub_lang_list
= dict((l
[1], l
[0]) for l
in sub_lang_list
)
391 if not sub_lang_list
:
392 return (u
'video doesn\'t have subtitles', None)
395 def _list_available_subtitles(self
, video_id
):
396 sub_lang_list
= self
._get
_available
_subtitles
(video_id
)
397 self
.report_video_subtitles_available(video_id
, sub_lang_list
)
399 def _request_subtitle(self
, sub_lang
, sub_name
, video_id
, format
):
402 (error_message, sub_lang, sub)
404 self
.report_video_subtitles_request(video_id
, sub_lang
, format
)
405 params
= compat_urllib_parse
.urlencode({
411 url
= 'http://www.youtube.com/api/timedtext?' + params
413 sub
= compat_urllib_request
.urlopen(url
).read().decode('utf-8')
414 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
415 return (u
'unable to download video subtitles: %s' % compat_str(err
), None, None)
417 return (u
'Did not fetch video subtitles', None, None)
418 return (None, sub_lang
, sub
)
420 def _request_automatic_caption(self
, video_id
, webpage
):
421 """We need the webpage for getting the captions url, pass it as an
422 argument to speed up the process."""
423 sub_lang
= self
._downloader
.params
.get('subtitleslang')
424 sub_format
= self
._downloader
.params
.get('subtitlesformat')
425 self
.to_screen(u
'%s: Looking for automatic captions' % video_id
)
426 mobj
= re
.search(r
';ytplayer.config = ({.*?});', webpage
)
427 err_msg
= u
'Couldn\'t find automatic captions for "%s"' % sub_lang
429 return [(err_msg
, None, None)]
430 player_config
= json
.loads(mobj
.group(1))
432 args
= player_config
[u
'args']
433 caption_url
= args
[u
'ttsurl']
434 timestamp
= args
[u
'timestamp']
435 params
= compat_urllib_parse
.urlencode({
442 subtitles_url
= caption_url
+ '&' + params
443 sub
= self
._download
_webpage
(subtitles_url
, video_id
, u
'Downloading automatic captions')
444 return [(None, sub_lang
, sub
)]
446 return [(err_msg
, None, None)]
448 def _extract_subtitle(self
, video_id
):
450 Return a list with a tuple:
451 [(error_message, sub_lang, sub)]
453 sub_lang_list
= self
._get
_available
_subtitles
(video_id
)
454 sub_format
= self
._downloader
.params
.get('subtitlesformat')
455 if isinstance(sub_lang_list
,tuple): #There was some error, it didn't get the available subtitles
456 return [(sub_lang_list
[0], None, None)]
457 if self
._downloader
.params
.get('subtitleslang', False):
458 sub_lang
= self
._downloader
.params
.get('subtitleslang')
459 elif 'en' in sub_lang_list
:
462 sub_lang
= list(sub_lang_list
.keys())[0]
463 if not sub_lang
in sub_lang_list
:
464 return [(u
'no closed captions found in the specified language "%s"' % sub_lang
, None, None)]
466 subtitle
= self
._request
_subtitle
(sub_lang
, sub_lang_list
[sub_lang
].encode('utf-8'), video_id
, sub_format
)
469 def _extract_all_subtitles(self
, video_id
):
470 sub_lang_list
= self
._get
_available
_subtitles
(video_id
)
471 sub_format
= self
._downloader
.params
.get('subtitlesformat')
472 if isinstance(sub_lang_list
,tuple): #There was some error, it didn't get the available subtitles
473 return [(sub_lang_list
[0], None, None)]
475 for sub_lang
in sub_lang_list
:
476 subtitle
= self
._request
_subtitle
(sub_lang
, sub_lang_list
[sub_lang
].encode('utf-8'), video_id
, sub_format
)
477 subtitles
.append(subtitle
)
480 def _print_formats(self
, formats
):
481 print('Available formats:')
483 print('%s\t:\t%s\t[%s]' %(x
, self
._video
_extensions
.get(x
, 'flv'), self
._video
_dimensions
.get(x
, '???')))
485 def _real_initialize(self
):
486 if self
._downloader
is None:
491 downloader_params
= self
._downloader
.params
493 # Attempt to use provided username and password or .netrc data
494 if downloader_params
.get('username', None) is not None:
495 username
= downloader_params
['username']
496 password
= downloader_params
['password']
497 elif downloader_params
.get('usenetrc', False):
499 info
= netrc
.netrc().authenticators(self
._NETRC
_MACHINE
)
504 raise netrc
.NetrcParseError('No authenticators for %s' % self
._NETRC
_MACHINE
)
505 except (IOError, netrc
.NetrcParseError
) as err
:
506 self
._downloader
.report_warning(u
'parsing .netrc: %s' % compat_str(err
))
510 request
= compat_urllib_request
.Request(self
._LANG
_URL
)
513 compat_urllib_request
.urlopen(request
).read()
514 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
515 self
._downloader
.report_warning(u
'unable to set language: %s' % compat_str(err
))
518 # No authentication to be performed
522 request
= compat_urllib_request
.Request(self
._LOGIN
_URL
)
524 login_page
= compat_urllib_request
.urlopen(request
).read().decode('utf-8')
525 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
526 self
._downloader
.report_warning(u
'unable to fetch login page: %s' % compat_str(err
))
531 match
= re
.search(re
.compile(r
'<input.+?name="GALX".+?value="(.+?)"', re
.DOTALL
), login_page
)
533 galx
= match
.group(1)
535 match
= re
.search(re
.compile(r
'<input.+?name="dsh".+?value="(.+?)"', re
.DOTALL
), login_page
)
541 u
'continue': u
'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
545 u
'PersistentCookie': u
'yes',
547 u
'bgresponse': u
'js_disabled',
548 u
'checkConnection': u
'',
549 u
'checkedDomains': u
'youtube',
555 u
'signIn': u
'Sign in',
557 u
'service': u
'youtube',
561 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
563 login_form
= dict((k
.encode('utf-8'), v
.encode('utf-8')) for k
,v
in login_form_strs
.items())
564 login_data
= compat_urllib_parse
.urlencode(login_form
).encode('ascii')
565 request
= compat_urllib_request
.Request(self
._LOGIN
_URL
, login_data
)
568 login_results
= compat_urllib_request
.urlopen(request
).read().decode('utf-8')
569 if re
.search(r
'(?i)<form[^>]* id="gaia_loginform"', login_results
) is not None:
570 self
._downloader
.report_warning(u
'unable to log in: bad username or password')
572 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
573 self
._downloader
.report_warning(u
'unable to log in: %s' % compat_str(err
))
579 'action_confirm': 'Confirm',
581 request
= compat_urllib_request
.Request(self
._AGE
_URL
, compat_urllib_parse
.urlencode(age_form
))
583 self
.report_age_confirmation()
584 age_results
= compat_urllib_request
.urlopen(request
).read().decode('utf-8')
585 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
586 raise ExtractorError(u
'Unable to confirm age: %s' % compat_str(err
))
588 def _extract_id(self
, url
):
589 mobj
= re
.match(self
._VALID
_URL
, url
, re
.VERBOSE
)
591 raise ExtractorError(u
'Invalid URL: %s' % url
)
592 video_id
= mobj
.group(2)
595 def _real_extract(self
, url
):
596 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
597 mobj
= re
.search(self
._NEXT
_URL
_RE
, url
)
599 url
= 'https://www.youtube.com/' + compat_urllib_parse
.unquote(mobj
.group(1)).lstrip('/')
600 video_id
= self
._extract
_id
(url
)
603 self
.report_video_webpage_download(video_id
)
604 url
= 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
605 request
= compat_urllib_request
.Request(url
)
607 video_webpage_bytes
= compat_urllib_request
.urlopen(request
).read()
608 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
609 raise ExtractorError(u
'Unable to download video webpage: %s' % compat_str(err
))
611 video_webpage
= video_webpage_bytes
.decode('utf-8', 'ignore')
613 # Attempt to extract SWF player URL
614 mobj
= re
.search(r
'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage
)
616 player_url
= re
.sub(r
'\\(.)', r
'\1', mobj
.group(1))
621 self
.report_video_info_webpage_download(video_id
)
622 for el_type
in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
623 video_info_url
= ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
624 % (video_id
, el_type
))
625 video_info_webpage
= self
._download
_webpage
(video_info_url
, video_id
,
627 errnote
='unable to download video info webpage')
628 video_info
= compat_parse_qs(video_info_webpage
)
629 if 'token' in video_info
:
631 if 'token' not in video_info
:
632 if 'reason' in video_info
:
633 raise ExtractorError(u
'YouTube said: %s' % video_info
['reason'][0])
635 raise ExtractorError(u
'"token" parameter not in video info for unknown reason')
637 # Check for "rental" videos
638 if 'ypc_video_rental_bar_text' in video_info
and 'author' not in video_info
:
639 raise ExtractorError(u
'"rental" videos not supported')
641 # Start extracting information
642 self
.report_information_extraction(video_id
)
645 if 'author' not in video_info
:
646 raise ExtractorError(u
'Unable to extract uploader name')
647 video_uploader
= compat_urllib_parse
.unquote_plus(video_info
['author'][0])
650 video_uploader_id
= None
651 mobj
= re
.search(r
'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage
)
653 video_uploader_id
= mobj
.group(1)
655 self
._downloader
.report_warning(u
'unable to extract uploader nickname')
658 if 'title' not in video_info
:
659 raise ExtractorError(u
'Unable to extract video title')
660 video_title
= compat_urllib_parse
.unquote_plus(video_info
['title'][0])
663 if 'thumbnail_url' not in video_info
:
664 self
._downloader
.report_warning(u
'unable to extract video thumbnail')
666 else: # don't panic if we can't find it
667 video_thumbnail
= compat_urllib_parse
.unquote_plus(video_info
['thumbnail_url'][0])
671 mobj
= re
.search(r
'id="eow-date.*?>(.*?)</span>', video_webpage
, re
.DOTALL
)
673 upload_date
= ' '.join(re
.sub(r
'[/,-]', r
' ', mobj
.group(1)).split())
674 upload_date
= unified_strdate(upload_date
)
677 video_description
= get_element_by_id("eow-description", video_webpage
)
678 if video_description
:
679 video_description
= clean_html(video_description
)
681 fd_mobj
= re
.search(r
'<meta name="description" content="([^"]+)"', video_webpage
)
683 video_description
= unescapeHTML(fd_mobj
.group(1))
685 video_description
= u
''
688 video_subtitles
= None
690 if self
._downloader
.params
.get('writesubtitles', False):
691 video_subtitles
= self
._extract
_subtitle
(video_id
)
693 (sub_error
, sub_lang
, sub
) = video_subtitles
[0]
695 # We try with the automatic captions
696 video_subtitles
= self
._request
_automatic
_caption
(video_id
, video_webpage
)
697 (sub_error_auto
, sub_lang
, sub
) = video_subtitles
[0]
701 # We report the original error
702 self
._downloader
.report_error(sub_error
)
704 if self
._downloader
.params
.get('allsubtitles', False):
705 video_subtitles
= self
._extract
_all
_subtitles
(video_id
)
706 for video_subtitle
in video_subtitles
:
707 (sub_error
, sub_lang
, sub
) = video_subtitle
709 self
._downloader
.report_error(sub_error
)
711 if self
._downloader
.params
.get('listsubtitles', False):
712 sub_lang_list
= self
._list
_available
_subtitles
(video_id
)
715 if 'length_seconds' not in video_info
:
716 self
._downloader
.report_warning(u
'unable to extract video duration')
719 video_duration
= compat_urllib_parse
.unquote_plus(video_info
['length_seconds'][0])
722 video_token
= compat_urllib_parse
.unquote_plus(video_info
['token'][0])
724 # Decide which formats to download
725 req_format
= self
._downloader
.params
.get('format', None)
727 if 'conn' in video_info
and video_info
['conn'][0].startswith('rtmp'):
728 self
.report_rtmp_download()
729 video_url_list
= [(None, video_info
['conn'][0])]
730 elif 'url_encoded_fmt_stream_map' in video_info
and len(video_info
['url_encoded_fmt_stream_map']) >= 1:
732 for url_data_str
in video_info
['url_encoded_fmt_stream_map'][0].split(','):
733 url_data
= compat_parse_qs(url_data_str
)
734 if 'itag' in url_data
and 'url' in url_data
:
735 url
= url_data
['url'][0] + '&signature=' + url_data
['sig'][0]
736 if not 'ratebypass' in url
: url
+= '&ratebypass=yes'
737 url_map
[url_data
['itag'][0]] = url
739 format_limit
= self
._downloader
.params
.get('format_limit', None)
740 available_formats
= self
._available
_formats
_prefer
_free
if self
._downloader
.params
.get('prefer_free_formats', False) else self
._available
_formats
741 if format_limit
is not None and format_limit
in available_formats
:
742 format_list
= available_formats
[available_formats
.index(format_limit
):]
744 format_list
= available_formats
745 existing_formats
= [x
for x
in format_list
if x
in url_map
]
746 if len(existing_formats
) == 0:
747 raise ExtractorError(u
'no known formats available for video')
748 if self
._downloader
.params
.get('listformats', None):
749 self
._print
_formats
(existing_formats
)
751 if req_format
is None or req_format
== 'best':
752 video_url_list
= [(existing_formats
[0], url_map
[existing_formats
[0]])] # Best quality
753 elif req_format
== 'worst':
754 video_url_list
= [(existing_formats
[len(existing_formats
)-1], url_map
[existing_formats
[len(existing_formats
)-1]])] # worst quality
755 elif req_format
in ('-1', 'all'):
756 video_url_list
= [(f
, url_map
[f
]) for f
in existing_formats
] # All formats
758 # Specific formats. We pick the first in a slash-delimeted sequence.
759 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
760 req_formats
= req_format
.split('/')
761 video_url_list
= None
762 for rf
in req_formats
:
764 video_url_list
= [(rf
, url_map
[rf
])]
766 if video_url_list
is None:
767 raise ExtractorError(u
'requested format not available')
769 raise ExtractorError(u
'no conn or url_encoded_fmt_stream_map information found in video info')
772 for format_param
, video_real_url
in video_url_list
:
774 video_extension
= self
._video
_extensions
.get(format_param
, 'flv')
776 video_format
= '{0} - {1}'.format(format_param
if format_param
else video_extension
,
777 self
._video
_dimensions
.get(format_param
, '???'))
781 'url': video_real_url
,
782 'uploader': video_uploader
,
783 'uploader_id': video_uploader_id
,
784 'upload_date': upload_date
,
785 'title': video_title
,
786 'ext': video_extension
,
787 'format': video_format
,
788 'thumbnail': video_thumbnail
,
789 'description': video_description
,
790 'player_url': player_url
,
791 'subtitles': video_subtitles
,
792 'duration': video_duration
797 class MetacafeIE(InfoExtractor
):
798 """Information Extractor for metacafe.com."""
800 _VALID_URL
= r
'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
801 _DISCLAIMER
= 'http://www.metacafe.com/family_filter/'
802 _FILTER_POST
= 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
803 IE_NAME
= u
'metacafe'
def report_disclaimer(self):
    """Announce that the family-filter disclaimer page is being retrieved."""
    message = u'Retrieving disclaimer'
    self.to_screen(message)
809 def _real_initialize(self
):
810 # Retrieve disclaimer
811 request
= compat_urllib_request
.Request(self
._DISCLAIMER
)
813 self
.report_disclaimer()
814 disclaimer
= compat_urllib_request
.urlopen(request
).read()
815 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
816 raise ExtractorError(u
'Unable to retrieve disclaimer: %s' % compat_str(err
))
821 'submit': "Continue - I'm over 18",
823 request
= compat_urllib_request
.Request(self
._FILTER
_POST
, compat_urllib_parse
.urlencode(disclaimer_form
))
825 self
.report_age_confirmation()
826 disclaimer
= compat_urllib_request
.urlopen(request
).read()
827 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
828 raise ExtractorError(u
'Unable to confirm age: %s' % compat_str(err
))
830 def _real_extract(self
, url
):
831 # Extract id and simplified title from URL
832 mobj
= re
.match(self
._VALID
_URL
, url
)
834 raise ExtractorError(u
'Invalid URL: %s' % url
)
836 video_id
= mobj
.group(1)
838 # Check if video comes from YouTube
839 mobj2
= re
.match(r
'^yt-(.*)$', video_id
)
840 if mobj2
is not None:
841 return [self
.url_result('http://www.youtube.com/watch?v=%s' % mobj2
.group(1), 'Youtube')]
843 # Retrieve video webpage to extract further information
844 webpage
= self
._download
_webpage
('http://www.metacafe.com/watch/%s/' % video_id
, video_id
)
846 # Extract URL, uploader and title from webpage
847 self
.report_extraction(video_id
)
848 mobj
= re
.search(r
'(?m)&mediaURL=([^&]+)', webpage
)
850 mediaURL
= compat_urllib_parse
.unquote(mobj
.group(1))
851 video_extension
= mediaURL
[-3:]
853 # Extract gdaKey if available
854 mobj
= re
.search(r
'(?m)&gdaKey=(.*?)&', webpage
)
858 gdaKey
= mobj
.group(1)
859 video_url
= '%s?__gda__=%s' % (mediaURL
, gdaKey
)
861 mobj
= re
.search(r
' name="flashvars" value="(.*?)"', webpage
)
863 raise ExtractorError(u
'Unable to extract media URL')
864 vardict
= compat_parse_qs(mobj
.group(1))
865 if 'mediaData' not in vardict
:
866 raise ExtractorError(u
'Unable to extract media URL')
867 mobj
= re
.search(r
'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict
['mediaData'][0])
869 raise ExtractorError(u
'Unable to extract media URL')
870 mediaURL
= mobj
.group('mediaURL').replace('\\/', '/')
871 video_extension
= mediaURL
[-3:]
872 video_url
= '%s?__gda__=%s' % (mediaURL
, mobj
.group('key'))
874 mobj
= re
.search(r
'(?im)<title>(.*) - Video</title>', webpage
)
876 raise ExtractorError(u
'Unable to extract title')
877 video_title
= mobj
.group(1).decode('utf-8')
879 mobj
= re
.search(r
'submitter=(.*?);', webpage
)
881 raise ExtractorError(u
'Unable to extract uploader nickname')
882 video_uploader
= mobj
.group(1)
885 'id': video_id
.decode('utf-8'),
886 'url': video_url
.decode('utf-8'),
887 'uploader': video_uploader
.decode('utf-8'),
889 'title': video_title
,
890 'ext': video_extension
.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion.

    Picks the best available quality from the page's flashvars and returns
    a single-entry info list (id, url, uploader, upload_date, title, ext).
    """

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The id is the part of the slug before the first '_' / '?'
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information;
        # the cookie disables the family filter so age-gated pages load
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Qualities in descending order of preference; take the first present
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self.to_screen(u'Using %s' % key)
                break
        else:
            raise ExtractorError(u'Unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video URL')

        # URLs in flashvars are JSON-escaped ('\/')
        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        # Try the "official user" markup first, then the rel="author" fallback
        video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
                                             # Looking for official user
                                             r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
                                            webpage, 'video uploader')

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            # Page shows DD-MM-YYYY; info dict wants YYYYMMDD
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com.

    Tries the embedded javascript JSON blob first (richer metadata);
    falls back to scraping the <link rel="video_src"> tag and <title>.
    """

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extracion process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')

        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
        if mobj is not None:
            info = json.loads(mobj.group('json'))
            return [{
                'id':       video_id,
                'url':      info[u'downloadUrl'],
                'uploader': info[u'username'],
                # creationDate is a unix timestamp; format as YYYYMMDD
                'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
                'title':    info[u'title'],
                'ext':      video_extension,
                'thumbnail': info[u'thumbUrl'],
            }]

        # We try looking in other parts of the webpage
        video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
            webpage, u'video URL')

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # .decode('utf-8') — this code path is Python 2 only (bytes webpage)
        video_title = mobj.group(1).decode('utf-8')
        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com.

    Two code paths: when the page defines a YUI CONTENT_ID the newer YQL
    JSON API is used; otherwise the legacy cosmos.bcst.yahoo.com mrss
    endpoints are scraped.

    Fix vs. original: the `m_rest is None` check used to run AFTER
    `m_rest.group(...)` was already called (guaranteed AttributeError on
    failure); the check now precedes the group access.
    """
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)

        if m_id is None:
            # TODO: Check which url parameters are required
            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                        <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                        <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                        <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
                        '''
            self.report_extraction(video_id)
            m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
            if m_info is None:
                raise ExtractorError(u'Unable to extract video info')
            video_title = m_info.group('title')
            video_description = m_info.group('description')
            video_thumb = m_info.group('thumb')
            video_date = m_info.group('date')
            # mrss date is MM/DD/YYYY; info dict wants YYYYMMDD
            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

            # TODO: Find a way to get mp4 videos
            rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
            # Guard BEFORE accessing groups (was checked after in the original)
            if m_rest is None:
                raise ExtractorError(u'Unable to extract video url')
            video_url = m_rest.group('url')
            video_path = m_rest.group('path')

        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            # Response is JSONP; strip the callback wrapper to get the JSON
            json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
            info = json.loads(json_str)
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            meta = res[u'meta']
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it

        info_dict = {
            'id': video_id,
            'url': video_url,
            'play_path': video_path,
            'title': video_title,
            'description': video_description,
            'thumbnail': video_thumb,
            'upload_date': video_date,
            'ext': 'flv',
        }
        return info_dict
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Parses the inline player config JSON embedded in the page and selects
    the best file by quality (hd > sd > other) then codec preference.
    """

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link') or mobj.group('pro'):
            # Normalize direct-link / vimeopro URLs to the canonical page
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except:
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Distinguishes live-stream pages (URL ends in index-NNN.html) from
    "plus 7" replay pages and scrapes each through a chain of regex
    lookups (`grep_webpage`).
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download url and return the raw page body, wrapping network
        errors in ExtractorError."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex, and return {key: group} for each
        (group_index, key, error_message) in matchTuples; raise the given
        error message when a group is missing."""
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve the rtmp live-stream parameters for a live page.

        NOTE(review): the computed video_url is never returned — the
        original code had the same dead end; kept for behavior parity.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the videoref chain for a replay ("plus 7") page and
        return the info dictionary."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  unified_strdate(info.get('date')),
            'title':        info.get('title').decode('utf-8'),
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    First follows HEAD redirects (URL shorteners), then scrapes the page
    for common video-embedding patterns (JW Player flashvars, file=/source=
    parameters, twitter cards).
    """

    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: the redirect is a HEAD
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our own opener so the custom HEAD handlers are used
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            # Try to find twitter cards info
            mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        video_title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'video title')

        # video uploader is domain name
        video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
            url, u'video uploader')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries.

    Pages through the GData API (50 results per page) until enough video
    ids are collected, then returns them as a playlist of url results.
    """
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API may report fewer total results than requested
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries.

    Scrapes the google.com video-search result pages (10 results per page)
    and returns the result links as a playlist of url entries.
    """
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }

        for pagenum in itertools.count(1):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                e = {
                    '_type': 'url',
                    'url': mobj.group(1)
                }
                res['entries'].append(e)

            # Stop when enough results are collected or no "next" link remains
            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                return res
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Uses the JSON search endpoint (30 results per page) and returns the
    matching screen.yahoo.com links as a playlist of url entries.
    """

    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            # 'm' carries paging metadata, 'results' the per-item HTML blobs
            m = info[u'm']
            results = info[u'results']

            for (i, r) in enumerate(results):
                if (pagenum * 30) +i >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            # Stop when enough entries collected or the API says this is the
            # last page
            if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
                break

        return res
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Pages through the GData playlist feed, collecting (position, url)
    pairs so the final list can be returned in playlist order.
    """

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose regex, so re.VERBOSE is required here
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Keep the playlist position so the result can be sorted later
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    Downloads the first HTML channel page, then pages through the
    json-based channel_ajax endpoint while its "load more" widget is
    present, collecting unique video ids in page order.
    """

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the unique watch?v= video ids found in page, preserving
        first-seen order."""
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        return ids_in_page

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum = pagenum + 1

                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # The widget disappears from the ajax payload on the last page
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Pages through the GData uploads feed (_GDATA_PAGE_SIZE results per
    query) until a short page signals the end, then returns all uploads
    as a playlist of url results.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves the numeric user id from the profile page, then pages
    through the mobile episode-list ajax endpoint (_PAGE_SIZE results
    per query) until a short page signals the end.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com.

    Rebuilds the URL in the english locale, simulates the 'Free download'
    button POST, and extracts the real fileshare URL from the response.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Surface the site's own restriction message to the user
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook.

    Optionally logs in (credentials from --username/--password or .netrc)
    in _real_initialize, then parses the swf parameter JSON embedded in
    the video page to find the hd/sd source URLs.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in to Facebook if credentials are available; login failures
        only produce warnings (downloads of public videos still work)."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # The login form is still present in the response on failure
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The swf parameter JSON sits between these two literal markers
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source, fall back to SD
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
            webpage, u'title')

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Handles three URL shapes: the api.swf fragment form (rewritten to a
    /play/ URL), /play/ URLs (followed to their redirect target), and
    regular page URLs (queried via the site's JSON skin).

    NOTE(review): this block was mangled in extraction; structural lines
    (cchar selection, try: blocks, the info dicts, returns) were
    reconstructed — verify against upstream history.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # See https://github.com/rg3/youtube-dl/issues/857
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ URLs redirect to a page whose fragment carries the file id;
            # rewrite to the canonical URL and recurse once.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        # Append the JSON-skin query with the correct separator.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh,
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Handles both plain <source src=...> pages and the RC4-encrypted
    flashvars XML used for RTMP/HLS streams.

    NOTE(review): this block was mangled in extraction; structural lines
    (loop/branch bodies, the result dicts, returns) were reconstructed —
    verify against upstream history.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self,data, key):
        # Standard RC4: key-scheduling pass, then the PRGA keystream XOR.
        x = 0
        box = list(range(256))
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
        x = 0
        y = 0
        out = ''
        for char in data:
            x = (x + 1) % 256
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
        return out

    def __md5(self, s):
        # Hex digest as bytes, suitable as an RC4 key.
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Doubly base64-encoded site key used to derive the RC4 key.
        GK = (
            b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
            b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
            b'TnpsbA0KTVRkbU1tSTRNdz09'
        )

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Simple case: the page directly exposes a <source src=...>.
        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            video_title = self._html_search_regex('<title>([^<]+)</title>',
                webpage, u'title')

            video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')

            return [{
                'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': u'flv',
            }]

        # Otherwise the stream data is in RC4-encrypted flashvars XML.
        mobj = re.search('var flashvars={(.+?)}', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video')

        params = {}
        encxml = ''
        sec = mobj.group(1)
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                params[a] = b
            else:
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            self._downloader.report_warning(u'avoiding MTV player')
            xmldata_url = (
                'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
            ) % video_id

        # Fetch and decrypt the stream description.
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        sk = self.__md5(
            base64.b64decode(base64.b64decode(GK)) +
            self.__md5(
                str(video_id).encode('utf-8')
            )
        )
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        video_url = None
        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        if mobj:
            video_url = compat_urllib_parse.unquote(mobj.group(1))
            if 'myvideo2flash' in video_url:
                self._downloader.report_warning(u'forcing RTMPT ...')
                video_url = video_url.replace('rtmpe://', 'rtmpt://')

        if not video_url:
            # extract non rtmp videos
            mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
            if mobj is None:
                raise ExtractorError(u'unable to extract url')
            video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
        video_file = compat_urllib_parse.unquote(video_file)

        if not video_file.endswith('f4m'):
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
        else:
            video_playpath = ''
            # NOTE(review): the original referenced video_filepath here; its
            # defining line was lost in extraction. Reconstructed from the
            # path=... attribute of the decrypted data — TODO confirm.
            video_filepath = self._search_regex('path=\'(.*?)\'', dec_data, u'path')
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
        video_swfobj = compat_urllib_parse.unquote(video_swfobj)

        video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'tc_url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
            'play_path': video_playpath,
            'video_file': video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url': video_swfobj,
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    #
    # NOTE(review): this block was mangled in extraction; the (?P<clip> group,
    # @classmethod decorator, format tables, loop accumulators and the result
    # dict were reconstructed — verify against upstream history.
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL uses a verbose regex.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Expand :tds / :colbert style abbreviations to full-episodes URLs.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # A bare show URL means "download the newest episode" — the site
            # redirects to it.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP url to a plain-HTTP mirror.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist.

    NOTE(review): this block was mangled in extraction; the guard clause,
    JSON try block and result dict were reconstructed — verify against
    upstream history.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(videoId)
        webpage = self._download_webpage(url, videoId)

        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)

        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)

        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
            webpage, u'player url')

        # NOTE(review): the name u'player url' is reused here for the title
        # lookup in the original; kept for fidelity, but u'title' would be the
        # accurate error label.
        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
            webpage, u'player url').split(' : ')[-1]

        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
        configUrl = compat_urllib_parse.unquote(configUrl)

        configJSON = self._download_webpage(configUrl, videoId,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': title,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': videoDesc,
            'player_url': playerUrl,
        }

        return [info]
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com.

    Fetches the moogaloop metadata XML, then the Adobe HDS (f4m) manifest,
    and assembles the first segment/fragment URL.

    NOTE(review): this block was mangled in extraction; the info dict
    initialization, try: wrappers and the final url/ext assignments were
    reconstructed — verify against upstream history.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            raise ExtractorError(u'Invalid manifest file')

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com.

    NOTE(review): this block was mangled in extraction; the guard clause
    and result dict were reconstructed — verify against upstream history.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL (percent-encoded inside the flashvars)
        video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
            webpage, u'video URL'))

        # Extract title
        video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
            webpage, u'title')

        # Extract video thumbnail
        video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }

        return [info]
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid

       NOTE(review): this block was mangled in extraction; the guard clause
       and the returned dict were reconstructed — verify against upstream
       history.
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the permalink to the numeric track id via the public API.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid

       NOTE(review): this block was mangled in extraction; the guard clause,
       the error-branch return and the per-track accumulator were
       reconstructed — verify against upstream history.
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the set permalink via the public API.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id': video_id,
                'url': mediaURL,
                'uploader': track['user']['username'],
                'upload_date': unified_strdate(track['created_at']),
                'title': track['title'],
                'ext': u'mp3',
                'description': track['description'],
            })
        return videos
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com.

    NOTE(review): this block was mangled in extraction; the guard clause
    and the result dict were reconstructed — verify against upstream
    history.
    """
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the RTMP path is base64-encoded in the page.
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        video_title = self._search_regex(r'contentTitle = "(.*?)";',
            webpage, u'title')

        # Extract description
        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com.

    NOTE(review): this block was mangled in extraction; try: blocks,
    returns, breaks and the final dict were reconstructed — verify against
    upstream history. The .decode('utf-8') calls on regex groups are
    py2-era behavior; under py3 they would fail on str — left untouched
    (the extractor is marked _WORKING = False).
    """

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe each format in turn until one serves a live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Three modes, chosen by URL: a specific video, a course page (playlist
    of videos), or the root page (playlist of courses). Playlist entries
    are recursively dispatched back through self.extract().

    NOTE(review): this block was mangled in extraction; the three info
    dicts, list comprehensions and try: wrappers were reconstructed —
    verify against upstream history.
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com.

    NOTE(review): this block was mangled in extraction; try: wrappers and
    the result dict were reconstructed. The visible text uses
    'uploader': performer with performer never assigned — a latent
    NameError; the mtv_an meta tag (the artist) is extracted into
    performer here to fix it. Verify against upstream history.
    """

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'title')

        # mtv_an is the artist name; reuse it as the uploader (performer).
        performer = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'performer', fatal=False)

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a pseudo-random session id: millisecond timestamp + two random blocks."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Derive Youku's character shuffle table from the server-provided seed.

        Implements the site's linear-congruential shuffle; returns the list of
        characters in shuffled order.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode the '*'-separated index string into the real file id."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the user-requested format onto Youku's stream names.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    # Page-scraping patterns, kept as class attributes for reuse/testing.
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(self.VIDEO_URL_RE,
            webpage, u'video URL')
        video_url = compat_urllib_parse.unquote(video_url)

        video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
            webpage, u'title')

        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        # Extract update date
        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)
        if upload_date:
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')

        # Extract uploader
        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # Extract title
        # Get the first line for title
        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
            webpage, 'title', default=u'NA')

        # Step 2, Stimulate clicking the image box to launch video
        video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        # The CDN URL is derived directly from the page path; no scraping needed.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        shortened_video_id = video_id.rpartition('/')[2]
        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)

        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # 'uploader_date': uploader_date,
            'description': description,
        }
        return [info]
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # A whole channel: results come back in pages of _JUSTIN_PAGE_LIMIT.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means the channel archive is exhausted.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Try the player heading first; fall back to the page <title>.
        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': video_description,
        }
        return [info]
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game/video pages."""

    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with (?x)-style whitespace, hence re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            # The page is age-gated; re-request through the agecheck URL.
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # The three iterators run in page order, so zip pairs them up correctly.
        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
            }
            videos.append(info)
        return [self.playlist_result(videos, gameID, game_title)]
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # The download URL is derived directly from the video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
        return info
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""

    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        # Pick the container from the URL itself.
        if 'mp4' in video_url:
            ext = 'mp4'
        else:
            ext = 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)

        if not thumbnail:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                video_title = mobj.group(1)

        results = [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
        return results
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        # Show metadata is embedded as a JS assignment; grab the JSON payload.
        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }
        return [info]
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of *formats* whose 'format' equals req_format, else None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The age gate is bypassed with a cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError:
            raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 5th path component encodes "<resolution>_<bitrate>_..."
            format = path.split('/')[4].split('_')[:2]
            format = "-".join( format )
            # title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # FIX: the guard previously tested the undefined name 'result',
            # raising NameError instead of the intended ExtractorError when the
            # requested format is not present.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both id and title come straight from the URL.
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
        if upload_date: upload_date = unified_strdate(upload_date)

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The actual stream URL lives in the embed page's flash setup call.
        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        info = {'id': video_id,
                'url': video_url,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv',
                'player_url': embed_page_url}

        return [info]
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""

    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        # The API only reveals one track at a time; walk the chain until the
        # server flags the last track.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""

    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # Media URLs are derived directly from the video id on Keek's CDN.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }
        return [info]
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""

    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL uses extended syntax, so re.VERBOSE is required.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else:
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
                                        webpage, 'title')
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                       webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                                       webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                                       webpage, 'thumbnail')
        return {
            'id': video_id,
            'url': info['htmlStreams'][-1]['file'],
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'description': desc,
        }
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""

    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # FIX: this fallback previously referenced the undefined name 'ext'
            # (NameError); the file-extension local is called 'extension'.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""

    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            webpage, u'title')

        # Stream variants are listed in a per-video XML document.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last <type> element is the highest-quality variant.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }
        return [info]
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # The og:title carries a site prefix that we strip off.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }

        return [info]
class ARDIE(InfoExtractor):
    """Information Extractor for the ARD / Das Erste Mediathek."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url; prefer an explicit documentId query param
        m = re.match(self._VALID_URL, url)

        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # No streams at all usually means an age-restricted ("fsk") page.
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # choose default media type (0) and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]

        return [info]
class ZDFIE(InfoExtractor):
    """Information Extractor for the ZDF Mediathek."""
    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
    _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
    _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        html = self._download_webpage(url, video_id)
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        # BUGFIX: `streams` is a list and can never be None; test for emptiness.
        if not streams:
            raise ExtractorError(u'No media url found.')

        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
        # choose first/default media type and highest quality for now
        # BUGFIX: initialize so the "no stream" check below cannot hit an
        # unbound local when neither loop matches.
        stream_ = None
        for s in streams: #find 300 - dsl1000mbit
            if s['quality'] == '300' and s['media_type'] == 'wstreaming':
                stream_ = s
                break
        for s in streams: #find veryhigh - dsl2000mbit (preferred over 300)
            if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
                stream_ = s
                break
        if stream_ is None:
            raise ExtractorError(u'No stream found.')

        # The stream URL points at a small metadata document, not the media itself.
        media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')

        self.report_extraction(video_id)
        mobj = re.search(self._TITLE, html)
        if mobj is None:
            raise ExtractorError(u'Cannot extract title')
        title = unescapeHTML(mobj.group('title'))

        # Prefer an mms:// URL; fall back to rtsp:// when none is present.
        mobj = re.search(self._MMS_STREAM, media_link)
        if mobj is None:
            mobj = re.search(self._RTSP_STREAM, media_link)
            if mobj is None:
                raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
        mms_url = mobj.group('video_url')

        mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
        if mobj is None:
            raise ExtractorError(u'Cannot extract extention')
        ext = mobj.group('ext')

        return [{'id': video_id,
                 'url': mms_url,
                 'title': title,
                 'ext': ext
                 }]
class TumblrIE(InfoExtractor):
    """Information Extractor for Tumblr-hosted videos."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The player markup is embedded as a JS-escaped string, hence the \x22
        # (escaped double-quote) delimiters in this pattern.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext
                 }]
class BandcampIE(InfoExtractor):
    """Information Extractor for free Bandcamp tracks.

    Only works for tracks that expose a free-download page; paid tracks
    raise ExtractorError.
    """
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # Renamed from `id` to avoid shadowing the builtin.
        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE | re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': track_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')

        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id':    video_id,
            'url':   video_url,
            'ext':   video_extension,
            'title': video_title,
        }]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # Metadata (including the mp4 URL) lives in a per-video MRSS feed.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id':    video_id,
            'url':   video_url,
            'ext':   video_extension,
            'title': video_title,
        }]
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        # The 'file' key may or may not be quoted in the player config.
        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')

        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
        }]
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        # Drop any query string trailing the og:image URL.
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)

        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Per-photo secret required by the video XML endpoints below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        # Final URL is APP + unescaped FULLPATH.
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id is only available inside the page markup.
        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'thumbnail':   thumbnail,
            'description': video_description,
        }]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            # No server part: 'file' is a complete, URL-encoded URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server') + '/key=' + mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y') + mobj.group('upload_date_m') + mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         video_extension,
            'title':       video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail':   video_thumbnail
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # The site expects these query parameters and sets a session cookie
        # that the serve/source request below must send back.
        data = {'ax': 1, 'ts': time.time()}
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE | re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':     track_id,
            'url':    final_url,
            'ext':    "mp3",
            'title':  title,
            'artist': artist,
        }]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play: URL serves a small page whose JS redirects to the real one.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # Response looks like "key1=url&key2=thumb": keep the values only.
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       ext,
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
class GametrailersIE(InfoExtractor):
    """Information Extractor for gametrailers.com videos, reviews and full episodes."""
    _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        video_type = mobj.group('type')
        webpage = self._download_webpage(url, video_id)
        # Full episodes embed the mgid differently from videos/reviews.
        if video_type == 'full-episodes':
            mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
        else:
            mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
        mgid = self._search_regex(mgid_re, webpage, u'mgid')
        data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})

        info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
                                           video_id, u'Downloading video info')
        links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
                                               video_id, u'Downloading video urls info')

        self.report_extraction(video_id)
        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                      <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                      <image>.*
                        <url>(?P<thumb>.*?)</url>.*
                      </image>'''

        m_info = re.search(info_re, info_page, re.VERBOSE | re.DOTALL)
        if m_info is None:
            raise ExtractorError(u'Unable to extract video info')
        video_title = m_info.group('title')
        video_description = m_info.group('description')
        video_thumb = m_info.group('thumb')

        m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
        # BUGFIX: list(...) is never None — test for emptiness instead; and
        # raise the real ExtractorError (was a misspelled, undefined name).
        if not m_urls:
            raise ExtractorError(u'Unable to extract video url')
        # They are sorted from worst to best quality
        video_url = m_urls[-1].group('url')

        return {'url':         video_url,
                'id':          video_id,
                'title':       video_title,
                # Videos are actually flv not mp4
                'ext':         'flv',
                'thumbnail':   video_thumb,
                'description': video_description,
                }
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    # NOTE(review): this function is truncated in this view — the docstring
    # terminator, the `return [` line, and most of the extractor-instance
    # list are missing; only the three entries below survive. Restore the
    # full list from upstream before relying on this function.
        YoutubePlaylistIE(),
        StanfordOpenClassroomIE(),
        WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up an extractor class by its short name (without the 'IE' suffix)."""
    class_name = ie_name + 'IE'
    return globals()[class_name]