2 # -*- coding: utf-8 -*- 
   4 from __future__ 
import absolute_import
 
  15 import xml
.etree
.ElementTree
 
  26 class InfoExtractor(object): 
  27     """Information Extractor class. 
  29     Information extractors are the classes that, given a URL, extract 
  30     information about the video (or videos) the URL refers to. This 
  31     information includes the real video URL, the video title, author and 
  32     others. The information is stored in a dictionary which is then 
  33     passed to the FileDownloader. The FileDownloader processes this 
  34     information possibly downloading the video to the file system, among 
  35     other possible outcomes. 
  37     The dictionaries must include the following fields: 
  41     title:          Video title, unescaped. 
  42     ext:            Video filename extension. 
  44     The following fields are optional: 
  46     format:         The video format, defaults to ext (used for --get-format) 
  47     thumbnail:      Full URL to a video thumbnail image. 
  48     description:    One-line video description. 
  49     uploader:       Full name of the video uploader. 
  50     upload_date:    Video upload date (YYYYMMDD). 
  51     uploader_id:    Nickname or id of the video uploader. 
  52     location:       Physical location of the video. 
  53     player_url:     SWF Player URL (used for rtmpdump). 
  54     subtitles:      The subtitle file contents. 
  55     urlhandle:      [internal] The urlHandle to be used to download the file, 
  56                     like returned by urllib.request.urlopen 
  58     The fields should all be Unicode strings. 
  60     Subclasses of this one should re-define the _real_initialize() and 
  61     _real_extract() methods and define a _VALID_URL regexp. 
  62     Probably, they should also be added to the list of extractors. 
  64     _real_extract() must return a *list* of information dictionaries as 
  67     Finally, the _WORKING attribute should be set to False for broken IEs 
  68     in order to warn the users and skip the tests. 
  75     def __init__(self
, downloader
=None): 
  76         """Constructor. Receives an optional downloader.""" 
        # NOTE(review): original line 77 is missing from this extraction
        # (likely per-instance state setup) -- recover it from the full file.
  78         self
.set_downloader(downloader
) 
  81     def suitable(cls
, url
): 
  82         """Receives a URL and returns True if suitable for this IE.""" 
  83         return re
.match(cls
._VALID
_URL
, url
) is not None 
  87         """Getter method for _WORKING.""" 
  91         """Initializes an instance (authentication, etc).""" 
  93             self
._real
_initialize
() 
  96     def extract(self
, url
): 
  97         """Extracts URL information and returns it in list of dicts.""" 
        # NOTE(review): original line 98 is missing from this extraction --
        # presumably the self.initialize() call; confirm against the full file.
  99         return self
._real
_extract
(url
) 
 101     def set_downloader(self
, downloader
): 
 102         """Sets the downloader for this IE.""" 
 103         self
._downloader 
= downloader
 
 105     def _real_initialize(self
): 
 106         """Real initialization process. Redefine in subclasses.""" 
 109     def _real_extract(self
, url
): 
 110         """Real extraction process. Redefine in subclasses.""" 
 115         return type(self
).__name
__[:-2] 
 117     def _request_webpage(self
, url_or_request
, video_id
, note
=None, errnote
=None): 
 118         """ Returns the response handle """ 
 120             self
.report_download_webpage(video_id
) 
 121         elif note 
is not False: 
 122             self
.to_screen(u
'%s: %s' % (video_id
, note
)) 
 124             return compat_urllib_request
.urlopen(url_or_request
) 
 125         except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
: 
 127                 errnote 
= u
'Unable to download webpage' 
 128             raise ExtractorError(u
'%s: %s' % (errnote
, compat_str(err
)), sys
.exc_info()[2]) 
 130     def _download_webpage_handle(self
, url_or_request
, video_id
, note
=None, errnote
=None): 
 131         """ Returns a tuple (page content as string, URL handle) """ 
 132         urlh 
= self
._request
_webpage
(url_or_request
, video_id
, note
, errnote
) 
 133         content_type 
= urlh
.headers
.get('Content-Type', '') 
 134         m 
= re
.match(r
'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type
) 
 136             encoding 
= m
.group(1) 
 139         webpage_bytes 
= urlh
.read() 
 140         if self
._downloader
.params
.get('dump_intermediate_pages', False): 
 142                 url 
= url_or_request
.get_full_url() 
 143             except AttributeError: 
 145             self
.to_screen(u
'Dumping request to ' + url
) 
 146             dump 
= base64
.b64encode(webpage_bytes
).decode('ascii') 
 147             self
._downloader
.to_screen(dump
) 
 148         content 
= webpage_bytes
.decode(encoding
, 'replace') 
 149         return (content
, urlh
) 
 151     def _download_webpage(self
, url_or_request
, video_id
, note
=None, errnote
=None): 
 152         """ Returns the data of the page as a string """ 
 153         return self
._download
_webpage
_handle
(url_or_request
, video_id
, note
, errnote
)[0] 
 155     def to_screen(self
, msg
): 
 156         """Print msg to screen, prefixing it with '[ie_name]'""" 
 157         self
._downloader
.to_screen(u
'[%s] %s' % (self
.IE_NAME
, msg
)) 
 159     def report_extraction(self
, id_or_name
): 
 160         """Report information extraction.""" 
 161         self
.to_screen(u
'%s: Extracting information' % id_or_name
) 
 163     def report_download_webpage(self
, video_id
): 
 164         """Report webpage download.""" 
 165         self
.to_screen(u
'%s: Downloading webpage' % video_id
) 
 167     def report_age_confirmation(self
): 
 168         """Report attempt to confirm age.""" 
 169         self
.to_screen(u
'Confirming age') 
 171     #Methods for following #608 
 172     #They set the correct value of the '_type' key 
 173     def video_result(self
, video_info
): 
 174         """Returns a video""" 
 175         video_info
['_type'] = 'video' 
 177     def url_result(self
, url
, ie
=None): 
 178         """Returns a url that points to a page that should be processed""" 
 179         #TODO: ie should be the class used for getting the info 
 180         video_info 
= {'_type': 'url', 
 184     def playlist_result(self
, entries
, playlist_id
=None, playlist_title
=None): 
 185         """Returns a playlist""" 
 186         video_info 
= {'_type': 'playlist', 
 189             video_info
['id'] = playlist_id
 
 191             video_info
['title'] = playlist_title
 
 194     def _search_regex(self
, pattern
, string
, name
, default
=None, fatal
=True, flags
=0): 
 196         Perform a regex search on the given string, using a single or a list of 
 197         patterns returning the first matching group. 
 198         In case of failure return a default value or raise a WARNING or a 
 199         ExtractorError, depending on fatal, specifying the field name. 
 201         if isinstance(pattern
, (str, compat_str
, compiled_regex_type
)): 
 202             mobj 
= re
.search(pattern
, string
, flags
) 
 205                 mobj 
= re
.search(p
, string
, flags
) 
 208         if sys
.stderr
.isatty() and os
.name 
!= 'nt': 
 209             _name 
= u
'\033[0;34m%s\033[0m' % name
 
 214             # return the first matching group 
 215             return next(g 
for g 
in mobj
.groups() if g 
is not None) 
 216         elif default 
is not None: 
 219             raise ExtractorError(u
'Unable to extract %s' % _name
) 
 221             self
._downloader
.report_warning(u
'unable to extract %s; ' 
 222                 u
'please report this issue on GitHub.' % _name
) 
 225     def _html_search_regex(self
, pattern
, string
, name
, default
=None, fatal
=True, flags
=0): 
 227         Like _search_regex, but strips HTML tags and unescapes entities. 
 229         res 
= self
._search
_regex
(pattern
, string
, name
, default
, fatal
, flags
) 
 231             return clean_html(res
).strip() 
 235 class SearchInfoExtractor(InfoExtractor
): 
 237     Base class for paged search queries extractors. 
 238     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query} 
 239     Instances should define _SEARCH_KEY and _MAX_RESULTS. 
 243     def _make_valid_url(cls
): 
 244         return r
'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls
._SEARCH
_KEY
 
 247     def suitable(cls
, url
): 
 248         return re
.match(cls
._make
_valid
_url
(), url
) is not None 
 250     def _real_extract(self
, query
): 
 251         mobj 
= re
.match(self
._make
_valid
_url
(), query
) 
 253             raise ExtractorError(u
'Invalid search query "%s"' % query
) 
 255         prefix 
= mobj
.group('prefix') 
 256         query 
= mobj
.group('query') 
 258             return self
._get
_n
_results
(query
, 1) 
 259         elif prefix 
== 'all': 
 260             return self
._get
_n
_results
(query
, self
._MAX
_RESULTS
) 
 264                 raise ExtractorError(u
'invalid download number %s for query "%s"' % (n
, query
)) 
 265             elif n 
> self
._MAX
_RESULTS
: 
 266                 self
._downloader
.report_warning(u
'%s returns max %i results (you requested %i)' % (self
._SEARCH
_KEY
, self
._MAX
_RESULTS
, n
)) 
 267                 n 
= self
._MAX
_RESULTS
 
 268             return self
._get
_n
_results
(query
, n
) 
 270     def _get_n_results(self
, query
, n
): 
 271         """Get a specified number of results for a query""" 
 272         raise NotImplementedError("This method must be implemented by sublclasses") 
 275 class YoutubeIE(InfoExtractor
): 
 276     """Information extractor for youtube.com.""" 
 280                          (?:https?://)?                                       # http(s):// (optional) 
 281                          (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/| 
 282                             tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains 
 283                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls 
 284                          (?:                                                  # the various things that can precede the ID: 
 285                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/ 
 286                              |(?:                                             # or the v= param in all its forms 
 287                                  (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx) 
 288                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #! 
 289                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx) 
 292                          )?                                                   # optional -> youtube.com/xxxx is OK 
 293                      )?                                                       # all until now is optional -> you can pass the naked ID 
 294                      ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID 
 295                      (?(1).+)?                                                # if we found the ID, everything can follow 
 297     _LANG_URL 
= r
'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' 
 298     _LOGIN_URL 
= 'https://accounts.google.com/ServiceLogin' 
 299     _AGE_URL 
= 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' 
 300     _NEXT_URL_RE 
= r
'[\?&]next_url=([^&]+)' 
 301     _NETRC_MACHINE 
= 'youtube' 
 302     # Listed in order of quality 
 303     _available_formats 
= ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13'] 
 304     _available_formats_prefer_free 
= ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13'] 
 305     _video_extensions 
= { 
 311         '38': 'video', # You actually don't know if this will be MOV, AVI or whatever 
 317     _video_dimensions 
= { 
 336     def suitable(cls
, url
): 
 337         """Receives a URL and returns True if suitable for this IE.""" 
 338         if YoutubePlaylistIE
.suitable(url
): return False 
 339         return re
.match(cls
._VALID
_URL
, url
, re
.VERBOSE
) is not None 
 341     def report_lang(self
): 
 342         """Report attempt to set language.""" 
 343         self
.to_screen(u
'Setting language') 
 345     def report_login(self
): 
 346         """Report attempt to log in.""" 
 347         self
.to_screen(u
'Logging in') 
 349     def report_video_webpage_download(self
, video_id
): 
 350         """Report attempt to download video webpage.""" 
 351         self
.to_screen(u
'%s: Downloading video webpage' % video_id
) 
 353     def report_video_info_webpage_download(self
, video_id
): 
 354         """Report attempt to download video info webpage.""" 
 355         self
.to_screen(u
'%s: Downloading video info webpage' % video_id
) 
 357     def report_video_subtitles_download(self
, video_id
): 
 358         """Report attempt to download video info webpage.""" 
 359         self
.to_screen(u
'%s: Checking available subtitles' % video_id
) 
 361     def report_video_subtitles_request(self
, video_id
, sub_lang
, format
): 
 362         """Report attempt to download video info webpage.""" 
 363         self
.to_screen(u
'%s: Downloading video subtitles for %s.%s' % (video_id
, sub_lang
, format
)) 
 365     def report_video_subtitles_available(self
, video_id
, sub_lang_list
): 
 366         """Report available subtitles.""" 
 367         sub_lang 
= ",".join(list(sub_lang_list
.keys())) 
 368         self
.to_screen(u
'%s: Available subtitles for video: %s' % (video_id
, sub_lang
)) 
 370     def report_information_extraction(self
, video_id
): 
 371         """Report attempt to extract video information.""" 
 372         self
.to_screen(u
'%s: Extracting video information' % video_id
) 
 374     def report_unavailable_format(self
, video_id
, format
): 
 375         """Report extracted video URL.""" 
 376         self
.to_screen(u
'%s: Format %s not available' % (video_id
, format
)) 
 378     def report_rtmp_download(self
): 
 379         """Indicate the download will use the RTMP protocol.""" 
 380         self
.to_screen(u
'RTMP download detected') 
 382     def _get_available_subtitles(self
, video_id
): 
 383         self
.report_video_subtitles_download(video_id
) 
 384         request 
= compat_urllib_request
.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
) 
 386             sub_list 
= compat_urllib_request
.urlopen(request
).read().decode('utf-8') 
 387         except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
: 
 388             return (u
'unable to download video subtitles: %s' % compat_str(err
), None) 
 389         sub_lang_list 
= re
.findall(r
'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list
) 
 390         sub_lang_list 
= dict((l
[1], l
[0]) for l 
in sub_lang_list
) 
 391         if not sub_lang_list
: 
 392             return (u
'video doesn\'t have subtitles', None) 
 395     def _list_available_subtitles(self
, video_id
): 
 396         sub_lang_list 
= self
._get
_available
_subtitles
(video_id
) 
 397         self
.report_video_subtitles_available(video_id
, sub_lang_list
) 
 399     def _request_subtitle(self
, sub_lang
, sub_name
, video_id
, format
): 
 402         (error_message, sub_lang, sub) 
 404         self
.report_video_subtitles_request(video_id
, sub_lang
, format
) 
 405         params 
= compat_urllib_parse
.urlencode({ 
 411         url 
= 'http://www.youtube.com/api/timedtext?' + params
 
 413             sub 
= compat_urllib_request
.urlopen(url
).read().decode('utf-8') 
 414         except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
: 
 415             return (u
'unable to download video subtitles: %s' % compat_str(err
), None, None) 
 417             return (u
'Did not fetch video subtitles', None, None) 
 418         return (None, sub_lang
, sub
) 
 420     def _request_automatic_caption(self
, video_id
, webpage
): 
 421         """We need the webpage for getting the captions url, pass it as an 
 422            argument to speed up the process.""" 
 423         sub_lang 
= self
._downloader
.params
.get('subtitleslang') 
 424         sub_format 
= self
._downloader
.params
.get('subtitlesformat') 
 425         self
.to_screen(u
'%s: Looking for automatic captions' % video_id
) 
 426         mobj 
= re
.search(r
';ytplayer.config = ({.*?});', webpage
) 
 427         err_msg 
= u
'Couldn\'t find automatic captions for "%s"' % sub_lang
 
 429             return [(err_msg
, None, None)] 
 430         player_config 
= json
.loads(mobj
.group(1)) 
 432             args 
= player_config
[u
'args'] 
 433             caption_url 
= args
[u
'ttsurl'] 
 434             timestamp 
= args
[u
'timestamp'] 
 435             params 
= compat_urllib_parse
.urlencode({ 
 442             subtitles_url 
= caption_url 
+ '&' + params
 
 443             sub 
= self
._download
_webpage
(subtitles_url
, video_id
, u
'Downloading automatic captions') 
 444             return [(None, sub_lang
, sub
)] 
 446             return [(err_msg
, None, None)] 
 448     def _extract_subtitle(self
, video_id
): 
 450         Return a list with a tuple: 
 451         [(error_message, sub_lang, sub)] 
 453         sub_lang_list 
= self
._get
_available
_subtitles
(video_id
) 
 454         sub_format 
= self
._downloader
.params
.get('subtitlesformat') 
 455         if  isinstance(sub_lang_list
,tuple): #There was some error, it didn't get the available subtitles 
 456             return [(sub_lang_list
[0], None, None)] 
 457         if self
._downloader
.params
.get('subtitleslang', False): 
 458             sub_lang 
= self
._downloader
.params
.get('subtitleslang') 
 459         elif 'en' in sub_lang_list
: 
 462             sub_lang 
= list(sub_lang_list
.keys())[0] 
 463         if not sub_lang 
in sub_lang_list
: 
 464             return [(u
'no closed captions found in the specified language "%s"' % sub_lang
, None, None)] 
 466         subtitle 
= self
._request
_subtitle
(sub_lang
, sub_lang_list
[sub_lang
].encode('utf-8'), video_id
, sub_format
) 
 469     def _extract_all_subtitles(self
, video_id
): 
 470         sub_lang_list 
= self
._get
_available
_subtitles
(video_id
) 
 471         sub_format 
= self
._downloader
.params
.get('subtitlesformat') 
 472         if  isinstance(sub_lang_list
,tuple): #There was some error, it didn't get the available subtitles 
 473             return [(sub_lang_list
[0], None, None)] 
 475         for sub_lang 
in sub_lang_list
: 
 476             subtitle 
= self
._request
_subtitle
(sub_lang
, sub_lang_list
[sub_lang
].encode('utf-8'), video_id
, sub_format
) 
 477             subtitles
.append(subtitle
) 
 480     def _print_formats(self
, formats
): 
 481         print('Available formats:') 
 483             print('%s\t:\t%s\t[%s]' %(x
, self
._video
_extensions
.get(x
, 'flv'), self
._video
_dimensions
.get(x
, '???'))) 
 485     def _real_initialize(self
): 
 486         if self
._downloader 
is None: 
 491         downloader_params 
= self
._downloader
.params
 
 493         # Attempt to use provided username and password or .netrc data 
 494         if downloader_params
.get('username', None) is not None: 
 495             username 
= downloader_params
['username'] 
 496             password 
= downloader_params
['password'] 
 497         elif downloader_params
.get('usenetrc', False): 
 499                 info 
= netrc
.netrc().authenticators(self
._NETRC
_MACHINE
) 
 504                     raise netrc
.NetrcParseError('No authenticators for %s' % self
._NETRC
_MACHINE
) 
 505             except (IOError, netrc
.NetrcParseError
) as err
: 
 506                 self
._downloader
.report_warning(u
'parsing .netrc: %s' % compat_str(err
)) 
 510         request 
= compat_urllib_request
.Request(self
._LANG
_URL
) 
 513             compat_urllib_request
.urlopen(request
).read() 
 514         except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
: 
 515             self
._downloader
.report_warning(u
'unable to set language: %s' % compat_str(err
)) 
 518         # No authentication to be performed 
 522         request 
= compat_urllib_request
.Request(self
._LOGIN
_URL
) 
 524             login_page 
= compat_urllib_request
.urlopen(request
).read().decode('utf-8') 
 525         except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
: 
 526             self
._downloader
.report_warning(u
'unable to fetch login page: %s' % compat_str(err
)) 
 531         match 
= re
.search(re
.compile(r
'<input.+?name="GALX".+?value="(.+?)"', re
.DOTALL
), login_page
) 
 533           galx 
= match
.group(1) 
 535         match 
= re
.search(re
.compile(r
'<input.+?name="dsh".+?value="(.+?)"', re
.DOTALL
), login_page
) 
 541                 u
'continue': u
'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1', 
 545                 u
'PersistentCookie': u
'yes', 
 547                 u
'bgresponse': u
'js_disabled', 
 548                 u
'checkConnection': u
'', 
 549                 u
'checkedDomains': u
'youtube', 
 555                 u
'signIn': u
'Sign in', 
 557                 u
'service': u
'youtube', 
 561         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode 
 563         login_form 
= dict((k
.encode('utf-8'), v
.encode('utf-8')) for k
,v 
in login_form_strs
.items()) 
 564         login_data 
= compat_urllib_parse
.urlencode(login_form
).encode('ascii') 
 565         request 
= compat_urllib_request
.Request(self
._LOGIN
_URL
, login_data
) 
 568             login_results 
= compat_urllib_request
.urlopen(request
).read().decode('utf-8') 
 569             if re
.search(r
'(?i)<form[^>]* id="gaia_loginform"', login_results
) is not None: 
 570                 self
._downloader
.report_warning(u
'unable to log in: bad username or password') 
 572         except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
: 
 573             self
._downloader
.report_warning(u
'unable to log in: %s' % compat_str(err
)) 
 579                 'action_confirm':   'Confirm', 
 581         request 
= compat_urllib_request
.Request(self
._AGE
_URL
, compat_urllib_parse
.urlencode(age_form
)) 
 583             self
.report_age_confirmation() 
 584             age_results 
= compat_urllib_request
.urlopen(request
).read().decode('utf-8') 
 585         except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
: 
 586             raise ExtractorError(u
'Unable to confirm age: %s' % compat_str(err
)) 
 588     def _extract_id(self
, url
): 
        # Parse *url* against the verbose _VALID_URL regex and pull the
        # YouTube video id out of capture group 2.
 589         mobj 
= re
.match(self
._VALID
_URL
, url
, re
.VERBOSE
) 
        # NOTE(review): original line 590 is missing from this extraction --
        # presumably the `if mobj is None:` guard before the raise below.
 591             raise ExtractorError(u
'Invalid URL: %s' % url
) 
 592         video_id 
= mobj
.group(2) 
        # NOTE(review): original lines 593+ (presumably `return video_id`)
        # are missing -- confirm against the full file.
 595     def _real_extract(self
, url
): 
 596         # Extract original video URL from URL with redirection, like age verification, using next_url parameter 
 597         mobj 
= re
.search(self
._NEXT
_URL
_RE
, url
) 
 599             url 
= 'https://www.youtube.com/' + compat_urllib_parse
.unquote(mobj
.group(1)).lstrip('/') 
 600         video_id 
= self
._extract
_id
(url
) 
 603         self
.report_video_webpage_download(video_id
) 
 604         url 
= 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
 
 605         request 
= compat_urllib_request
.Request(url
) 
 607             video_webpage_bytes 
= compat_urllib_request
.urlopen(request
).read() 
 608         except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
: 
 609             raise ExtractorError(u
'Unable to download video webpage: %s' % compat_str(err
)) 
 611         video_webpage 
= video_webpage_bytes
.decode('utf-8', 'ignore') 
 613         # Attempt to extract SWF player URL 
 614         mobj 
= re
.search(r
'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage
) 
 616             player_url 
= re
.sub(r
'\\(.)', r
'\1', mobj
.group(1)) 
 621         self
.report_video_info_webpage_download(video_id
) 
 622         for el_type 
in ['&el=embedded', '&el=detailpage', '&el=vevo', '']: 
 623             video_info_url 
= ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' 
 624                     % (video_id
, el_type
)) 
 625             video_info_webpage 
= self
._download
_webpage
(video_info_url
, video_id
, 
 627                                     errnote
='unable to download video info webpage') 
 628             video_info 
= compat_parse_qs(video_info_webpage
) 
 629             if 'token' in video_info
: 
 631         if 'token' not in video_info
: 
 632             if 'reason' in video_info
: 
 633                 raise ExtractorError(u
'YouTube said: %s' % video_info
['reason'][0]) 
 635                 raise ExtractorError(u
'"token" parameter not in video info for unknown reason') 
 637         # Check for "rental" videos 
 638         if 'ypc_video_rental_bar_text' in video_info 
and 'author' not in video_info
: 
 639             raise ExtractorError(u
'"rental" videos not supported') 
 641         # Start extracting information 
 642         self
.report_information_extraction(video_id
) 
 645         if 'author' not in video_info
: 
 646             raise ExtractorError(u
'Unable to extract uploader name') 
 647         video_uploader 
= compat_urllib_parse
.unquote_plus(video_info
['author'][0]) 
 650         video_uploader_id 
= None 
 651         mobj 
= re
.search(r
'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage
) 
 653             video_uploader_id 
= mobj
.group(1) 
 655             self
._downloader
.report_warning(u
'unable to extract uploader nickname') 
 658         if 'title' not in video_info
: 
 659             raise ExtractorError(u
'Unable to extract video title') 
 660         video_title 
= compat_urllib_parse
.unquote_plus(video_info
['title'][0]) 
 663         if 'thumbnail_url' not in video_info
: 
 664             self
._downloader
.report_warning(u
'unable to extract video thumbnail') 
 666         else:   # don't panic if we can't find it 
 667             video_thumbnail 
= compat_urllib_parse
.unquote_plus(video_info
['thumbnail_url'][0]) 
 671         mobj 
= re
.search(r
'id="eow-date.*?>(.*?)</span>', video_webpage
, re
.DOTALL
) 
 673             upload_date 
= ' '.join(re
.sub(r
'[/,-]', r
' ', mobj
.group(1)).split()) 
 674             upload_date 
= unified_strdate(upload_date
) 
 677         video_description 
= get_element_by_id("eow-description", video_webpage
) 
 678         if video_description
: 
 679             video_description 
= clean_html(video_description
) 
 681             fd_mobj 
= re
.search(r
'<meta name="description" content="([^"]+)"', video_webpage
) 
 683                 video_description 
= unescapeHTML(fd_mobj
.group(1)) 
 685                 video_description 
= u
'' 
 688         video_subtitles 
= None 
 690         if self
._downloader
.params
.get('writesubtitles', False): 
 691             video_subtitles 
= self
._extract
_subtitle
(video_id
) 
 693                 (sub_error
, sub_lang
, sub
) = video_subtitles
[0] 
 695                     # We try with the automatic captions 
 696                     video_subtitles 
= self
._request
_automatic
_caption
(video_id
, video_webpage
) 
 697                     (sub_error_auto
, sub_lang
, sub
) = video_subtitles
[0] 
 701                         # We report the original error 
 702                         self
._downloader
.report_error(sub_error
) 
 704         if self
._downloader
.params
.get('allsubtitles', False): 
 705             video_subtitles 
= self
._extract
_all
_subtitles
(video_id
) 
 706             for video_subtitle 
in video_subtitles
: 
 707                 (sub_error
, sub_lang
, sub
) = video_subtitle
 
 709                     self
._downloader
.report_error(sub_error
) 
 711         if self
._downloader
.params
.get('listsubtitles', False): 
 712             sub_lang_list 
= self
._list
_available
_subtitles
(video_id
) 
 715         if 'length_seconds' not in video_info
: 
 716             self
._downloader
.report_warning(u
'unable to extract video duration') 
 719             video_duration 
= compat_urllib_parse
.unquote_plus(video_info
['length_seconds'][0]) 
 722         video_token 
= compat_urllib_parse
.unquote_plus(video_info
['token'][0]) 
 724         # Decide which formats to download 
 725         req_format 
= self
._downloader
.params
.get('format', None) 
 727         if 'conn' in video_info 
and video_info
['conn'][0].startswith('rtmp'): 
 728             self
.report_rtmp_download() 
 729             video_url_list 
= [(None, video_info
['conn'][0])] 
 730         elif 'url_encoded_fmt_stream_map' in video_info 
and len(video_info
['url_encoded_fmt_stream_map']) >= 1: 
 732             for url_data_str 
in video_info
['url_encoded_fmt_stream_map'][0].split(','): 
 733                 url_data 
= compat_parse_qs(url_data_str
) 
 734                 if 'itag' in url_data 
and 'url' in url_data
: 
 735                     url 
= url_data
['url'][0] + '&signature=' + url_data
['sig'][0] 
 736                     if not 'ratebypass' in url
: url 
+= '&ratebypass=yes' 
 737                     url_map
[url_data
['itag'][0]] = url
 
 739             format_limit 
= self
._downloader
.params
.get('format_limit', None) 
 740             available_formats 
= self
._available
_formats
_prefer
_free 
if self
._downloader
.params
.get('prefer_free_formats', False) else self
._available
_formats
 
 741             if format_limit 
is not None and format_limit 
in available_formats
: 
 742                 format_list 
= available_formats
[available_formats
.index(format_limit
):] 
 744                 format_list 
= available_formats
 
 745             existing_formats 
= [x 
for x 
in format_list 
if x 
in url_map
] 
 746             if len(existing_formats
) == 0: 
 747                 raise ExtractorError(u
'no known formats available for video') 
 748             if self
._downloader
.params
.get('listformats', None): 
 749                 self
._print
_formats
(existing_formats
) 
 751             if req_format 
is None or req_format 
== 'best': 
 752                 video_url_list 
= [(existing_formats
[0], url_map
[existing_formats
[0]])] # Best quality 
 753             elif req_format 
== 'worst': 
 754                 video_url_list 
= [(existing_formats
[len(existing_formats
)-1], url_map
[existing_formats
[len(existing_formats
)-1]])] # worst quality 
 755             elif req_format 
in ('-1', 'all'): 
 756                 video_url_list 
= [(f
, url_map
[f
]) for f 
in existing_formats
] # All formats 
 758                 # Specific formats. We pick the first in a slash-delimeted sequence. 
 759                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'. 
 760                 req_formats 
= req_format
.split('/') 
 761                 video_url_list 
= None 
 762                 for rf 
in req_formats
: 
 764                         video_url_list 
= [(rf
, url_map
[rf
])] 
 766                 if video_url_list 
is None: 
 767                     raise ExtractorError(u
'requested format not available') 
 769             raise ExtractorError(u
'no conn or url_encoded_fmt_stream_map information found in video info') 
 772         for format_param
, video_real_url 
in video_url_list
: 
 774             video_extension 
= self
._video
_extensions
.get(format_param
, 'flv') 
 776             video_format 
= '{0} - {1}'.format(format_param 
if format_param 
else video_extension
, 
 777                                               self
._video
_dimensions
.get(format_param
, '???')) 
 781                 'url':      video_real_url
, 
 782                 'uploader': video_uploader
, 
 783                 'uploader_id': video_uploader_id
, 
 784                 'upload_date':  upload_date
, 
 785                 'title':    video_title
, 
 786                 'ext':      video_extension
, 
 787                 'format':   video_format
, 
 788                 'thumbnail':    video_thumbnail
, 
 789                 'description':  video_description
, 
 790                 'player_url':   player_url
, 
 791                 'subtitles':    video_subtitles
, 
 792                 'duration':     video_duration
 
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""
    # NOTE(review): this chunk elides several source lines (e.g. `try:` openers,
    # `if mobj is None:` guards, and the `return [{` of the final info dict);
    # only the tokens visible in the excerpt are reproduced below — confirm
    # against the full file before editing.

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page, then POST the age form to disable filtering."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        # NOTE(review): a `try:` line is elided here in the excerpt.
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
        # NOTE(review): the `disclaimer_form = {` opener is elided; only this entry is visible.
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        # NOTE(review): a `try:` line is elided here in the excerpt.
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract video info from a metacafe.com watch URL (delegating yt- ids to the Youtube IE)."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard is elided before this raise.
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        # NOTE(review): branch structure (likely `if mobj is not None:` / `else:`) is elided here.
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            # Extension is taken from the last three characters of the media URL.
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

            # Fallback: parse the flashvars query string for mediaData.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                raise ExtractorError(u'Unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                raise ExtractorError(u'Unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
                raise ExtractorError(u'Unable to extract media URL')
            # JSON-escaped slashes are unescaped by hand.
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            raise ExtractorError(u'Unable to extract title')
        # NOTE(review): `.decode('utf-8')` implies the page is handled as bytes (py2-style) — confirm.
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # NOTE(review): the `return [{` opener of the info dict is elided.
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""
    # NOTE(review): this chunk elides several source lines (`if mobj is None:`
    # guards, the `if key in flashvars:` test in the quality loop, and the
    # `return [{` of the result dict); only visible tokens are reproduced.

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        """Extract the best-quality stream URL plus metadata from a Dailymotion video page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard elided before this raise.
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The id is the part before any '_' suffix or query string.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Cookie disables the family filter so age-restricted pages render.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Scan qualities best-first; the loop body (elided) presumably sets
        # max_quality to the first key present in flashvars — confirm.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self.to_screen(u'Using %s' % key)
            raise ExtractorError(u'Unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            raise ExtractorError(u'Unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
                                             # Looking for official user
                                             r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
                                            webpage, 'video uploader')

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        # Page shows DD-MM-YYYY; reassembled here as YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # NOTE(review): the `return [{` opener of the info dict is elided.
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""
    # NOTE(review): this chunk elides guard lines (`if mobj is None:`) and the
    # `return [{` openers of both info dicts; only visible tokens reproduced.

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extracion process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Extract a video URL and metadata from a photobucket.com page, preferring embedded JSON."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard elided before this raise.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')

        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
        # NOTE(review): `if mobj is not None:` branch structure and `return [{` elided.
            info = json.loads(mobj.group('json'))
                'url':      info[u'downloadUrl'],
                'uploader': info[u'username'],
                # creationDate is a numeric timestamp; formatted as YYYYMMDD.
                'upload_date':  datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
                'title':    info[u'title'],
                'ext':      video_extension,
                'thumbnail': info[u'thumbUrl'],

        # We try looking in other parts of the webpage
        video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
            webpage, u'video URL')

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            raise ExtractorError(u'Unable to extract title')
        # NOTE(review): `.decode('utf-8')` implies bytes handling (py2-style) — confirm.
        video_title = mobj.group(1).decode('utf-8')
        video_uploader = mobj.group(2).decode('utf-8')

        # NOTE(review): the `return [{` opener of the fallback info dict is elided.
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com."""
    # NOTE(review): this chunk elides guard lines (`if m_id is None:`,
    # `if m_info is None:`), the `meta = ...` assignment feeding the else
    # branch, and the `return [{` of the result dict; only visible tokens
    # are reproduced.
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        """Extract video info either via the legacy cosmos REST feed or, when a
        Media CONTENT_ID is present, via the YQL media.streams service."""
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard elided before this raise.
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
        # NOTE(review): the `if m_id is None:` branch opener is elided here.
            # TODO: Check which url parameters are required
            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
            # Verbose multi-line regex over the MRSS XML response.
            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                        <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                        <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                        <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
            self.report_extraction(video_id)
            m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
                raise ExtractorError(u'Unable to extract video info')
            video_title = m_info.group('title')
            video_description = m_info.group('description')
            video_thumb = m_info.group('thumb')
            video_date = m_info.group('date')
            # Convert MM/DD/YYYY to YYYYMMDD.
            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

            # TODO: Find a way to get mp4 videos
            rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
            video_url = m_rest.group('url')
            video_path = m_rest.group('path')
                raise ExtractorError(u'Unable to extract video url')

        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            # Strip the JSONP callback wrapper to get the raw JSON payload.
            json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
            info = json.loads(json_str)
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            # NOTE(review): the assignment of `meta` (presumably res[u'meta']) is elided — confirm.
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it

        # NOTE(review): the `info_dict = {` / `return [...]` opener is elided.
                     'play_path': video_path,
                     'title':video_title,
                     'description': video_description,
                     'thumbnail': video_thumb,
                     'upload_date': video_date,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""
    # NOTE(review): this chunk elides the `try:`/`except:` around the config
    # JSON parse, the `else:` of the codec-quality scan, the loop `break`, and
    # the `return [{` of the result dict; only visible tokens reproduced.

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'

    def _real_extract(self, url, new_video=True):
        """Extract stream URL and metadata from a Vimeo page via its embedded config JSON."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard elided before this raise.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            # Scheme-less URLs are normalized to https.
            url = 'https://' + url
        if mobj.group('direct_link') or mobj.group('pro'):
            # Direct-link and vimeopro URLs are redirected to the canonical page.
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        # NOTE(review): a `try:` line is elided before these statements.
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        # NOTE(review): the matching `except:` line is elided here.
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                # NOTE(review): the `else:` introducing this fallback append is elided.
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first available entry in quality preference order.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
        # NOTE(review): a `break` and the `else:` before this raise are elided.
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        # NOTE(review): the `return [{` opener of the info dict is elided.
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""
    # NOTE(review): this chunk elides `try:` openers, `if mobj is None:` guards,
    # several regex/flag arguments inside grep_webpage calls, `return` lines,
    # and the closing parts of the result dicts; only visible tokens reproduced.

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download a page's raw bytes, wrapping network errors in ExtractorError."""
        request = compat_urllib_request.Request(url)
        # NOTE(review): a `try:` line is elided here.
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # NOTE(review): the `return webpage` line is elided.

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex, and collect named groups per matchTuples into an info dict.

        matchTuples is an iterable of (group_index, key, error_message)."""
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        # NOTE(review): `if mobj is None:` guard and `info = {}` init are elided.
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            # NOTE(review): an `else:` is elided before this assignment.
                info[key] = mobj.group(i)
        # NOTE(review): the `return info` line is elided.

    def extractLiveStream(self, url):
        """Resolve a live-stream URL by chaining grep_webpage lookups through the videothek JS."""
        # Language code is the 4th-from-last path segment for live URLs.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            r'src="(.*?/videothek_js.*?\.js)',
                (1, 'url', u'Invalid URL: %s' % url)
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an Arte+7 stream by following videorefFileUrl -> per-language ref -> HD url."""
        # Language code is the 3rd-from-last path segment for +7 URLs.
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
                (1, 'url', u'Invalid URL: %s' % url)
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
                (1, 'url', u'Could not find <video> tag: %s' % url)
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)

        # NOTE(review): the `return {` opener of the info dict is elided.
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  unified_strdate(info.get('date')),
            'title':        info.get('title').decode('utf-8'),

    def _real_extract(self, url):
        """Dispatch to the live-stream or Arte+7 extraction path based on the URL shape."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
        # NOTE(review): an `else:` (and likely a `return`) is elided around this call.
            info = self.extractPlus7Stream(url)
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""
    # NOTE(review): this chunk elides method bodies' small pieces (the
    # `return "HEAD"` of HeadRequest.get_method, docstring quotes, `headers=`
    # kwargs, `if mobj is None:` guards, a `try:` opener, and the `return [{`
    # of the result dict); only visible tokens reproduced.

    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # NOTE(review): the `return "HEAD"` body is elided.
            def get_method(self):

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            # NOTE(review): triple-quote lines of this docstring are elided:
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-describing headers on redirect of a HEAD request.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       origin_req_host=req.get_origin_req_host(),
                # NOTE(review): `headers=newheaders, unverifiable=True)` and the `else:` are elided.
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            # NOTE(review): triple-quote lines of this docstring are elided:
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            def http_error_405(self, req, fp, code, msg, headers):
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 origin_req_host=req.get_origin_req_host(),

        # Build a minimal opener with our HEAD-preserving redirect handling.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()
        # NOTE(review): the comparison of new_url against url (returning False
        # when unchanged) is elided before this report.
        self.report_following_redirect(new_url)

    def _real_extract(self, url):
        """Last-resort extraction: follow redirects, then scrape common embed patterns."""
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        # NOTE(review): a `try:` line is elided here.
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # NOTE(review): `if mobj is None:` guards are elided before each fallback below.
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
            # Try to find twitter cards info
            mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        video_title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'video title')

        # video uploader is domain name
        video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
            url, u'video uploader')

        # NOTE(review): the `return [{` opener of the info dict is elided.
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries."""
    # NOTE(review): this chunk elides the initialization of `video_ids`,
    # `pagenum`, and `limit` before the while loop, plus a `try:` opener
    # and an early-break condition; only visible tokens reproduced.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # Page through the GData API 50 results at a time until `limit` is reached.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            # NOTE(review): a `try:` line is elided here.
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Cap the target at the API's reported total number of items.
            limit = min(n, api_response['totalItems'])
        # NOTE(review): a `pagenum += 1` (or similar) loop increment is elided.

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""
    # NOTE(review): this chunk elides the `res = {` playlist-dict opener, the
    # `e = {` entry construction, the loop's early-exit bookkeeping, and the
    # final `return res`; only visible tokens reproduced.
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'

    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # NOTE(review): the `res = {` opener for this playlist dict is elided.
            '_type': 'playlist',

        for pagenum in itertools.count(1):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            # Each organic result anchor becomes one playlist entry.
            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                # NOTE(review): the `e = {` entry-dict opener is elided.
                    'url': mobj.group(1)
                res['entries'].append(e)

            # Stop when enough results were seen or no "next page" link exists.
            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # NOTE(review): this excerpt elides some original lines; the opening of
        # the result dict is missing — fragments kept verbatim.
            '_type': 'playlist',
        # Yahoo paginates 30 results per page; the page counter is 0-based
        # (user-facing progress below reports pagenum+1).
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            # The o=js endpoint returns JSON, not HTML.
            info = json.loads(webpage)
            results = info[u'results']

            for (i, r) in enumerate(results):
                # Stop once the requested number of results is reached.
                if (pagenum * 30) +i >= n:
                # Each result entry carries a screen.yahoo.com permalink.
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            # NOTE(review): ``m`` presumably holds pagination metadata from the
            # JSON response (its assignment is elided here) — TODO confirm.
            if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Verbose pattern (matched with re.VERBOSE below); captures the playlist
    # id either from a p/a/list query parameter or from a bare PL/EC/UU id.
    # NOTE(review): parts of the pattern are elided in this excerpt.
    _VALID_URL = r"""(?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
    # GData v2 JSON feed, paginated via max-results / start-index.
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    IE_NAME = u'youtube:playlist'

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is required because _VALID_URL is written as a verbose pattern.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        # Either capture group may have matched, depending on the URL form.
        playlist_id = mobj.group(1) or mobj.group(2)
            # start-index is 1-based; each page fetches _MAX_RESULTS entries.
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS

            # Keep (position, url) pairs so playlist order can be restored below.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            # A short page means this was the last page of the feed.
            if len(response['feed']['entry']) < self._MAX_RESULTS:

        # Sort by playlist position, then drop the position component.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    # First (HTML) channel page, sorted by date added.
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # Marker present in a page while more pages can be loaded.
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    # AJAX endpoint used for subsequent pages (returns JSON).
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        # Collect unique video ids from /watch?v=... links, preserving order.
        # NOTE(review): initialization and return of ids_in_page are elided here.
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
                pagenum = pagenum + 1

                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                # AJAX responses are JSON; the page HTML lives under 'content_html'.
                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # The "load more" widget HTML tells whether another page exists.
                if self._MORE_PAGES_INDICATOR  not in page['load_more_widget_html']:

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        # Wrap each id as a url_result entry handled by the Youtube extractor.
        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # GData caps each response at 50 entries, hence the paging below.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
            # GData start-index is 1-based.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Extract video identifiers
            # NOTE(review): initialization of ids_in_page is elided in this excerpt.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Mobile episode-list endpoint; %s is filled with the numeric user id below.
        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        # The numeric user id is embedded in the user page markup.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            # NOTE(review): initialization of ids_in_page is elided in this excerpt.
            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # NOTE(review): _PAGE_SIZE is not defined in the visible lines —
            # presumably a class attribute elided from this excerpt.
            if len(ids_in_page) < self._PAGE_SIZE:

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    # (?#locale) is a regex comment — the ../ segment is an arbitrary locale prefix.
    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        # POST request: the urlencoded form data simulates pressing the button.
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's restriction notice.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        # Extension without the leading dot.
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    # Mobile login endpoint used for authenticating before extraction.
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        # Nothing to do without a downloader (no params to read).
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # .netrc problems are non-fatal: continue unauthenticated.
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # No credentials available — skip login entirely.
        if useremail is None:

        # NOTE(review): construction of login_form is elided in this excerpt.
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means authentication failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # Login failures only warn; extraction may still work for public videos.
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # Anchors delimiting the JSON parameter block inside the player setup JS.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # The 'params' value is URL-encoded JSON.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream; fall back to SD when unavailable.
        video_url = video_data.get('hd_src')
            video_url = video_data['sd_src']
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
            'title': video_title,
            'duration': video_duration,
            'thumbnail': thumbnail,
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    # Captures the trailing filename extension of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # See https://github.com/rg3/youtube-dl/issues/857
        # api.swf fragment URLs are normalized to /play/g_<id> first.
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ URLs redirect; the real file id sits in the redirect's fragment.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            # Re-enter extraction with the resolved canonical URL.
            return self._real_extract(url)

        # Ask for the JSON representation of the page.
        # NOTE(review): the assignment of cchar ('?' or '&') is elided in this excerpt.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different (richer) data to the iTunes user agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                    'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' key.
                if 'Post' in json_data:
                    data = json_data['Post']

                # datestamp like '05-19-13 10:00AM' -> 'YYYYMMDD'.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                    'id': data['item_id'],
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self,data, key):
        # RC4: key-scheduling over a 256-entry state box...
        # NOTE(review): initialization of x/y/out and the second loop header
        # are elided in this excerpt.
        box = list(range(256))
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
            # ...then the PRGA phase: swap state and XOR the keystream byte.
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
        # NOTE(review): this return belongs to a separate md5 helper whose
        # ``def`` line is elided in this excerpt.
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Double-base64-encoded key material (decoded twice further below).
        # NOTE(review): the opening of this literal (presumably ``GK = (``) is elided.
          b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
          b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
          b'TnpsbA0KTVRkbU1tSTRNdz09'

        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Easy path: a plain <source src=...> element means an unencrypted flv.
        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            video_title = self._html_search_regex('<title>([^<]+)</title>',
            video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')

                'upload_date':  None,
                'title':    video_title,

        # Otherwise the player config is delivered as RC4-encrypted XML.
        mobj = re.search('var flashvars={(.+?)}', webpage)
            raise ExtractorError(u'Unable to extract video')

        # Split the flashvars blob into key/value pairs; '_encxml' carries the
        # encrypted-XML endpoint, everything else becomes a query parameter.
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        # The MTV player variant is not supported; switch to the D player.
        if 'flash_playertype=MTV' in xmldata_url:
            self._downloader.report_warning(u'avoiding MTV player')
                'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'

        # Response looks like 'something=<hex>'; keep the hex payload only.
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
            # NOTE(review): the enclosing expression building ``sk`` is partly
            # elided in this excerpt.
            base64.b64decode(base64.b64decode(GK)) +
                str(video_id).encode('utf-8')
        # Decrypt the player XML with the derived RC4 key.
        dec_data = self.__rc4crypt(enc_data_b, sk)

        self.report_extraction(video_id)

        # rtmp case: connectionurl present in the decrypted data.
        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
            video_url = compat_urllib_parse.unquote(mobj.group(1))
            if 'myvideo2flash' in video_url:
                self._downloader.report_warning(u'forcing RTMPT ...')
                video_url = video_url.replace('rtmpe://', 'rtmpt://')

            # extract non rtmp videos
            mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
                raise ExtractorError(u'unable to extract url')
            video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
        video_file = compat_urllib_parse.unquote(video_file)

        if not video_file.endswith('f4m'):
            # rtmp play path: 'prefix:path' derived from 'path.prefix'.
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
            # f4m manifests have an equivalent m3u8 HLS playlist.
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
        video_swfobj = compat_urllib_parse.unquote(video_swfobj)

        video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
            'tc_url':             video_url,
            'upload_date':        None,
            'title':              video_title,
            'play_path':          video_playpath,
            'video_file':         video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url':         video_swfobj,
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    # Verbose pattern, matched with re.VERBOSE throughout this class.
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))

    # Known bitrates, ascending; the last entry is picked as "highest" below.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # NOTE(review): the bodies of these two mappings are elided in this excerpt.
    _video_extensions = {
    _video_dimensions = {

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        # Pretty-print 'bitrate : extension [dimensions]' for --list-formats.
        print('Available formats:')
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Expand shortname abbreviations (:tds, :colbert, ...) to a full URL
        # and re-match so the named groups are populated.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
                epTitle = mobj.group('cntitle')
            # Bare show URL (no specific episode) means "download newest".
            dlNewest = not mobj.group('episode')
                epTitle = mobj.group('showname')
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
            # Follow the redirect to the newest episode and re-match.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        # Primary way of locating the mtvnservices media URI in the page.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.
            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # The MRSS feed lists one <item> per episode part.
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            # Per-part configuration XML listing the available renditions.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # Collect (bitrate, rtmp-url) pairs from the renditions.
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
                    format, rtmp_video_url = f, v

            # Rewrite the rtmp URL into a direct progressive-download URL.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            # 1-based part numbering in the effective title.
            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)

                'upload_date': officialDate,
                'description': officialTitle,
            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist.

    Finds the og:video player URL in the page, unquotes the flash
    'config=' parameter out of it, downloads that configuration
    (JavaScript masquerading as JSON) and reads the real media URL
    from its playlist.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(videoId)
        webpage = self._download_webpage(url, videoId)

        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)

        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)

        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
            webpage, u'player url')

        # BUGFIX: the error label used to say u'player url' (copy/paste from
        # the search above); a failed title match now reports the right field.
        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
            webpage, u'title').split(' : ')[-1]

        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
        configUrl = compat_urllib_parse.unquote(configUrl)

        configJSON = self._download_webpage(configUrl, videoId,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        # NOTE(review): 'id'/'url'/'ext' entries reconstructed from context;
        # not all dict lines are visible in this chunk - confirm against VCS.
        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': title,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': videoDesc,
            'player_url': playerUrl,
        }

        return [info]
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com.

    Downloads the moogaloop metadata XML for the video, then the Adobe
    f4m manifest it references, and composes the final fragment URL from
    the manifest's media/id nodes.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # BUGFIX: this used to repeat 'Unable to download video info XML',
            # misreporting which resource failed; it is the manifest here.
            raise ExtractorError(u'Unable to download video manifest XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid manifest file')

        # Compose the fragment URL; the last two characters of the manifest id
        # are dropped before appending the media node id.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        # NOTE(review): trailing lines (info['url'], 'ext', return) are not
        # visible in this chunk - reconstructed from the established pattern.
        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL (the flash player's flv_url parameter, urlencoded)
        video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
            webpage, u'video URL'))

        # Extract title
        video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
            webpage, u'title')

        # Extract video thumbnail
        # BUGFIX: the dots in 'xvideos.com' were unescaped and matched any
        # character; they are now escaped so only the real domain matches.
        video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos\.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }

        return [info]
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract the slug of the song title (also in the url)
        slug_title = mobj.group(2)
        # CLEANUP: removed the unused 'simple_title' local (uploader + slug).
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the permalink to the API resource describing the track.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        # NOTE(review): 'id'/'url'/'ext' entries reconstructed; not all dict
        # lines are visible in this chunk.
        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract the slug of the set title (also in the url)
        slug_title = mobj.group(2)
        # CLEANUP: removed the unused 'simple_title' local (uploader + slug).
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the permalink to the API resource describing the set.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            # NOTE(review): bail out after reporting API errors; the line
            # following the loop is not visible in this chunk - confirm.
            return

        videos = []
        self.report_extraction(full_title)
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id': video_id,
                'url': mediaURL,
                'uploader': track['user']['username'],
                'upload_date': unified_strdate(track['created_at']),
                'title': track['title'],
                'ext': u'mp3',
                'description': track['description'],
            })
        return videos
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the page embeds a base64-encoded, urlencoded
        # stream id in the 'jsclassref' JavaScript variable.
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        video_title = self._search_regex(r'contentTitle = "(.*?)";',
            webpage, u'title')

        # Extract description
        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        video_filename = video_url.split('/')[-1]
        # BUGFIX: split('.') raised ValueError when the filename contained
        # more than one dot; rsplit splits only on the extension separator.
        video_id, extension = video_filename.rsplit('.', 1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None
        # no working url found
        return None

    def _print_formats(self, formats):
        """Print the available format/bitrate/extension combinations."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        # NOTE(review): .decode('utf-8') assumes Python 2 byte strings; this
        # breaks under Python 3 (and _WORKING is already False).
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # BUGFIX: check_urls returns None when no URL is reachable; previously
        # the code fell through to file_url.decode(...) and crashed with an
        # unhelpful AttributeError.
        if file_url is None:
            raise ExtractorError(u'Unable to find a working media URL')

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three URL shapes: a specific video (course + video), a course
    page (recurses into each VideoPage link) and the root page (recurses
    into each CoursePage link).
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            # BUGFIX: pattern made a raw string; the bare '\?' escape is
            # deprecated in non-raw literals on Python 3.6+.
            links = orderedSet(re.findall(r'<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # BUGFIX: raw string, same reason as above.
            links = orderedSet(re.findall(r'<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'title')

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        # BUGFIX: the dots in 'MTVN.Player.defaultPlaylistId' were unescaped
        # and matched any character; they are now escaped.
        content_id = self._search_regex(r'MTVN\.Player\.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            # NOTE(review): 'performer' is referenced here in the original but
            # its definition is not visible in this chunk - confirm upstream.
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Videos are served in numbered segments; the real file id for each
    segment is derived from an obfuscated 'fileid' using a seeded
    pseudo-random shuffle of a fixed character table.
    """

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Generate a pseudo-unique session id: ms timestamp + two randoms."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Shuffle the fixed character table with Youku's seeded PRNG."""
        mixed = []
        # BUGFIX: '\:' is an invalid escape (DeprecationWarning on 3.6+);
        # '\\:' yields the identical two characters backslash + colon.
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Translate the '*'-separated obfuscated id into the real file id."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # NOTE(review): the exact format-selection branches are only
            # partially visible in this chunk - reconstructed; confirm.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    # BUGFIX: the dot in 'XNXX.COM' was unescaped and matched any character;
    # it is now escaped so only the literal site name matches.
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX\.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # The flash player's flv_url parameter, urlencoded.
        video_url = self._search_regex(self.VIDEO_URL_RE,
            webpage, u'video URL')
        video_url = compat_urllib_parse.unquote(video_url)

        video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
            webpage, u'title')

        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        # Extract update date
        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)
        if upload_date:
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')

        # Extract uploader
        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # Get the first line for title
        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
            webpage, 'title', default=u'NA')

        # Step 2, Stimulate clicking the image box to launch video
        video_page = self._search_regex(r'"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Extract video links of all sizes
        # (was a stray string literal; made a real comment, and the pattern a
        # raw string so '\d'/'\:' are not deprecated escapes)
        pattern = r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # BUGFIX: sorting (resolution, url) tuples lexicographically compared
        # resolutions as strings ('1080' < '720'), so "highest resolution"
        # could pick a lower one; sort numerically instead.
        links = sorted(mobj, key=lambda pair: int(pair[0]))

        # Choose the last of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages.

    The media URL is composed directly from the path captured out of the
    page URL; the page itself is only fetched for title and description.
    """

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)
        webpage = self._download_webpage(url, video_id)

        # The CDN URL is deterministic given the path from the page URL.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        shortened_video_id = video_id.rpartition('/')[2]
        raw_title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id)
        title = raw_title.replace('NBA.com: ', '')

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)

        return [{
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # 'uploader_date': uploader_date,
            'description': description,
        }]
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv.

    Handles three URL shapes: a whole channel archive (paged through the
    justin.tv API), a single archived broadcast, and a chapter of a broadcast.
    NOTE(review): source text was garbled/incomplete; pagination scaffolding
    reconstructed — verify against upstream youtube-dl history.
    """
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100  # API page size for channel archives
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one API page and convert its clips to info dicts."""
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # An error response is a dict, not a list of clips.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # 'start_time' begins with an ISO date; strip the dashes.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Dispatch on channel / chapter / single-broadcast URLs."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Whole channel: page through the archive API.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means we reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com.

    NOTE(review): source text was garbled/incomplete; body reconstructed —
    verify against upstream youtube-dl history.
    """
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Two title patterns: the player header is preferred, <title> is the fallback.
        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': video_description,
        }
        return [info]
class SteamIE(InfoExtractor):
    """Information extractor for Steam store video pages.

    Returns a playlist of all movie trailers found on a game's video page,
    passing an age gate when necessary.
    NOTE(review): source text was garbled/incomplete; body reconstructed —
    verify against upstream youtube-dl history.
    """
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is a verbose-mode regex.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            # Retry through the age-gate URL with a fixed birth date.
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # The three regexes are assumed to match in the same order on the page.
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
            }
            videos.append(info)
        return [self.playlist_result(videos, gameID, game_title)]
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos.

    NOTE(review): source text was garbled/incomplete; body reconstructed —
    verify against upstream youtube-dl history.
    """
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # Direct CDN URL; the page is only needed for metadata.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': video_title,
                'uploader': uploader,
                'thumbnail': thumbnail,
               }
        return [info]
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com.

    NOTE(review): source text was garbled/incomplete; the mp4/flv extension
    branch was missing and has been reconstructed — verify against upstream.
    """
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        # Pick the container extension from the stream URL.
        if 'mp4' in video_url:
            ext = 'mp4'
        else:
            ext = 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)

        if not thumbnail:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                video_title = mobj.group(1)

        results = [{
                    'id': video_id,
                    'url': video_url,
                    'title' : video_title,
                    'thumbnail' : thumbnail,
                    'ext' : ext,
                    }]
        return results
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows.

    Metadata comes from a JSON blob assigned to `gon.show` in the page.
    NOTE(review): source text was garbled/incomplete; body reconstructed —
    verify against upstream youtube-dl history.
    """
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # '&cbr=256' requests the 256 kbit/s stream variant.
        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
        info = {
                'id': video_id,
                'url': video_url,
                'ext': video_ext,
                'title': data['title'],
                'description': data.get('teaser_text'),
                'location': data.get('country_of_origin'),
                'uploader': data.get('host', {}).get('name'),
                'uploader_id': data.get('host', {}).get('slug'),
                'thumbnail': data.get('image', {}).get('large_url_2x'),
                'duration': data.get('duration'),
        }
        return [info]
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com.

    NOTE(review): source text was garbled/incomplete; body reconstructed.
    Also fixes the format-selection tail, which tested an undefined name
    instead of the result of _specific() — verify against upstream history.
    """
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of `formats` whose 'format' equals req_format, else None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The age gate is bypassed with a cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError:
            raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 5th path element encodes resolution and bitrate, e.g. "480p_370k_...".
            format = path.split('/')[4].split('_')[:2]
            format = "-".join( format )
            # title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com.

    NOTE(review): source text was garbled/incomplete; body reconstructed —
    verify against upstream youtube-dl history.
    """
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
        if upload_date: upload_date = unified_strdate(upload_date)

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com.

    NOTE(review): source text was garbled/incomplete; body reconstructed —
    verify against upstream youtube-dl history.
    """
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # Get the video URL from the embedded player setup.
        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        info = {'id': video_id,
                'url': video_url,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv',
                'player_url': embed_page_url}

        return [info]
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes.

    Walks the playback API track-by-track until 'at_last_track' is set.
    NOTE(review): source text was garbled/incomplete; the visible code used
    `mix_id` without a visible binding — `mix_id = data['id']` reconstructed;
    verify against upstream youtube-dl history.
    """
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # A random session id is enough for the playback API.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
class KeekIE(InfoExtractor):
    """Information extractor for keek.com.

    NOTE(review): source text was garbled/incomplete; body reconstructed —
    verify against upstream youtube-dl history.
    """
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # Video and thumbnail URLs follow a fixed CDN pattern.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': video_title,
                'thumbnail': thumbnail,
                'uploader': uploader
        }
        return [info]
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists.

    NOTE(review): source text was garbled/incomplete; body reconstructed —
    verify against upstream youtube-dl history.
    """
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is a verbose-mode regex.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else:
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        # Each talk is delegated back to this extractor via url_result.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
                                        webpage, 'title')
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                    webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                                       webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                                       webpage, 'thumbnail')
        talk_info = {
                'id': info['id'],
                'url': info['htmlStreams'][-1]['file'],
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumbnail,
                'description': desc,
                }
        return talk_info
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de.

    Metadata is fetched from a per-video XML endpoint.
    NOTE(review): source text was garbled/incomplete; the fallback branches
    for format/description/thumbnail were missing and have been
    reconstructed — verify against upstream youtube-dl history.
    """
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            format = 'mp4'
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos.

    Stream info comes from a per-video XML document; the last entry in the
    document is taken as the preferred format.
    NOTE(review): source text was garbled/incomplete; body reconstructed —
    verify against upstream youtube-dl history.
    """
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            webpage, u'title')

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }
        return [info]
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com.

    NOTE(review): source text was garbled/incomplete; body reconstructed —
    verify against upstream youtube-dl history.
    """
    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # Strip the site branding from the og:title.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }

        return [info]
class ARDIE(InfoExtractor):
    """Information extractor for the ARD Mediathek.

    Scrapes `mediaCollection.addMediaStream(...)` calls and picks the
    highest-quality default-media-type stream; the result is either an RTMP
    stream or a direct HTTP download.
    NOTE(review): source text was garbled/incomplete; body reconstructed —
    verify against upstream youtube-dl history.
    """
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # A documentId query parameter takes precedence over the path slug.
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # No streams means the page is age-restricted (FSK marker).
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
class ZDFIE(InfoExtractor):
    """Information Extractor for the ZDF Mediathek (zdf.de)."""
    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
    _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
    _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'

    def _real_extract(self, url):
        """Extract an mms:// (or rtsp:// fallback) stream URL from a ZDF page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        html = self._download_webpage(url, video_id)
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        # was `if streams is None:` — finditer always yields a list, so the
        # old check could never fire; test emptiness instead
        if not streams:
            raise ExtractorError(u'No media url found.')

        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
        # choose first/default media type and highest quality for now;
        # initialize to None so the "no stream" check below is reachable
        stream_ = None
        for s in streams:        # find 300 - dsl1000mbit
            if s['quality'] == '300' and s['media_type'] == 'wstreaming':
                stream_ = s
                break
        for s in streams:        # find veryhigh - dsl2000mbit
            if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
                stream_ = s
                break
        if stream_ is None:
            raise ExtractorError(u'No stream found.')

        # the page links to an intermediate document that holds the real URL
        media_link = self._download_webpage(stream_['video_url'], video_id, 'Get stream URL')

        self.report_extraction(video_id)
        mobj = re.search(self._TITLE, html)
        if mobj is None:
            raise ExtractorError(u'Cannot extract title')
        title = unescapeHTML(mobj.group('title'))

        mobj = re.search(self._MMS_STREAM, media_link)
        if mobj is None:
            # fall back to rtsp when no mms link is present
            mobj = re.search(self._RTSP_STREAM, media_link)
            if mobj is None:
                raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
        mms_url = mobj.group('video_url')

        # derive the container extension from the stream URL
        mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
        if mobj is None:
            raise ExtractorError(u'Cannot extract extension')
        ext = mobj.group('ext')

        return [{'id': video_id,
                 'url': mms_url,
                 'title': title,
                 'ext': ext
                 }]
class TumblrIE(InfoExtractor):
    """Information Extractor for Tumblr-hosted videos."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Extract the video file URL embedded (hex-escaped) in a Tumblr post page."""
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # canonicalize to the /post/ URL regardless of how we were called
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # the markup is embedded in javascript, so quotes appear as \x22
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail:
            # strip the javascript backslash escapes
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext
                 }]
class BandcampIE(InfoExtractor):
    """Information Extractor for free Bandcamp tracks (mp3-320 only)."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Follow the free-download page and rebuild the final mp3 URL."""
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # renamed from `id` — don't shadow the builtin
        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE | re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': track_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Extract the mp4 source URL and title from a redtube page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # direct <source> tag carries the downloadable URL
        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')

        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
        }]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        """Extract the mp4 URL from the player's MRSS feed for the video id."""
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # the player exposes an MRSS document per video id
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
        }]
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Extract the mobile mp4 URL plus og: metadata from a Howcast page."""
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        # the player config may quote 'file' with or without apostrophes
        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')

        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
        }]
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Extract the twitter:player:stream URL and og: metadata from a Vine page."""
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        # strip any trailing query string from the thumbnail URL
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)

        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Resolve a Flickr video through its two-step playlist API."""
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # the per-video secret is needed for both API calls below
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        # final URL = RTMP app + unescaped full path
        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Resolve the numeric video id from the page, then fetch its CVP XML."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # the slug URL only carries a title; the numeric id is in the markup
        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # per-video XML document with the stream files
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'thumbnail':   thumbnail,
            'description': video_description,
        }]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Extract the media URL from the player config embedded in the page."""
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            # no server part: 'file' is a complete, URL-encoded URL
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server') + '/key=' + mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        # upload date is embedded in a tooltip hint as YYYY-MM-DD hh:mm:ss TZ
        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y') + mobj.group('upload_date_m') + mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         video_extension,
            'title':       video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail':   video_thumbnail
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Fetch the track page (keeping its cookie) and resolve the serve URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # the site expects these query parameters on the first request
        data = {'ax': 1, 'ts': time.time()}
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # the session cookie must be replayed on the serve request below
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE | re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      'mp3',
            'title':    title,
            'artist':   artist,
        }]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        """Follow the javascript redirect, then POST to magare.do for the media URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # the play page redirects via a window.location assignment
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # the <title> holds "<name>/..."; keep only the part before the slash
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # response is "url=<media>&thumb=<thumbnail>"; keep the values only
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       ext,
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
class GametrailersIE(InfoExtractor):
    """Information Extractor for gametrailers.com videos, reviews and full episodes."""
    _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'

    def _real_extract(self, url):
        """Resolve the mgid, then query the mrss/mediagen feeds for metadata and URLs."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        video_type = mobj.group('type')
        webpage = self._download_webpage(url, video_id)
        # full episodes embed the mgid differently from regular videos/reviews
        if video_type == 'full-episodes':
            mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
        else:
            mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
        mgid = self._search_regex(mgid_re, webpage, u'mgid')
        data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})

        info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
                                           video_id, u'Downloading video info')
        links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
                                               video_id, u'Downloading video urls info')

        self.report_extraction(video_id)
        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                      <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                      <image>.*
                        <url>(?P<thumb>.*?)</url>.*
                      </image>'''

        m_info = re.search(info_re, info_page, re.VERBOSE | re.DOTALL)
        if m_info is None:
            raise ExtractorError(u'Unable to extract video info')
        video_title = m_info.group('title')
        video_description = m_info.group('description')
        video_thumb = m_info.group('thumb')

        m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
        if m_urls is None or len(m_urls) == 0:
            # fixed: was `raise ExtractError(u'Unable to extrat video url')` —
            # a NameError (no such class) plus a message typo
            raise ExtractorError(u'Unable to extract video url')
        # They are sorted from worst to best quality
        video_url = m_urls[-1].group('url')

        return {'url':         video_url,
                'id':          video_id,
                'title':       video_title,
                # Videos are actually flv not mp4
                'ext':         'flv',
                'thumbnail':   video_thumb,
                'description': video_description,
                }
# NOTE(review): the body of this registry function is only partially visible in
# this chunk — most of the extractor list between YoutubePlaylistIE() and
# WorldStarHipHopIE() is elided. Do not treat the entries below as complete.
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
        YoutubePlaylistIE(),
        StanfordOpenClassroomIE(),
        WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the <Name>IE convention and are defined at
    # module level, so a plain globals() lookup resolves the class.
    class_name = ie_name + 'IE'
    return globals()[class_name]