2 # -*- coding: utf-8 -*- 
   4 from __future__ 
import absolute_import
 
  15 import xml
.etree
.ElementTree
 
  26 class InfoExtractor(object): 
  27     """Information Extractor class. 
  29     Information extractors are the classes that, given a URL, extract 
  30     information about the video (or videos) the URL refers to. This 
  31     information includes the real video URL, the video title, author and 
  32     others. The information is stored in a dictionary which is then 
  33     passed to the FileDownloader. The FileDownloader processes this 
  34     information possibly downloading the video to the file system, among 
  35     other possible outcomes. 
  37     The dictionaries must include the following fields: 
  41     title:          Video title, unescaped. 
  42     ext:            Video filename extension. 
  44     The following fields are optional: 
  46     format:         The video format, defaults to ext (used for --get-format) 
  47     thumbnail:      Full URL to a video thumbnail image. 
  48     description:    One-line video description. 
  49     uploader:       Full name of the video uploader. 
  50     upload_date:    Video upload date (YYYYMMDD). 
  51     uploader_id:    Nickname or id of the video uploader. 
  52     location:       Physical location of the video. 
  53     player_url:     SWF Player URL (used for rtmpdump). 
  54     subtitles:      The subtitle file contents. 
  55     urlhandle:      [internal] The urlHandle to be used to download the file, 
  56                     like returned by urllib.request.urlopen 
  58     The fields should all be Unicode strings. 
  60     Subclasses of this one should re-define the _real_initialize() and 
  61     _real_extract() methods and define a _VALID_URL regexp. 
  62     Probably, they should also be added to the list of extractors. 
  64     _real_extract() must return a *list* of information dictionaries as 
  67     Finally, the _WORKING attribute should be set to False for broken IEs 
  68     in order to warn the users and skip the tests. 
  75     def __init__(self
, downloader
=None): 
  76         """Constructor. Receives an optional downloader.""" 
  78         self
.set_downloader(downloader
) 
  81     def suitable(cls
, url
): 
  82         """Receives a URL and returns True if suitable for this IE.""" 
  83         return re
.match(cls
._VALID
_URL
, url
) is not None 
  87         """Getter method for _WORKING.""" 
  91         """Initializes an instance (authentication, etc).""" 
  93             self
._real
_initialize
() 
  96     def extract(self
, url
): 
  97         """Extracts URL information and returns it in list of dicts.""" 
  99         return self
._real
_extract
(url
) 
 101     def set_downloader(self
, downloader
): 
 102         """Sets the downloader for this IE.""" 
 103         self
._downloader 
= downloader
 
 105     def _real_initialize(self
): 
 106         """Real initialization process. Redefine in subclasses.""" 
 109     def _real_extract(self
, url
): 
 110         """Real extraction process. Redefine in subclasses.""" 
 115         return type(self
).__name
__[:-2] 
 117     def _request_webpage(self
, url_or_request
, video_id
, note
=None, errnote
=None): 
 118         """ Returns the response handle """ 
 120             self
.report_download_webpage(video_id
) 
 121         elif note 
is not False: 
 122             self
.to_screen(u
'%s: %s' % (video_id
, note
)) 
 124             return compat_urllib_request
.urlopen(url_or_request
) 
 125         except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
: 
 127                 errnote 
= u
'Unable to download webpage' 
 128             raise ExtractorError(u
'%s: %s' % (errnote
, compat_str(err
)), sys
.exc_info()[2]) 
 130     def _download_webpage_handle(self
, url_or_request
, video_id
, note
=None, errnote
=None): 
 131         """ Returns a tuple (page content as string, URL handle) """ 
 132         urlh 
= self
._request
_webpage
(url_or_request
, video_id
, note
, errnote
) 
 133         content_type 
= urlh
.headers
.get('Content-Type', '') 
 134         m 
= re
.match(r
'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type
) 
 136             encoding 
= m
.group(1) 
 139         webpage_bytes 
= urlh
.read() 
 140         if self
._downloader
.params
.get('dump_intermediate_pages', False): 
 142                 url 
= url_or_request
.get_full_url() 
 143             except AttributeError: 
 145             self
.to_screen(u
'Dumping request to ' + url
) 
 146             dump 
= base64
.b64encode(webpage_bytes
).decode('ascii') 
 147             self
._downloader
.to_screen(dump
) 
 148         content 
= webpage_bytes
.decode(encoding
, 'replace') 
 149         return (content
, urlh
) 
 151     def _download_webpage(self
, url_or_request
, video_id
, note
=None, errnote
=None): 
 152         """ Returns the data of the page as a string """ 
 153         return self
._download
_webpage
_handle
(url_or_request
, video_id
, note
, errnote
)[0] 
 155     def to_screen(self
, msg
): 
 156         """Print msg to screen, prefixing it with '[ie_name]'""" 
 157         self
._downloader
.to_screen(u
'[%s] %s' % (self
.IE_NAME
, msg
)) 
 159     def report_extraction(self
, id_or_name
): 
 160         """Report information extraction.""" 
 161         self
.to_screen(u
'%s: Extracting information' % id_or_name
) 
 163     def report_download_webpage(self
, video_id
): 
 164         """Report webpage download.""" 
 165         self
.to_screen(u
'%s: Downloading webpage' % video_id
) 
 167     def report_age_confirmation(self
): 
 168         """Report attempt to confirm age.""" 
 169         self
.to_screen(u
'Confirming age') 
 171     #Methods for following #608 
 172     #They set the correct value of the '_type' key 
 173     def video_result(self
, video_info
): 
 174         """Returns a video""" 
 175         video_info
['_type'] = 'video' 
 177     def url_result(self
, url
, ie
=None): 
 178         """Returns a url that points to a page that should be processed""" 
 179         #TODO: ie should be the class used for getting the info 
 180         video_info 
= {'_type': 'url', 
 184     def playlist_result(self
, entries
, playlist_id
=None, playlist_title
=None): 
 185         """Returns a playlist""" 
 186         video_info 
= {'_type': 'playlist', 
 189             video_info
['id'] = playlist_id
 
 191             video_info
['title'] = playlist_title
 
 194 class SearchInfoExtractor(InfoExtractor
): 
 196     Base class for paged search queries extractors. 
 197     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query} 
 198     Instances should define _SEARCH_KEY and _MAX_RESULTS. 
 202     def _make_valid_url(cls
): 
 203         return r
'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls
._SEARCH
_KEY
 
 206     def suitable(cls
, url
): 
 207         return re
.match(cls
._make
_valid
_url
(), url
) is not None 
 209     def _real_extract(self
, query
): 
 210         mobj 
= re
.match(self
._make
_valid
_url
(), query
) 
 212             raise ExtractorError(u
'Invalid search query "%s"' % query
) 
 214         prefix 
= mobj
.group('prefix') 
 215         query 
= mobj
.group('query') 
 217             return self
._get
_n
_results
(query
, 1) 
 218         elif prefix 
== 'all': 
 219             return self
._get
_n
_results
(query
, self
._MAX
_RESULTS
) 
 223                 raise ExtractorError(u
'invalid download number %s for query "%s"' % (n
, query
)) 
 224             elif n 
> self
._MAX
_RESULTS
: 
 225                 self
._downloader
.report_warning(u
'%s returns max %i results (you requested %i)' % (self
._SEARCH
_KEY
, self
._MAX
_RESULTS
, n
)) 
 226                 n 
= self
._MAX
_RESULTS
 
 227             return self
._get
_n
_results
(query
, n
) 
 229     def _get_n_results(self
, query
, n
): 
 230         """Get a specified number of results for a query""" 
 231         raise NotImplementedError("This method must be implemented by sublclasses") 
 234 class YoutubeIE(InfoExtractor
): 
 235     """Information extractor for youtube.com.""" 
 239                          (?:https?://)?                                       # http(s):// (optional) 
 240                          (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/| 
 241                             tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains 
 242                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls 
 243                          (?:                                                  # the various things that can precede the ID: 
 244                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/ 
 245                              |(?:                                             # or the v= param in all its forms 
 246                                  (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx) 
 247                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #! 
 248                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx) 
 251                          )?                                                   # optional -> youtube.com/xxxx is OK 
 252                      )?                                                       # all until now is optional -> you can pass the naked ID 
 253                      ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID 
 254                      (?(1).+)?                                                # if we found the ID, everything can follow 
 256     _LANG_URL 
= r
'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' 
 257     _LOGIN_URL 
= 'https://accounts.google.com/ServiceLogin' 
 258     _AGE_URL 
= 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' 
 259     _NEXT_URL_RE 
= r
'[\?&]next_url=([^&]+)' 
 260     _NETRC_MACHINE 
= 'youtube' 
 261     # Listed in order of quality 
 262     _available_formats 
= ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13'] 
 263     _available_formats_prefer_free 
= ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13'] 
 264     _video_extensions 
= { 
 270         '38': 'video', # You actually don't know if this will be MOV, AVI or whatever 
 276     _video_dimensions 
= { 
 295     def suitable(cls
, url
): 
 296         """Receives a URL and returns True if suitable for this IE.""" 
 297         if YoutubePlaylistIE
.suitable(url
): return False 
 298         return re
.match(cls
._VALID
_URL
, url
, re
.VERBOSE
) is not None 
 300     def report_lang(self
): 
 301         """Report attempt to set language.""" 
 302         self
.to_screen(u
'Setting language') 
 304     def report_login(self
): 
 305         """Report attempt to log in.""" 
 306         self
.to_screen(u
'Logging in') 
 308     def report_video_webpage_download(self
, video_id
): 
 309         """Report attempt to download video webpage.""" 
 310         self
.to_screen(u
'%s: Downloading video webpage' % video_id
) 
 312     def report_video_info_webpage_download(self
, video_id
): 
 313         """Report attempt to download video info webpage.""" 
 314         self
.to_screen(u
'%s: Downloading video info webpage' % video_id
) 
    def report_video_subtitles_download(self, video_id):
        """Report that the list of available subtitles is being checked."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)
    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download subtitles in the given language and format."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
 324     def report_video_subtitles_available(self
, video_id
, sub_lang_list
): 
 325         """Report available subtitles.""" 
 326         sub_lang 
= ",".join(list(sub_lang_list
.keys())) 
 327         self
.to_screen(u
'%s: Available subtitles for video: %s' % (video_id
, sub_lang
)) 
 329     def report_information_extraction(self
, video_id
): 
 330         """Report attempt to extract video information.""" 
 331         self
.to_screen(u
'%s: Extracting video information' % video_id
) 
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))
 337     def report_rtmp_download(self
): 
 338         """Indicate the download will use the RTMP protocol.""" 
 339         self
.to_screen(u
'RTMP download detected') 
 341     def _get_available_subtitles(self
, video_id
): 
 342         self
.report_video_subtitles_download(video_id
) 
 343         request 
= compat_urllib_request
.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
) 
 345             sub_list 
= compat_urllib_request
.urlopen(request
).read().decode('utf-8') 
 346         except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
: 
 347             return (u
'unable to download video subtitles: %s' % compat_str(err
), None) 
 348         sub_lang_list 
= re
.findall(r
'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list
) 
 349         sub_lang_list 
= dict((l
[1], l
[0]) for l 
in sub_lang_list
) 
 350         if not sub_lang_list
: 
 351             return (u
'video doesn\'t have subtitles', None) 
 354     def _list_available_subtitles(self
, video_id
): 
 355         sub_lang_list 
= self
._get
_available
_subtitles
(video_id
) 
 356         self
.report_video_subtitles_available(video_id
, sub_lang_list
) 
 358     def _request_subtitle(self
, sub_lang
, sub_name
, video_id
, format
): 
 361         (error_message, sub_lang, sub) 
 363         self
.report_video_subtitles_request(video_id
, sub_lang
, format
) 
 364         params 
= compat_urllib_parse
.urlencode({ 
 370         url 
= 'http://www.youtube.com/api/timedtext?' + params
 
 372             sub 
= compat_urllib_request
.urlopen(url
).read().decode('utf-8') 
 373         except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
: 
 374             return (u
'unable to download video subtitles: %s' % compat_str(err
), None, None) 
 376             return (u
'Did not fetch video subtitles', None, None) 
 377         return (None, sub_lang
, sub
) 
 379     def _extract_subtitle(self
, video_id
): 
 381         Return a list with a tuple: 
 382         [(error_message, sub_lang, sub)] 
 384         sub_lang_list 
= self
._get
_available
_subtitles
(video_id
) 
 385         sub_format 
= self
._downloader
.params
.get('subtitlesformat') 
 386         if  isinstance(sub_lang_list
,tuple): #There was some error, it didn't get the available subtitles 
 387             return [(sub_lang_list
[0], None, None)] 
 388         if self
._downloader
.params
.get('subtitleslang', False): 
 389             sub_lang 
= self
._downloader
.params
.get('subtitleslang') 
 390         elif 'en' in sub_lang_list
: 
 393             sub_lang 
= list(sub_lang_list
.keys())[0] 
 394         if not sub_lang 
in sub_lang_list
: 
 395             return [(u
'no closed captions found in the specified language "%s"' % sub_lang
, None, None)] 
 397         subtitle 
= self
._request
_subtitle
(sub_lang
, sub_lang_list
[sub_lang
].encode('utf-8'), video_id
, sub_format
) 
 400     def _extract_all_subtitles(self
, video_id
): 
 401         sub_lang_list 
= self
._get
_available
_subtitles
(video_id
) 
 402         sub_format 
= self
._downloader
.params
.get('subtitlesformat') 
 403         if  isinstance(sub_lang_list
,tuple): #There was some error, it didn't get the available subtitles 
 404             return [(sub_lang_list
[0], None, None)] 
 406         for sub_lang 
in sub_lang_list
: 
 407             subtitle 
= self
._request
_subtitle
(sub_lang
, sub_lang_list
[sub_lang
].encode('utf-8'), video_id
, sub_format
) 
 408             subtitles
.append(subtitle
) 
 411     def _print_formats(self
, formats
): 
 412         print('Available formats:') 
 414             print('%s\t:\t%s\t[%s]' %(x
, self
._video
_extensions
.get(x
, 'flv'), self
._video
_dimensions
.get(x
, '???'))) 
 416     def _real_initialize(self
): 
 417         if self
._downloader 
is None: 
 422         downloader_params 
= self
._downloader
.params
 
 424         # Attempt to use provided username and password or .netrc data 
 425         if downloader_params
.get('username', None) is not None: 
 426             username 
= downloader_params
['username'] 
 427             password 
= downloader_params
['password'] 
 428         elif downloader_params
.get('usenetrc', False): 
 430                 info 
= netrc
.netrc().authenticators(self
._NETRC
_MACHINE
) 
 435                     raise netrc
.NetrcParseError('No authenticators for %s' % self
._NETRC
_MACHINE
) 
 436             except (IOError, netrc
.NetrcParseError
) as err
: 
 437                 self
._downloader
.report_warning(u
'parsing .netrc: %s' % compat_str(err
)) 
 441         request 
= compat_urllib_request
.Request(self
._LANG
_URL
) 
 444             compat_urllib_request
.urlopen(request
).read() 
 445         except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
: 
 446             self
._downloader
.report_warning(u
'unable to set language: %s' % compat_str(err
)) 
 449         # No authentication to be performed 
 453         request 
= compat_urllib_request
.Request(self
._LOGIN
_URL
) 
 455             login_page 
= compat_urllib_request
.urlopen(request
).read().decode('utf-8') 
 456         except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
: 
 457             self
._downloader
.report_warning(u
'unable to fetch login page: %s' % compat_str(err
)) 
 462         match 
= re
.search(re
.compile(r
'<input.+?name="GALX".+?value="(.+?)"', re
.DOTALL
), login_page
) 
 464           galx 
= match
.group(1) 
 466         match 
= re
.search(re
.compile(r
'<input.+?name="dsh".+?value="(.+?)"', re
.DOTALL
), login_page
) 
 472                 u
'continue': u
'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1', 
 476                 u
'PersistentCookie': u
'yes', 
 478                 u
'bgresponse': u
'js_disabled', 
 479                 u
'checkConnection': u
'', 
 480                 u
'checkedDomains': u
'youtube', 
 486                 u
'signIn': u
'Sign in', 
 488                 u
'service': u
'youtube', 
 492         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode 
 494         login_form 
= dict((k
.encode('utf-8'), v
.encode('utf-8')) for k
,v 
in login_form_strs
.items()) 
 495         login_data 
= compat_urllib_parse
.urlencode(login_form
).encode('ascii') 
 496         request 
= compat_urllib_request
.Request(self
._LOGIN
_URL
, login_data
) 
 499             login_results 
= compat_urllib_request
.urlopen(request
).read().decode('utf-8') 
 500             if re
.search(r
'(?i)<form[^>]* id="gaia_loginform"', login_results
) is not None: 
 501                 self
._downloader
.report_warning(u
'unable to log in: bad username or password') 
 503         except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
: 
 504             self
._downloader
.report_warning(u
'unable to log in: %s' % compat_str(err
)) 
 510                 'action_confirm':   'Confirm', 
 512         request 
= compat_urllib_request
.Request(self
._AGE
_URL
, compat_urllib_parse
.urlencode(age_form
)) 
 514             self
.report_age_confirmation() 
 515             age_results 
= compat_urllib_request
.urlopen(request
).read().decode('utf-8') 
 516         except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
: 
 517             raise ExtractorError(u
'Unable to confirm age: %s' % compat_str(err
)) 
 519     def _extract_id(self
, url
): 
 520         mobj 
= re
.match(self
._VALID
_URL
, url
, re
.VERBOSE
) 
 522             raise ExtractorError(u
'Invalid URL: %s' % url
) 
 523         video_id 
= mobj
.group(2) 
 526     def _real_extract(self
, url
): 
 527         # Extract original video URL from URL with redirection, like age verification, using next_url parameter 
 528         mobj 
= re
.search(self
._NEXT
_URL
_RE
, url
) 
 530             url 
= 'https://www.youtube.com/' + compat_urllib_parse
.unquote(mobj
.group(1)).lstrip('/') 
 531         video_id 
= self
._extract
_id
(url
) 
 534         self
.report_video_webpage_download(video_id
) 
 535         url 
= 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
 
 536         request 
= compat_urllib_request
.Request(url
) 
 538             video_webpage_bytes 
= compat_urllib_request
.urlopen(request
).read() 
 539         except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
: 
 540             raise ExtractorError(u
'Unable to download video webpage: %s' % compat_str(err
)) 
 542         video_webpage 
= video_webpage_bytes
.decode('utf-8', 'ignore') 
 544         # Attempt to extract SWF player URL 
 545         mobj 
= re
.search(r
'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage
) 
 547             player_url 
= re
.sub(r
'\\(.)', r
'\1', mobj
.group(1)) 
 552         self
.report_video_info_webpage_download(video_id
) 
 553         for el_type 
in ['&el=embedded', '&el=detailpage', '&el=vevo', '']: 
 554             video_info_url 
= ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' 
 555                     % (video_id
, el_type
)) 
 556             video_info_webpage 
= self
._download
_webpage
(video_info_url
, video_id
, 
 558                                     errnote
='unable to download video info webpage') 
 559             video_info 
= compat_parse_qs(video_info_webpage
) 
 560             if 'token' in video_info
: 
 562         if 'token' not in video_info
: 
 563             if 'reason' in video_info
: 
 564                 raise ExtractorError(u
'YouTube said: %s' % video_info
['reason'][0]) 
 566                 raise ExtractorError(u
'"token" parameter not in video info for unknown reason') 
 568         # Check for "rental" videos 
 569         if 'ypc_video_rental_bar_text' in video_info 
and 'author' not in video_info
: 
 570             raise ExtractorError(u
'"rental" videos not supported') 
 572         # Start extracting information 
 573         self
.report_information_extraction(video_id
) 
 576         if 'author' not in video_info
: 
 577             raise ExtractorError(u
'Unable to extract uploader name') 
 578         video_uploader 
= compat_urllib_parse
.unquote_plus(video_info
['author'][0]) 
 581         video_uploader_id 
= None 
 582         mobj 
= re
.search(r
'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage
) 
 584             video_uploader_id 
= mobj
.group(1) 
 586             self
._downloader
.report_warning(u
'unable to extract uploader nickname') 
 589         if 'title' not in video_info
: 
 590             raise ExtractorError(u
'Unable to extract video title') 
 591         video_title 
= compat_urllib_parse
.unquote_plus(video_info
['title'][0]) 
 594         if 'thumbnail_url' not in video_info
: 
 595             self
._downloader
.report_warning(u
'unable to extract video thumbnail') 
 597         else:   # don't panic if we can't find it 
 598             video_thumbnail 
= compat_urllib_parse
.unquote_plus(video_info
['thumbnail_url'][0]) 
 602         mobj 
= re
.search(r
'id="eow-date.*?>(.*?)</span>', video_webpage
, re
.DOTALL
) 
 604             upload_date 
= ' '.join(re
.sub(r
'[/,-]', r
' ', mobj
.group(1)).split()) 
 605             upload_date 
= unified_strdate(upload_date
) 
 608         video_description 
= get_element_by_id("eow-description", video_webpage
) 
 609         if video_description
: 
 610             video_description 
= clean_html(video_description
) 
 612             fd_mobj 
= re
.search(r
'<meta name="description" content="([^"]+)"', video_webpage
) 
 614                 video_description 
= unescapeHTML(fd_mobj
.group(1)) 
 616                 video_description 
= u
'' 
 619         video_subtitles 
= None 
 621         if self
._downloader
.params
.get('writesubtitles', False): 
 622             video_subtitles 
= self
._extract
_subtitle
(video_id
) 
 624                 (sub_error
, sub_lang
, sub
) = video_subtitles
[0] 
 626                     self
._downloader
.report_error(sub_error
) 
 628         if self
._downloader
.params
.get('allsubtitles', False): 
 629             video_subtitles 
= self
._extract
_all
_subtitles
(video_id
) 
 630             for video_subtitle 
in video_subtitles
: 
 631                 (sub_error
, sub_lang
, sub
) = video_subtitle
 
 633                     self
._downloader
.report_error(sub_error
) 
 635         if self
._downloader
.params
.get('listsubtitles', False): 
 636             sub_lang_list 
= self
._list
_available
_subtitles
(video_id
) 
 639         if 'length_seconds' not in video_info
: 
 640             self
._downloader
.report_warning(u
'unable to extract video duration') 
 643             video_duration 
= compat_urllib_parse
.unquote_plus(video_info
['length_seconds'][0]) 
 646         video_token 
= compat_urllib_parse
.unquote_plus(video_info
['token'][0]) 
 648         # Decide which formats to download 
 649         req_format 
= self
._downloader
.params
.get('format', None) 
 651         if 'conn' in video_info 
and video_info
['conn'][0].startswith('rtmp'): 
 652             self
.report_rtmp_download() 
 653             video_url_list 
= [(None, video_info
['conn'][0])] 
 654         elif 'url_encoded_fmt_stream_map' in video_info 
and len(video_info
['url_encoded_fmt_stream_map']) >= 1: 
 656             for url_data_str 
in video_info
['url_encoded_fmt_stream_map'][0].split(','): 
 657                 url_data 
= compat_parse_qs(url_data_str
) 
 658                 if 'itag' in url_data 
and 'url' in url_data
: 
 659                     url 
= url_data
['url'][0] + '&signature=' + url_data
['sig'][0] 
 660                     if not 'ratebypass' in url
: url 
+= '&ratebypass=yes' 
 661                     url_map
[url_data
['itag'][0]] = url
 
 663             format_limit 
= self
._downloader
.params
.get('format_limit', None) 
 664             available_formats 
= self
._available
_formats
_prefer
_free 
if self
._downloader
.params
.get('prefer_free_formats', False) else self
._available
_formats
 
 665             if format_limit 
is not None and format_limit 
in available_formats
: 
 666                 format_list 
= available_formats
[available_formats
.index(format_limit
):] 
 668                 format_list 
= available_formats
 
 669             existing_formats 
= [x 
for x 
in format_list 
if x 
in url_map
] 
 670             if len(existing_formats
) == 0: 
 671                 raise ExtractorError(u
'no known formats available for video') 
 672             if self
._downloader
.params
.get('listformats', None): 
 673                 self
._print
_formats
(existing_formats
) 
 675             if req_format 
is None or req_format 
== 'best': 
 676                 video_url_list 
= [(existing_formats
[0], url_map
[existing_formats
[0]])] # Best quality 
 677             elif req_format 
== 'worst': 
 678                 video_url_list 
= [(existing_formats
[len(existing_formats
)-1], url_map
[existing_formats
[len(existing_formats
)-1]])] # worst quality 
 679             elif req_format 
in ('-1', 'all'): 
 680                 video_url_list 
= [(f
, url_map
[f
]) for f 
in existing_formats
] # All formats 
 682                 # Specific formats. We pick the first in a slash-delimeted sequence. 
 683                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'. 
 684                 req_formats 
= req_format
.split('/') 
 685                 video_url_list 
= None 
 686                 for rf 
in req_formats
: 
 688                         video_url_list 
= [(rf
, url_map
[rf
])] 
 690                 if video_url_list 
is None: 
 691                     raise ExtractorError(u
'requested format not available') 
 693             raise ExtractorError(u
'no conn or url_encoded_fmt_stream_map information found in video info') 
 696         for format_param
, video_real_url 
in video_url_list
: 
 698             video_extension 
= self
._video
_extensions
.get(format_param
, 'flv') 
 700             video_format 
= '{0} - {1}'.format(format_param 
if format_param 
else video_extension
, 
 701                                               self
._video
_dimensions
.get(format_param
, '???')) 
 705                 'url':      video_real_url
, 
 706                 'uploader': video_uploader
, 
 707                 'uploader_id': video_uploader_id
, 
 708                 'upload_date':  upload_date
, 
 709                 'title':    video_title
, 
 710                 'ext':      video_extension
, 
 711                 'format':   video_format
, 
 712                 'thumbnail':    video_thumbnail
, 
 713                 'description':  video_description
, 
 714                 'player_url':   player_url
, 
 715                 'subtitles':    video_subtitles
, 
 716                 'duration':     video_duration
 
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Captures the video id (group 1) and simplified title (group 2).
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Family-filter disclaimer page fetched during initialization.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    # Endpoint that receives the age-confirmation form POST.
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'
 729     def report_disclaimer(self
): 
 730         """Report disclaimer retrieval.""" 
 731         self
.to_screen(u
'Retrieving disclaimer') 
 733     def _real_initialize(self
): 
 734         # Retrieve disclaimer 
 735         request 
= compat_urllib_request
.Request(self
._DISCLAIMER
) 
 737             self
.report_disclaimer() 
 738             disclaimer 
= compat_urllib_request
.urlopen(request
).read() 
 739         except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
: 
 740             raise ExtractorError(u
'Unable to retrieve disclaimer: %s' % compat_str(err
)) 
 745             'submit': "Continue - I'm over 18", 
 747         request 
= compat_urllib_request
.Request(self
._FILTER
_POST
, compat_urllib_parse
.urlencode(disclaimer_form
)) 
 749             self
.report_age_confirmation() 
 750             disclaimer 
= compat_urllib_request
.urlopen(request
).read() 
 751         except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
: 
 752             raise ExtractorError(u
'Unable to confirm age: %s' % compat_str(err
)) 
 754     def _real_extract(self
, url
): 
 755         # Extract id and simplified title from URL 
 756         mobj 
= re
.match(self
._VALID
_URL
, url
) 
 758             raise ExtractorError(u
'Invalid URL: %s' % url
) 
 760         video_id 
= mobj
.group(1) 
 762         # Check if video comes from YouTube 
 763         mobj2 
= re
.match(r
'^yt-(.*)$', video_id
) 
 764         if mobj2 
is not None: 
 765             return [self
.url_result('http://www.youtube.com/watch?v=%s' % mobj2
.group(1), 'Youtube')] 
 767         # Retrieve video webpage to extract further information 
 768         webpage 
= self
._download
_webpage
('http://www.metacafe.com/watch/%s/' % video_id
, video_id
) 
 770         # Extract URL, uploader and title from webpage 
 771         self
.report_extraction(video_id
) 
 772         mobj 
= re
.search(r
'(?m)&mediaURL=([^&]+)', webpage
) 
 774             mediaURL 
= compat_urllib_parse
.unquote(mobj
.group(1)) 
 775             video_extension 
= mediaURL
[-3:] 
 777             # Extract gdaKey if available 
 778             mobj 
= re
.search(r
'(?m)&gdaKey=(.*?)&', webpage
) 
 782                 gdaKey 
= mobj
.group(1) 
 783                 video_url 
= '%s?__gda__=%s' % (mediaURL
, gdaKey
) 
 785             mobj 
= re
.search(r
' name="flashvars" value="(.*?)"', webpage
) 
 787                 raise ExtractorError(u
'Unable to extract media URL') 
 788             vardict 
= compat_parse_qs(mobj
.group(1)) 
 789             if 'mediaData' not in vardict
: 
 790                 raise ExtractorError(u
'Unable to extract media URL') 
 791             mobj 
= re
.search(r
'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict
['mediaData'][0]) 
 793                 raise ExtractorError(u
'Unable to extract media URL') 
 794             mediaURL 
= mobj
.group('mediaURL').replace('\\/', '/') 
 795             video_extension 
= mediaURL
[-3:] 
 796             video_url 
= '%s?__gda__=%s' % (mediaURL
, mobj
.group('key')) 
 798         mobj 
= re
.search(r
'(?im)<title>(.*) - Video</title>', webpage
) 
 800             raise ExtractorError(u
'Unable to extract title') 
 801         video_title 
= mobj
.group(1).decode('utf-8') 
 803         mobj 
= re
.search(r
'submitter=(.*?);', webpage
) 
 805             raise ExtractorError(u
'Unable to extract uploader nickname') 
 806         video_uploader 
= mobj
.group(1) 
 809             'id':       video_id
.decode('utf-8'), 
 810             'url':      video_url
.decode('utf-8'), 
 811             'uploader': video_uploader
.decode('utf-8'), 
 813             'title':    video_title
, 
 814             'ext':      video_extension
.decode('utf-8'), 
 817 class DailymotionIE(InfoExtractor
): 
 818     """Information Extractor for Dailymotion""" 
 820     _VALID_URL 
= r
'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)' 
 821     IE_NAME 
= u
'dailymotion' 
 823     def _real_extract(self
, url
): 
 824         # Extract id and simplified title from URL 
 825         mobj 
= re
.match(self
._VALID
_URL
, url
) 
 827             raise ExtractorError(u
'Invalid URL: %s' % url
) 
 829         video_id 
= mobj
.group(1).split('_')[0].split('?')[0] 
 831         video_extension 
= 'mp4' 
 833         # Retrieve video webpage to extract further information 
 834         request 
= compat_urllib_request
.Request(url
) 
 835         request
.add_header('Cookie', 'family_filter=off') 
 836         webpage 
= self
._download
_webpage
(request
, video_id
) 
 838         # Extract URL, uploader and title from webpage 
 839         self
.report_extraction(video_id
) 
 840         mobj 
= re
.search(r
'\s*var flashvars = (.*)', webpage
) 
 842             raise ExtractorError(u
'Unable to extract media URL') 
 843         flashvars 
= compat_urllib_parse
.unquote(mobj
.group(1)) 
 845         for key 
in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']: 
 848                 self
.to_screen(u
'Using %s' % key
) 
 851             raise ExtractorError(u
'Unable to extract video URL') 
 853         mobj 
= re
.search(r
'"' + max_quality 
+ r
'":"(.+?)"', flashvars
) 
 855             raise ExtractorError(u
'Unable to extract video URL') 
 857         video_url 
= compat_urllib_parse
.unquote(mobj
.group(1)).replace('\\/', '/') 
 859         # TODO: support choosing qualities 
 861         mobj 
= re
.search(r
'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage
) 
 863             raise ExtractorError(u
'Unable to extract title') 
 864         video_title 
= unescapeHTML(mobj
.group('title')) 
 866         video_uploader 
= None 
 867         mobj 
= re
.search(r
'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage
) 
 869             # lookin for official user 
 870             mobj_official 
= re
.search(r
'<span rel="author"[^>]+?>([^<]+?)</span>', webpage
) 
 871             if mobj_official 
is None: 
 872                 self
._downloader
.report_warning(u
'unable to extract uploader nickname') 
 874                 video_uploader 
= mobj_official
.group(1) 
 876             video_uploader 
= mobj
.group(1) 
 878         video_upload_date 
= None 
 879         mobj 
= re
.search(r
'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage
) 
 881             video_upload_date 
= mobj
.group(3) + mobj
.group(2) + mobj
.group(1) 
 886             'uploader': video_uploader
, 
 887             'upload_date':  video_upload_date
, 
 888             'title':    video_title
, 
 889             'ext':      video_extension
, 
 893 class PhotobucketIE(InfoExtractor
): 
 894     """Information extractor for photobucket.com.""" 
 896     # TODO: the original _VALID_URL was: 
 897     # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)' 
 898     # Check if it's necessary to keep the old extracion process 
 899     _VALID_URL 
= r
'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))' 
 900     IE_NAME 
= u
'photobucket' 
 902     def _real_extract(self
, url
): 
 903         # Extract id from URL 
 904         mobj 
= re
.match(self
._VALID
_URL
, url
) 
 906             raise ExtractorError(u
'Invalid URL: %s' % url
) 
 908         video_id 
= mobj
.group('id') 
 910         video_extension 
= mobj
.group('ext') 
 912         # Retrieve video webpage to extract further information 
 913         webpage 
= self
._download
_webpage
(url
, video_id
) 
 915         # Extract URL, uploader, and title from webpage 
 916         self
.report_extraction(video_id
) 
 917         # We try first by looking the javascript code: 
 918         mobj 
= re
.search(r
'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage
) 
 920             info 
= json
.loads(mobj
.group('json')) 
 923                 'url':      info
[u
'downloadUrl'], 
 924                 'uploader': info
[u
'username'], 
 925                 'upload_date':  datetime
.date
.fromtimestamp(info
[u
'creationDate']).strftime('%Y%m%d'), 
 926                 'title':    info
[u
'title'], 
 927                 'ext':      video_extension
, 
 928                 'thumbnail': info
[u
'thumbUrl'], 
 931         # We try looking in other parts of the webpage 
 932         mobj 
= re
.search(r
'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage
) 
 934             raise ExtractorError(u
'Unable to extract media URL') 
 935         mediaURL 
= compat_urllib_parse
.unquote(mobj
.group(1)) 
 939         mobj 
= re
.search(r
'<title>(.*) video by (.*) - Photobucket</title>', webpage
) 
 941             raise ExtractorError(u
'Unable to extract title') 
 942         video_title 
= mobj
.group(1).decode('utf-8') 
 944         video_uploader 
= mobj
.group(2).decode('utf-8') 
 947             'id':       video_id
.decode('utf-8'), 
 948             'url':      video_url
.decode('utf-8'), 
 949             'uploader': video_uploader
, 
 951             'title':    video_title
, 
 952             'ext':      video_extension
.decode('utf-8'), 
 956 class YahooIE(InfoExtractor
): 
 957     """Information extractor for screen.yahoo.com.""" 
 958     _VALID_URL 
= r
'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html' 
 960     def _real_extract(self
, url
): 
 961         mobj 
= re
.match(self
._VALID
_URL
, url
) 
 963             raise ExtractorError(u
'Invalid URL: %s' % url
) 
 964         video_id 
= mobj
.group('id') 
 965         webpage 
= self
._download
_webpage
(url
, video_id
) 
 966         m_id 
= re
.search(r
'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage
) 
 969             # TODO: Check which url parameters are required 
 970             info_url 
= 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
 
 971             webpage 
= self
._download
_webpage
(info_url
, video_id
, u
'Downloading info webpage') 
 972             info_re 
= r
'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.* 
 973                         <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.* 
 974                         <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.* 
 975                         <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB" 
 977             self
.report_extraction(video_id
) 
 978             m_info 
= re
.search(info_re
, webpage
, re
.VERBOSE|re
.DOTALL
) 
 980                 raise ExtractorError(u
'Unable to extract video info') 
 981             video_title 
= m_info
.group('title') 
 982             video_description 
= m_info
.group('description') 
 983             video_thumb 
= m_info
.group('thumb') 
 984             video_date 
= m_info
.group('date') 
 985             video_date 
= datetime
.datetime
.strptime(video_date
, '%m/%d/%Y').strftime('%Y%m%d') 
 987             # TODO: Find a way to get mp4 videos 
 988             rest_url 
= 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
 
 989             webpage 
= self
._download
_webpage
(rest_url
, video_id
, u
'Downloading video url webpage') 
 990             m_rest 
= re
.search(r
'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage
) 
 991             video_url 
= m_rest
.group('url') 
 992             video_path 
= m_rest
.group('path') 
 994                 raise ExtractorError(u
'Unable to extract video url') 
 996         else: # We have to use a different method if another id is defined 
 997             long_id 
= m_id
.group('new_id') 
 998             info_url 
= 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id 
+ '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335' 
 999             webpage 
= self
._download
_webpage
(info_url
, video_id
, u
'Downloading info json') 
1000             json_str 
= re
.search(r
'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage
).group(1) 
1001             info 
= json
.loads(json_str
) 
1002             res 
= info
[u
'query'][u
'results'][u
'mediaObj'][0] 
1003             stream 
= res
[u
'streams'][0] 
1004             video_path 
= stream
[u
'path'] 
1005             video_url 
= stream
[u
'host'] 
1007             video_title 
= meta
[u
'title'] 
1008             video_description 
= meta
[u
'description'] 
1009             video_thumb 
= meta
[u
'thumbnail'] 
1010             video_date 
= None # I can't find it 
1015                      'play_path': video_path
, 
1016                      'title':video_title
, 
1017                      'description': video_description
, 
1018                      'thumbnail': video_thumb
, 
1019                      'upload_date': video_date
, 
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Return the info dictionary for a Vimeo video URL."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Normalize protocol-less and direct-link URLs.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page's player setup.
        # FIX: narrowed the bare `except:` (which also swallowed
        # KeyboardInterrupt/SystemExit) to the exceptions this code can
        # actually raise: IndexError from the split, ValueError from
        # json.loads.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Prefer hd, then sd, then whatever else is available.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download a page, mapping network errors to ExtractorError."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url* and run *regex* over it; collect the groups listed in
        *matchTuples* as ``(group_index, dict_key, error_message)`` triples
        into a dict, raising ExtractorError on any miss."""
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve the rtmp path/player for a live stream page.

        NOTE(review): the computed video_url is built and then discarded —
        this method returns None; callers get no info dict from it.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the Plus7 redirect chain (movie param -> <video> ref ->
        stream XML) and return the info dictionary."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  unified_strdate(info.get('date')),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live pages are recognized by their index-NNN.html suffix.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Content headers do not apply to the redirected HEAD.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Assemble a minimal opener with just the handlers we need.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries."""
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # Page through the GData API, 50 results per request.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API reports the true total; never ask for more than that.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _MAX_RESULTS = 1000
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }

        # Walk the result pages until we have enough entries or the
        # "next page" marker disappears.
        for pagenum in itertools.count(1):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                e = {
                    '_type': 'url',
                    'url': mobj.group(1)
                }
                res['entries'].append(e)

            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                return res
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        # The endpoint returns JSON pages of 30 results each.
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            m = info[u'm']
            results = info[u'results']

            for (i, r) in enumerate(results):
                if (pagenum * 30) +i >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            # Stop when we have n entries or the service says it is done.
            if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
                break

        return res
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # The verbose regex above requires re.VERBOSE, which the default
        # suitable() implementation does not pass.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Keep (position, url) pairs so we can restore playlist order.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    Downloads the first channel page as HTML, then follows the
    JSON-based channel_ajax endpoint for every additional page.
    """

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the unique video ids found in *page*, in page order."""
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        return ids_in_page

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum = pagenum + 1

                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                # The ajax endpoint answers with a JSON envelope whose
                # 'content_html' carries the same markup as the HTML page.
                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # The load-more widget disappears once the last page is reached.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        # Renamed loop variable: the original shadowed the builtin `id`.
        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(u, 'Youtube') for u in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Uses the GData uploads feed, paging by _GDATA_PAGE_SIZE until a
    short page signals the end of the upload list.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            # GData start-index is 1-based.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(u, 'Youtube') for u in urls]
        return [self.playlist_result(url_results, playlist_title=username)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves the numeric users_id from the user page, then pages
    through the mobile episode-list endpoint _PAGE_SIZE at a time.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            raise ExtractorError(u'Unable to extract blip.tv user id')
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(u, 'BlipTV') for u in urls]
        return [self.playlist_result(url_entries, playlist_title=username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = {'gateway_result': '1'}
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        # NOTE(review): the .decode() calls assume the regex groups are
        # bytes (py2-style urlopen().read()); confirm against the compat
        # layer before running under Python 3.
        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in with --username/--password or .netrc data, if provided.

        Login failures are reported as warnings, never raised: extraction
        of public videos must keep working without credentials.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In',
        }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the answer means the credentials were rejected.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters live in a JS blob between these two markers.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if m is None:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD rendition; fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if m is None:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # /play/ URLs redirect to a page whose fragment carries the real
        # file id; resolve it and re-enter extraction with the canonical URL.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves richer JSON to the iTunes user agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title, ext = os.path.splitext(basename)
                # Robustness fix: only decode when the compat layer hands
                # back bytes, so this also works where basename is str.
                if isinstance(title, bytes):
                    title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh,
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError, KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self, data, key):
        """Decrypt *data* (bytes) with RC4 using *key*; returns a str."""
        x = 0
        box = list(range(256))
        # Key-scheduling algorithm (KSA).
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
        x = 0
        y = 0
        out = ''
        # Pseudo-random generation algorithm (PRGA) XOR'd over the data.
        for char in data:
            x = (x + 1) % 256
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
        return out

    def __md5(self, s):
        """Return the hex md5 digest of *s* as bytes."""
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Doubly base64-encoded site key used to derive the RC4 key.
        GK = (
          b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
          b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
          b'TnpsbA0KTVRkbU1tSTRNdz09'
        )

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Simple case: a plain <source> tag with a direct flv URL.
        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            mobj = re.search('<title>([^<]+)</title>', webpage)
            if mobj is None:
                raise ExtractorError(u'Unable to extract title')
            video_title = mobj.group(1)

            mobj = re.search('[.](.+?)$', video_url)
            if mobj is None:
                raise ExtractorError(u'Unable to extract extention')
            video_ext = mobj.group(1)

            return [{
                'id':       video_id,
                'url':      video_url,
                'uploader': None,
                'upload_date':  None,
                'title':    video_title,
                'ext':      u'' + video_ext,
            }]

        # Otherwise the player config is RC4-encrypted XML ("encxml").
        mobj = re.search('var flashvars={(.+?)}', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video')

        params = {}
        encxml = ''
        sec = mobj.group(1)
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                params[a] = b
            else:
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            self._downloader.report_warning(u'avoiding MTV player')
            xmldata_url = (
                'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
            ) % video_id

        # Get enc data; the response has the form "<name>=<hex blob>".
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        sk = self.__md5(
            base64.b64decode(base64.b64decode(GK)) +
            self.__md5(
                str(video_id).encode('utf-8')
            )
        )
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        if mobj is None:
            raise ExtractorError(u'unable to extract rtmpurl')
        video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1))
        if 'myvideo2flash' in video_rtmpurl:
            self._downloader.report_warning(u'forcing RTMPT ...')
            video_rtmpurl = video_rtmpurl.replace('rtmpe://', 'rtmpt://')

        # extract non rtmp videos
        if (video_rtmpurl is None) or (video_rtmpurl == ''):
            mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
            if mobj is None:
                raise ExtractorError(u'unable to extract url')
            video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        mobj = re.search('source=\'(.*?)\'', dec_data)
        if mobj is None:
            raise ExtractorError(u'unable to extract swfobj')
        video_file     = compat_urllib_parse.unquote(mobj.group(1))

        if not video_file.endswith('f4m'):
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
        else:
            video_playpath = ''
            # NOTE(review): the f4m branch referenced video_filepath with
            # no visible assignment (NameError); extraction from the
            # decrypted config reconstructed here - verify regex against
            # live dec_data.
            fmobj = re.search('path=\'(.*?)\'', dec_data)
            if fmobj is None:
                raise ExtractorError(u'unable to extract filepath')
            video_filepath = fmobj.group(1)
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        mobj = re.search('swfobject.embedSWF\\(\'(.+?)\'', webpage)
        if mobj is None:
            raise ExtractorError(u'unable to extract swfobj')
        video_swfobj = compat_urllib_parse.unquote(mobj.group(1))

        mobj = re.search("<h1(?: class='globalHd')?>(.*?)</h1>", webpage)
        if mobj is None:
            raise ExtractorError(u'unable to extract title')
        video_title = mobj.group(1)

        return [{
            'id':                 video_id,
            'url':                video_rtmpurl,
            'tc_url':             video_rtmpurl,
            'uploader':           None,
            'upload_date':        None,
            'title':              video_title,
            'ext':                u'flv',
            'play_path':          video_playpath,
            'video_file':         video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url':         video_swfobj,
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # NOTE(review): concrete table values not visible in the available
    # source - all renditions are served as mp4; confirm dimensions.
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL uses re.VERBOSE, so the default matcher is not enough.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' % (x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Expand the :tds / :colbert abbreviations to a real URL first.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # A bare full-episodes URL means "download the newest episode";
            # the site redirects it to the specific episode page.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage, htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        # One <item> per episode part; each becomes one info dict.
        for partNum, itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            # (renamed local: the original shadowed the builtin `format`)
            video_format, rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f, v in turls:
                if f == req_format:
                    video_format, rtmp_video_url = f, v
                    break

            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if m is None:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': video_format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = self._download_webpage(url, showName)

        # Metadata comes from the OpenGraph <meta> tags of the page.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the (percent-encoded) config URL.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        configJSON = self._download_webpage(configUrl, showName,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }
        return [info]
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Extract video info via the moogaloop XML plus an f4m manifest.

        First fetches the per-video XML for title/description/thumbnail and
        the manifest URL, then fetches the Adobe f4m manifest to compose the
        final fragment URL. Returns a one-element list of info dicts.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        # hdcore parameter is required by the Adobe HTTP Dynamic Streaming server.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # f4m manifests live in the Adobe namespace.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            raise ExtractorError(u'Invalid manifest file')

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        # Compose the first-segment fragment URL from the manifest location.
        # NOTE(review): trailing 'Seg1-Frag1' composition restored from the
        # visible tokens; fragment iteration beyond Seg1-Frag1 is not handled.
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Scrape flv URL, title, and thumbnail from an xvideos page.

        Returns a one-element list of info dicts; raises ExtractorError
        when the URL does not match or any of the three fields is absent.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL (percent-encoded in the flashvars)
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video thumbnail')
        video_thumbnail = mobj.group(0)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }

        return [info]
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve a track via the public SoundCloud API and pick its
        128kbit mp3 stream.

        Returns a one-element list of info dicts.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the canonical track URL to its API JSON representation.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve a set (playlist) via the SoundCloud API and return one
        info dict per track, each using the 128kbit mp3 stream.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        info = json.loads(info_json)
        if 'errors' in info:
            # The API reported failures; surface each one and bail out.
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        videos = []
        self.report_extraction(full_title)
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Decode the base64 ``jsclassref`` on the page into the rtmpe
        stream path, and scrape title/description.

        Returns a one-element list of info dicts.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the real stream id is base64- and percent-encoded.
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # The id and extension come from the last path component of the stream.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'description': video_description,
        }

        return [info]
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        When the format entry maps bitrates to url lists, picks the
        requested (or highest) bitrate; when there is no bitrate level,
        the entry itself is the url list.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none respond."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        """Print the available format/bitrate/extension table to stdout."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Fetch the cloudcast JSON and pick the first working stream URL
        for the requested format (or the best working one).

        Returns a one-element list of info dicts.
        NOTE(review): uses str.decode(...) on regex groups — Python 2 only,
        consistent with _WORKING = False above.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Try each advertised format until one of its urls responds.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Dispatch on the URL shape: a single video, a course page, or the
        site root. Course/root pages are crawled recursively through
        self.extract() on each referenced page; results are concatenated.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect each linked VideoPage once, preserving order.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Collect each linked CoursePage once, preserving order.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Scrape the mtv_* meta tags, fetch the mediaGen XML playlist and
        pick the highest-quality rendition.

        Returns a one-element list of info dicts.
        NOTE(review): .decode('iso-8859-1') on regex groups is Python 2 only.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            # The regex accepts scheme-less URLs; normalize before download.
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract song name')
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract performer')
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract content id')
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            # MIME type like 'video/mp4' -> extension after the slash.
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com (segmented flv downloads)."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Generate a pseudo-random session id: ms timestamp + two randints."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Build the character lookup table for the obfuscated file id.

        A linear-congruential generator seeded by the server-supplied seed
        repeatedly picks (and removes) one character from the source
        alphabet; the resulting permutation is the decode table.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode the '*'-separated obfuscated file id using the mix table."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Fetch the playlist JSON, decode the segment file ids, and return
        one info dict per video segment.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # NOTE(review): only the 'hd2'/'best' and 'worst' branch headers
            # survive in the source; the chosen format/ext values below are
            # restored and should be confirmed against upstream history.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    # Patterns applied to the downloaded page, in order.
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def _real_extract(self, url):
        """Scrape flv URL, title, and thumbnail using the class-level
        regexes. Returns a one-element list of info dicts.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        result = re.search(self.VIDEO_URL_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group(1)

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video thumbnail')
        video_thumbnail = result.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report downloading entry."""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date."""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report uploader."""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report title."""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Two-step extraction: scrape the post page for metadata and the
        photo/video page URL, then scrape that page for the redirector
        links and keep the highest-resolution one.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = None
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video page URL')

        video_page = mobj.group(1)
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
        self.report_extract_vid_page(video_page)

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
class NBAIE(InfoExtractor):
    """Information extractor for nba.com videos (CDN mp4 by path)."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Derive the CDN mp4 URL directly from the page path and scrape
        title/date/description from the page's markup.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The 720p mp4 lives at a predictable CDN path derived from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # Scrape a single group from the page, unescaped; fall back to default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')

        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        """
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # A dict here means the API returned an error object.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time is ISO "YYYY-MM-DD..."; strip dashes -> YYYYMMDD
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Whole-channel archives are paginated.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means we have reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The direct video URL is the second <source> inside the <video> tag.
        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            raise ExtractorError(u'Unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        # Prefer the player page heading; fall back to the <title> tag.
        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not m:
                raise ExtractorError(u'Cannot find video title')
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }]
class SteamIE(InfoExtractor):
    """Information extractor for Steam store video pages (one playlist per game)."""
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')
        # Request via the age-gate URL with a fixed birth date to skip the check.
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)

        # The three regexes enumerate the same movies in page order; zip them.
        videos = []
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
            })
        return [self.playlist_result(videos, gameID, game_title)]
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # Recorded videos are served directly from the CDN by id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)
        try:
            # Any failed search yields None and the .group() call raises
            # AttributeError, caught below as a single extraction failure.
            m = re.search(r'data-title="(?P<title>.+)"', webpage)
            title = m.group('title')
            m = re.search(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
                          webpage, re.DOTALL)
            uploader = unescapeHTML(m.group('uploader').strip())
            m = re.search(r'<link rel="image_src" href="(?P<thumb>.*?)"', webpage)
            thumb = m.group('thumb')
        except AttributeError:
            raise ExtractorError(u'Unable to extract info')
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
            'thumbnail': thumb,
        }
        # FIX: _real_extract must return a *list* of info dictionaries
        # (see the InfoExtractor class docstring).
        return [info]
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        # The flash player receives the file URL via so.addVariable("file", ...).
        _src_url = r'so\.addVariable\("file","(.*?)"\)'

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        webpage_src = self._download_webpage(url, video_id)

        mobj = re.search(_src_url, webpage_src)
        if mobj is not None:
            video_url = mobj.group(1)
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            raise ExtractorError(u'Cannot find video url for %s' % video_id)

        mobj = re.search(r"<title>(.*)</title>", webpage_src)
        if mobj is None:
            raise ExtractorError(u'Cannot determine title')
        title = mobj.group(1)

        mobj = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            thumbnail = None
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com show pages."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        # Show metadata is embedded as a JSON assignment inside a <script> tag.
        m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find metadata')
        json_data = m.group(1)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Append a bitrate parameter to the Akamai stream URL.
        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of formats whose 'format' equals req_format, else None."""
        for x in formats:
            if (x["format"] == req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # The site is age-gated; a cookie bypasses the interstitial.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = unified_strdate(result.group('date').strip())

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html(video_uploader)

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if (len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?...
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # Path element 4 encodes resolution and bitrate, e.g. "480p_370k_8004515".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join(format)
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None,
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            # FIX: test the freshly looked-up `format`, not the stale regex
            # match object `result` left over from the page parsing above.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # FIX: error message said "Unable to extract video title" although
            # this branch fails on the upload date, not the title.
            raise ExtractorError(u'Unable to extract video upload date')
        upload_date = unified_strdate(result.group('date'))

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        # The embed page uses its own numeric id; prefer it from here on.
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flash player receives the file via encodeURIComponent(...).
        result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = result.group('source')

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (one entry per track)."""
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as a JS assignment: PAGE.mix = {...};
        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find trax information')
        json_like = m.group(1)
        data = json.loads(json_like)

        # The play API wants a random per-listener session token.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url

        res = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            res.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
class KeekIE(InfoExtractor):
    """Information extractor for keek.com clips."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # Media and thumbnail both live on the CDN, addressed by id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        title = unescapeHTML(m.group('title'))
        m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(m.group('uploader'))

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }]
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else:
            playlist_id = m.group('playlist_id')
            name = m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id, name))
            return [self._playlist_videos_info(url, name, playlist_id)]

    def _talk_video_link(self, mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self, url, name, playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE = r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE = r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos = re.finditer(video_RE, webpage, re.VERBOSE)
        m_names = re.finditer(video_name_RE, webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        # Delegate each talk to this same extractor via url_result.
        playlist_entries = []
        for m_video, m_name in zip(m_videos, m_names):
            video_id = m_video.group('video_id')
            talk_url = 'http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id=playlist_id, playlist_title=playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        videoName = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE = r'<span id="altHeadline" >(?P<title>.*)</span>'
        title = re.search(title_RE, webpage).group('title')
        info_RE = r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE = r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match = re.search(thumb_RE, webpage)
        info_match = re.search(info_RE, webpage, re.VERBOSE)
        video_id = info_match.group('videoID')
        mediaSlug = info_match.group('mediaSlug')
        video_url = self._talk_video_link(mediaSlug)
        return {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb_match.group('thumbnail')
        }
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de (metadata served as XML)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text

        # Optional fields fall back gracefully.
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            format = 'mp4'
        else:
            format = format_id_el.text

        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None

        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None

        return [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }]
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)
        m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(m.group(1))

        # Per-video XML document describes the available encodings.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last <type> element is the highest quality.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }]
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com view pages."""
    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        # The player config carries the direct file URL.
        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            raise ExtractorError(u'Unable to find video url')
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            raise ExtractorError(u'Cannot find video title')
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }]
class ARDIE(InfoExtractor):
    """Information extractor for the ARD Mediathek."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # Prefer an explicit documentId query parameter over the path element.
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
class TumblrIE(InfoExtractor):
    """Information extractor for videos embedded in Tumblr posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Canonicalize to the post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The embedded player markup is JS-escaped (\x22 == double quote).
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            self.to_screen("No video found")
            return []
        video_url = video.group('video_url')
        ext = video.group('ext')

        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'  # We pick the first poster
        thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        re_title = r'<title>(?P<title>.*?)</title>'
        title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))

        return [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumb,
            'ext': ext
        }]
class BandcampIE(InfoExtractor):
    """Information Extractor for free Bandcamp tracks.

    Only tracks with a free-download page are supported; format selection
    is hard-coded to mp3-320 for now.
    """
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            # Fixed message typo ("founded" -> "found").
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # Renamed from `id`: avoid shadowing the builtin id().
        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE | re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascrip code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (
            m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': track_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist'],
                      }

        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The direct media URL sits in a <source> tag.
        media_match = re.search(r'<source src="(.+)" type="video/mp4">', webpage)
        if media_match is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = media_match.group(1)

        title_match = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = title_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
        }]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        # The MRSS feed for the notice carries both the media URL and title.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        media_match = re.search(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', webpage)
        if media_match is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = media_match.group(1)

        title_match = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = title_match.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
        }]
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # Rebuild a canonical page URL from the numeric id.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        url_match = re.search(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)"', webpage)
        if url_match is None:
            raise ExtractorError(u'Unable to extract video URL')
        video_url = url_match.group(1)

        # Meta tags may use either double or single quotes; accept both.
        title_match = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = title_match.group(1) or title_match.group(2)

        desc_match = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'', webpage)
        if desc_match is None:
            # Description is optional — warn instead of failing.
            self._downloader.report_warning(u'unable to extract description')
            video_description = None
        else:
            video_description = desc_match.group(1) or desc_match.group(2)

        thumb_match = re.search(r'<meta content=\'(.+?)\' property=\'og:image\'', webpage)
        if thumb_match is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        thumbnail = thumb_match.group(1)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
        }]
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # Normalize to the canonical https page before downloading.
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        stream_match = re.search(r'<meta property="twitter:player:stream" content="(.+?)"', webpage)
        if stream_match is None:
            raise ExtractorError(u'Unable to extract video URL')
        video_url = stream_match.group(1)

        title_match = re.search(r'<meta property="og:title" content="(.+?)"', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = title_match.group(1)

        # Strip any query string from the thumbnail URL.
        thumb_match = re.search(r'<meta property="og:image" content="(.+?)(\?.*?)?"', webpage)
        if thumb_match is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        thumbnail = thumb_match.group(1)

        uploader_match = re.search(r'<div class="user">.*?<h2>(.+?)</h2>', webpage, re.DOTALL)
        if uploader_match is None:
            raise ExtractorError(u'Unable to extract uploader')
        uploader = uploader_match.group(1)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The per-photo secret is needed for the playlist API calls below.
        secret_match = re.search(r"photo_secret: '(\w+)'", webpage)
        if secret_match is None:
            raise ExtractorError(u'Unable to extract video secret')
        secret = secret_match.group(1)

        # First API call: resolve the internal node id for this video.
        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_match = re.search(r'<Item id="id">(\d+-\d+)</Item>', first_xml)
        if node_match is None:
            raise ExtractorError(u'Unable to extract node_id')
        node_id = node_match.group(1)

        # Second API call: fetch the playlist holding the stream location.
        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream_match = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream_match is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = stream_match.group(1) + unescapeHTML(stream_match.group(2))

        # Meta tags may use either double or single quotes; accept both.
        title_match = re.search(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = title_match.group(1) or title_match.group(2)

        desc_match = re.search(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
        if desc_match is None:
            # Description is optional — warn instead of failing.
            self._downloader.report_warning(u'unable to extract description')
            video_description = None
        else:
            video_description = desc_match.group(1) or desc_match.group(2)

        thumb_match = re.search(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
        if thumb_match is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        thumbnail = thumb_match.group(1) or thumb_match.group(2)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # NOTE(review): no None-guard here — an AttributeError escapes if the
        # data-id attribute is missing; preserved from the original behavior.
        id_match = re.search(r'<article class="video" data-id="(\d+?)"', webpage)
        video_id = id_match.group(1)

        self.report_extraction(video_id)

        title_match = re.search(r'<meta property="og:title" content="(.+?)"', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = title_match.group(1)

        thumb_match = re.search(r'<meta property="og:image" content="(.+?)"', webpage)
        if thumb_match is None:
            raise ExtractorError(u'Unable to extract thumbnail')
        thumbnail = thumb_match.group(1)

        desc_match = re.search(r'<meta property="og:description" content="(.*?)"', webpage)
        if desc_match is None:
            raise ExtractorError(u'Unable to extract description')
        description = desc_match.group(1)

        # The actual media URL lives in a separate XML document.
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
        url_match = re.search(r'<file type="high".*?>(.*?)</file>', data)
        if url_match is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = url_match.group(1)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'thumbnail':   thumbnail,
            'description': description,
        }]
4390 def gen_extractors(): 
4391     """ Return a list of an instance of every supported extractor. 
4392     The order does matter; the first extractor matched is the one handling the URL. 
4395         YoutubePlaylistIE(), 
4420         StanfordOpenClassroomIE(), 
4430         WorldStarHipHopIE(), 
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the "<Name>IE" naming convention at module level.
    return globals()['%sIE' % ie_name]