Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import
   5
   6 import base64
   7 import datetime
   8 import itertools
   9 import netrc
  10 import os
  11 import re
  12 import socket
  13 import time
  14 import email.utils
  15 import xml.etree.ElementTree
  16 import random
  17 import math
  18 import operator
  19
  20 from .utils import *
  21
  22
  23 class InfoExtractor(object):
  24     """Information Extractor class.
  25
  26     Information extractors are the classes that, given a URL, extract
  27     information about the video (or videos) the URL refers to. This
  28     information includes the real video URL, the video title, author and
  29     others. The information is stored in a dictionary which is then
  30     passed to the FileDownloader. The FileDownloader processes this
  31     information possibly downloading the video to the file system, among
  32     other possible outcomes.
  33
  34     The dictionaries must include the following fields:
  35
  36     id:             Video identifier.
  37     url:            Final video URL.
  38     title:          Video title, unescaped.
  39     ext:            Video filename extension.
  40
  41     The following fields are optional:
  42
  43     format:         The video format, defaults to ext (used for --get-format)
  44     thumbnail:      Full URL to a video thumbnail image.
  45     description:    One-line video description.
  46     uploader:       Full name of the video uploader.
  47     upload_date:    Video upload date (YYYYMMDD).
  48     uploader_id:    Nickname or id of the video uploader.
  49     location:       Physical location of the video.
  50     player_url:     SWF Player URL (used for rtmpdump).
  51     subtitles:      The subtitle file contents.
  52     urlhandle:      [internal] The urlHandle to be used to download the file,
  53                     like returned by urllib.request.urlopen
  54
  55     The fields should all be Unicode strings.
  56
  57     Subclasses of this one should re-define the _real_initialize() and
  58     _real_extract() methods and define a _VALID_URL regexp.
  59     Probably, they should also be added to the list of extractors.
  60
  61     _real_extract() must return a *list* of information dictionaries as
  62     described above.
  63
  64     Finally, the _WORKING attribute should be set to False for broken IEs
  65     in order to warn the users and skip the tests.
  66     """
  67
  68     _ready = False
  69     _downloader = None
  70     _WORKING = True
  71
  72     def __init__(self, downloader=None):
  73         """Constructor. Receives an optional downloader."""
  74         self._ready = False
  75         self.set_downloader(downloader)
  76
  77     @classmethod
  78     def suitable(cls, url):
  79         """Receives a URL and returns True if suitable for this IE."""
  80         return re.match(cls._VALID_URL, url) is not None
  81
  82     @classmethod
  83     def working(cls):
  84         """Getter method for _WORKING."""
  85         return cls._WORKING
  86
  87     def initialize(self):
  88         """Initializes an instance (authentication, etc)."""
  89         if not self._ready:
  90             self._real_initialize()
  91             self._ready = True
  92
  93     def extract(self, url):
  94         """Extracts URL information and returns it in list of dicts."""
  95         self.initialize()
  96         return self._real_extract(url)
  97
  98     def set_downloader(self, downloader):
  99         """Sets the downloader for this IE."""
 100         self._downloader = downloader
 101
 102     def _real_initialize(self):
 103         """Real initialization process. Redefine in subclasses."""
 104         pass
 105
 106     def _real_extract(self, url):
 107         """Real extraction process. Redefine in subclasses."""
 108         pass
 109
 110     @property
 111     def IE_NAME(self):
 112         return type(self).__name__[:-2]
 113
 114     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
 115         """ Returns the response handle """
 116         if note is None:
 117             self.report_download_webpage(video_id)
 118         elif note is not False:
 119             self.to_screen(u'%s: %s' % (video_id, note))
 120         try:
 121             return compat_urllib_request.urlopen(url_or_request)
 122         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 123             if errnote is None:
 124                 errnote = u'Unable to download webpage'
 125             raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
 126
 127     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
 128         """ Returns a tuple (page content as string, URL handle) """
 129         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
 130         content_type = urlh.headers.get('Content-Type', '')
 131         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 132         if m:
 133             encoding = m.group(1)
 134         else:
 135             encoding = 'utf-8'
 136         webpage_bytes = urlh.read()
 137         if self._downloader.params.get('dump_intermediate_pages', False):
 138             try:
 139                 url = url_or_request.get_full_url()
 140             except AttributeError:
 141                 url = url_or_request
 142             self.to_screen(u'Dumping request to ' + url)
 143             dump = base64.b64encode(webpage_bytes).decode('ascii')
 144             self._downloader.to_screen(dump)
 145         content = webpage_bytes.decode(encoding, 'replace')
 146         return (content, urlh)
 147
 148     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
 149         """ Returns the data of the page as a string """
 150         return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
 151
 152     def to_screen(self, msg):
 153         """Print msg to screen, prefixing it with '[ie_name]'"""
 154         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
 155
 156     def report_extraction(self, id_or_name):
 157         """Report information extraction."""
 158         self.to_screen(u'%s: Extracting information' % id_or_name)
 159
 160     def report_download_webpage(self, video_id):
 161         """Report webpage download."""
 162         self.to_screen(u'%s: Downloading webpage' % video_id)
 163
 164     def report_age_confirmation(self):
 165         """Report attempt to confirm age."""
 166         self.to_screen(u'Confirming age')
 167
 168     #Methods for following #608
 169     #They set the correct value of the '_type' key
 170     def video_result(self, video_info):
 171         """Returns a video"""
 172         video_info['_type'] = 'video'
 173         return video_info
 174     def url_result(self, url, ie=None):
 175         """Returns a url that points to a page that should be processed"""
 176         #TODO: ie should be the class used for getting the info
 177         video_info = {'_type': 'url',
 178                       'url': url,
 179                       'ie_key': ie}
 180         return video_info
 181     def playlist_result(self, entries, playlist_id=None, playlist_title=None):
 182         """Returns a playlist"""
 183         video_info = {'_type': 'playlist',
 184                       'entries': entries}
 185         if playlist_id:
 186             video_info['id'] = playlist_id
 187         if playlist_title:
 188             video_info['title'] = playlist_title
 189         return video_info
 190
 191
 192 class YoutubeIE(InfoExtractor):
 193     """Information extractor for youtube.com."""
 194
 195     _VALID_URL = r"""^
 196                      (
 197                          (?:https?://)?                                       # http(s):// (optional)
 198                          (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 199                             tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
 200                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 201                          (?:                                                  # the various things that can precede the ID:
 202                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
 203                              |(?:                                             # or the v= param in all its forms
 204                                  (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 205                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 206                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
 207                                  v=
 208                              )
 209                          )?                                                   # optional -> youtube.com/xxxx is OK
 210                      )?                                                       # all until now is optional -> you can pass the naked ID
 211                      ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
 212                      (?(1).+)?                                                # if we found the ID, everything can follow
 213                      $"""
 214     _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
 215     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
 216     _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
 217     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 218     _NETRC_MACHINE = 'youtube'
 219     # Listed in order of quality
 220     _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
 221     _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
 222     _video_extensions = {
 223         '13': '3gp',
 224         '17': 'mp4',
 225         '18': 'mp4',
 226         '22': 'mp4',
 227         '37': 'mp4',
 228         '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
 229         '43': 'webm',
 230         '44': 'webm',
 231         '45': 'webm',
 232         '46': 'webm',
 233     }
 234     _video_dimensions = {
 235         '5': '240x400',
 236         '6': '???',
 237         '13': '???',
 238         '17': '144x176',
 239         '18': '360x640',
 240         '22': '720x1280',
 241         '34': '360x640',
 242         '35': '480x854',
 243         '37': '1080x1920',
 244         '38': '3072x4096',
 245         '43': '360x640',
 246         '44': '480x854',
 247         '45': '720x1280',
 248         '46': '1080x1920',
 249     }
 250     IE_NAME = u'youtube'
 251
 252     @classmethod
 253     def suitable(cls, url):
 254         """Receives a URL and returns True if suitable for this IE."""
 255         if YoutubePlaylistIE.suitable(url): return False
 256         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
 257
 258     def report_lang(self):
 259         """Report attempt to set language."""
 260         self.to_screen(u'Setting language')
 261
 262     def report_login(self):
 263         """Report attempt to log in."""
 264         self.to_screen(u'Logging in')
 265
 266     def report_video_webpage_download(self, video_id):
 267         """Report attempt to download video webpage."""
 268         self.to_screen(u'%s: Downloading video webpage' % video_id)
 269
 270     def report_video_info_webpage_download(self, video_id):
 271         """Report attempt to download video info webpage."""
 272         self.to_screen(u'%s: Downloading video info webpage' % video_id)
 273
 274     def report_video_subtitles_download(self, video_id):
 275         """Report attempt to download video info webpage."""
 276         self.to_screen(u'%s: Checking available subtitles' % video_id)
 277
 278     def report_video_subtitles_request(self, video_id, sub_lang, format):
 279         """Report attempt to download video info webpage."""
 280         self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
 281
 282     def report_video_subtitles_available(self, video_id, sub_lang_list):
 283         """Report available subtitles."""
 284         sub_lang = ",".join(list(sub_lang_list.keys()))
 285         self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
 286
 287     def report_information_extraction(self, video_id):
 288         """Report attempt to extract video information."""
 289         self.to_screen(u'%s: Extracting video information' % video_id)
 290
 291     def report_unavailable_format(self, video_id, format):
 292         """Report extracted video URL."""
 293         self.to_screen(u'%s: Format %s not available' % (video_id, format))
 294
 295     def report_rtmp_download(self):
 296         """Indicate the download will use the RTMP protocol."""
 297         self.to_screen(u'RTMP download detected')
 298
 299     def _get_available_subtitles(self, video_id):
 300         self.report_video_subtitles_download(video_id)
 301         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
 302         try:
 303             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
 304         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 305             return (u'unable to download video subtitles: %s' % compat_str(err), None)
 306         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
 307         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
 308         if not sub_lang_list:
 309             return (u'video doesn\'t have subtitles', None)
 310         return sub_lang_list
 311
 312     def _list_available_subtitles(self, video_id):
 313         sub_lang_list = self._get_available_subtitles(video_id)
 314         self.report_video_subtitles_available(video_id, sub_lang_list)
 315
 316     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
 317         """
 318         Return tuple:
 319         (error_message, sub_lang, sub)
 320         """
 321         self.report_video_subtitles_request(video_id, sub_lang, format)
 322         params = compat_urllib_parse.urlencode({
 323             'lang': sub_lang,
 324             'name': sub_name,
 325             'v': video_id,
 326             'fmt': format,
 327         })
 328         url = 'http://www.youtube.com/api/timedtext?' + params
 329         try:
 330             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
 331         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 332             return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
 333         if not sub:
 334             return (u'Did not fetch video subtitles', None, None)
 335         return (None, sub_lang, sub)
 336
 337     def _extract_subtitle(self, video_id):
 338         """
 339         Return a list with a tuple:
 340         [(error_message, sub_lang, sub)]
 341         """
 342         sub_lang_list = self._get_available_subtitles(video_id)
 343         sub_format = self._downloader.params.get('subtitlesformat')
 344         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
 345             return [(sub_lang_list[0], None, None)]
 346         if self._downloader.params.get('subtitleslang', False):
 347             sub_lang = self._downloader.params.get('subtitleslang')
 348         elif 'en' in sub_lang_list:
 349             sub_lang = 'en'
 350         else:
 351             sub_lang = list(sub_lang_list.keys())[0]
 352         if not sub_lang in sub_lang_list:
 353             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
 354
 355         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 356         return [subtitle]
 357
 358     def _extract_all_subtitles(self, video_id):
 359         sub_lang_list = self._get_available_subtitles(video_id)
 360         sub_format = self._downloader.params.get('subtitlesformat')
 361         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
 362             return [(sub_lang_list[0], None, None)]
 363         subtitles = []
 364         for sub_lang in sub_lang_list:
 365             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
 366             subtitles.append(subtitle)
 367         return subtitles
 368
 369     def _print_formats(self, formats):
 370         print('Available formats:')
 371         for x in formats:
 372             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
 373
 374     def _real_initialize(self):
 375         if self._downloader is None:
 376             return
 377
 378         username = None
 379         password = None
 380         downloader_params = self._downloader.params
 381
 382         # Attempt to use provided username and password or .netrc data
 383         if downloader_params.get('username', None) is not None:
 384             username = downloader_params['username']
 385             password = downloader_params['password']
 386         elif downloader_params.get('usenetrc', False):
 387             try:
 388                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 389                 if info is not None:
 390                     username = info[0]
 391                     password = info[2]
 392                 else:
 393                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 394             except (IOError, netrc.NetrcParseError) as err:
 395                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 396                 return
 397
 398         # Set language
 399         request = compat_urllib_request.Request(self._LANG_URL)
 400         try:
 401             self.report_lang()
 402             compat_urllib_request.urlopen(request).read()
 403         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 404             self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
 405             return
 406
 407         # No authentication to be performed
 408         if username is None:
 409             return
 410
 411         request = compat_urllib_request.Request(self._LOGIN_URL)
 412         try:
 413             login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
 414         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 415             self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
 416             return
 417
 418         galx = None
 419         dsh = None
 420         match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
 421         if match:
 422           galx = match.group(1)
 423
 424         match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
 425         if match:
 426           dsh = match.group(1)
 427
 428         # Log in
 429         login_form_strs = {
 430                 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
 431                 u'Email': username,
 432                 u'GALX': galx,
 433                 u'Passwd': password,
 434                 u'PersistentCookie': u'yes',
 435                 u'_utf8': u'霱',
 436                 u'bgresponse': u'js_disabled',
 437                 u'checkConnection': u'',
 438                 u'checkedDomains': u'youtube',
 439                 u'dnConn': u'',
 440                 u'dsh': dsh,
 441                 u'pstMsg': u'0',
 442                 u'rmShown': u'1',
 443                 u'secTok': u'',
 444                 u'signIn': u'Sign in',
 445                 u'timeStmp': u'',
 446                 u'service': u'youtube',
 447                 u'uilel': u'3',
 448                 u'hl': u'en_US',
 449         }
 450         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
 451         # chokes on unicode
 452         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
 453         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
 454         request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
 455         try:
 456             self.report_login()
 457             login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 458             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 459                 self._downloader.report_warning(u'unable to log in: bad username or password')
 460                 return
 461         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 462             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
 463             return
 464
 465         # Confirm age
 466         age_form = {
 467                 'next_url':     '/',
 468                 'action_confirm':   'Confirm',
 469                 }
 470         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
 471         try:
 472             self.report_age_confirmation()
 473             age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
 474         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 475             raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
 476
 477     def _extract_id(self, url):
 478         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
 479         if mobj is None:
 480             raise ExtractorError(u'Invalid URL: %s' % url)
 481         video_id = mobj.group(2)
 482         return video_id
 483
 484     def _real_extract(self, url):
 485         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 486         mobj = re.search(self._NEXT_URL_RE, url)
 487         if mobj:
 488             url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 489         video_id = self._extract_id(url)
 490
 491         # Get video webpage
 492         self.report_video_webpage_download(video_id)
 493         url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
 494         request = compat_urllib_request.Request(url)
 495         try:
 496             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
 497         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 498             raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
 499
 500         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
 501
 502         # Attempt to extract SWF player URL
 503         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 504         if mobj is not None:
 505             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 506         else:
 507             player_url = None
 508
 509         # Get video info
 510         self.report_video_info_webpage_download(video_id)
 511         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 512             video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 513                     % (video_id, el_type))
 514             video_info_webpage = self._download_webpage(video_info_url, video_id,
 515                                     note=False,
 516                                     errnote='unable to download video info webpage')
 517             video_info = compat_parse_qs(video_info_webpage)
 518             if 'token' in video_info:
 519                 break
 520         if 'token' not in video_info:
 521             if 'reason' in video_info:
 522                 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
 523             else:
 524                 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
 525
 526         # Check for "rental" videos
 527         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 528             raise ExtractorError(u'"rental" videos not supported')
 529
 530         # Start extracting information
 531         self.report_information_extraction(video_id)
 532
 533         # uploader
 534         if 'author' not in video_info:
 535             raise ExtractorError(u'Unable to extract uploader name')
 536         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 537
 538         # uploader_id
 539         video_uploader_id = None
 540         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
 541         if mobj is not None:
 542             video_uploader_id = mobj.group(1)
 543         else:
 544             self._downloader.report_warning(u'unable to extract uploader nickname')
 545
 546         # title
 547         if 'title' not in video_info:
 548             raise ExtractorError(u'Unable to extract video title')
 549         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
 550
 551         # thumbnail image
 552         if 'thumbnail_url' not in video_info:
 553             self._downloader.report_warning(u'unable to extract video thumbnail')
 554             video_thumbnail = ''
 555         else:   # don't panic if we can't find it
 556             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 557
 558         # upload date
 559         upload_date = None
 560         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
 561         if mobj is not None:
 562             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 563             upload_date = unified_strdate(upload_date)
 564
 565         # description
 566         video_description = get_element_by_id("eow-description", video_webpage)
 567         if video_description:
 568             video_description = clean_html(video_description)
 569         else:
 570             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
 571             if fd_mobj:
 572                 video_description = unescapeHTML(fd_mobj.group(1))
 573             else:
 574                 video_description = u''
 575
 576         # subtitles
 577         video_subtitles = None
 578
 579         if self._downloader.params.get('writesubtitles', False):
 580             video_subtitles = self._extract_subtitle(video_id)
 581             if video_subtitles:
 582                 (sub_error, sub_lang, sub) = video_subtitles[0]
 583                 if sub_error:
 584                     self._downloader.report_error(sub_error)
 585
 586         if self._downloader.params.get('allsubtitles', False):
 587             video_subtitles = self._extract_all_subtitles(video_id)
 588             for video_subtitle in video_subtitles:
 589                 (sub_error, sub_lang, sub) = video_subtitle
 590                 if sub_error:
 591                     self._downloader.report_error(sub_error)
 592
 593         if self._downloader.params.get('listsubtitles', False):
 594             sub_lang_list = self._list_available_subtitles(video_id)
 595             return
 596
 597         if 'length_seconds' not in video_info:
 598             self._downloader.report_warning(u'unable to extract video duration')
 599             video_duration = ''
 600         else:
 601             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
 602
 603         # token
 604         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
 605
 606         # Decide which formats to download
 607         req_format = self._downloader.params.get('format', None)
 608
 609         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 610             self.report_rtmp_download()
 611             video_url_list = [(None, video_info['conn'][0])]
 612         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
 613             url_map = {}
 614             for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
 615                 url_data = compat_parse_qs(url_data_str)
 616                 if 'itag' in url_data and 'url' in url_data:
 617                     url = url_data['url'][0] + '&signature=' + url_data['sig'][0]
 618                     if not 'ratebypass' in url: url += '&ratebypass=yes'
 619                     url_map[url_data['itag'][0]] = url
 620
 621             format_limit = self._downloader.params.get('format_limit', None)
 622             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
 623             if format_limit is not None and format_limit in available_formats:
 624                 format_list = available_formats[available_formats.index(format_limit):]
 625             else:
 626                 format_list = available_formats
 627             existing_formats = [x for x in format_list if x in url_map]
 628             if len(existing_formats) == 0:
 629                 raise ExtractorError(u'no known formats available for video')
 630             if self._downloader.params.get('listformats', None):
 631                 self._print_formats(existing_formats)
 632                 return
 633             if req_format is None or req_format == 'best':
 634                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
 635             elif req_format == 'worst':
 636                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
 637             elif req_format in ('-1', 'all'):
 638                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
 639             else:
 640                 # Specific formats. We pick the first in a slash-delimeted sequence.
 641                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
 642                 req_formats = req_format.split('/')
 643                 video_url_list = None
 644                 for rf in req_formats:
 645                     if rf in url_map:
 646                         video_url_list = [(rf, url_map[rf])]
 647                         break
 648                 if video_url_list is None:
 649                     raise ExtractorError(u'requested format not available')
 650         else:
 651             raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
 652
 653         results = []
 654         for format_param, video_real_url in video_url_list:
 655             # Extension
 656             video_extension = self._video_extensions.get(format_param, 'flv')
 657
 658             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
 659                                               self._video_dimensions.get(format_param, '???'))
 660
 661             results.append({
 662                 'id':       video_id,
 663                 'url':      video_real_url,
 664                 'uploader': video_uploader,
 665                 'uploader_id': video_uploader_id,
 666                 'upload_date':  upload_date,
 667                 'title':    video_title,
 668                 'ext':      video_extension,
 669                 'format':   video_format,
 670                 'thumbnail':    video_thumbnail,
 671                 'description':  video_description,
 672                 'player_url':   player_url,
 673                 'subtitles':    video_subtitles,
 674                 'duration':     video_duration
 675             })
 676         return results
 677
 678
 679 class MetacafeIE(InfoExtractor):
 680     """Information Extractor for metacafe.com."""
 681
 682     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
 683     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
 684     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 685     IE_NAME = u'metacafe'
 686
 687     def report_disclaimer(self):
 688         """Report disclaimer retrieval."""
 689         self.to_screen(u'Retrieving disclaimer')
 690
 691     def _real_initialize(self):
 692         # Retrieve disclaimer
 693         request = compat_urllib_request.Request(self._DISCLAIMER)
 694         try:
 695             self.report_disclaimer()
 696             disclaimer = compat_urllib_request.urlopen(request).read()
 697         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 698             raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
 699
 700         # Confirm age
 701         disclaimer_form = {
 702             'filters': '0',
 703             'submit': "Continue - I'm over 18",
 704             }
 705         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
 706         try:
 707             self.report_age_confirmation()
 708             disclaimer = compat_urllib_request.urlopen(request).read()
 709         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 710             raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
 711
 712     def _real_extract(self, url):
 713         # Extract id and simplified title from URL
 714         mobj = re.match(self._VALID_URL, url)
 715         if mobj is None:
 716             raise ExtractorError(u'Invalid URL: %s' % url)
 717
 718         video_id = mobj.group(1)
 719
 720         # Check if video comes from YouTube
 721         mobj2 = re.match(r'^yt-(.*)$', video_id)
 722         if mobj2 is not None:
 723             return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
 724
 725         # Retrieve video webpage to extract further information
 726         webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
 727
 728         # Extract URL, uploader and title from webpage
 729         self.report_extraction(video_id)
 730         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
 731         if mobj is not None:
 732             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 733             video_extension = mediaURL[-3:]
 734
 735             # Extract gdaKey if available
 736             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 737             if mobj is None:
 738                 video_url = mediaURL
 739             else:
 740                 gdaKey = mobj.group(1)
 741                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 742         else:
 743             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
 744             if mobj is None:
 745                 raise ExtractorError(u'Unable to extract media URL')
 746             vardict = compat_parse_qs(mobj.group(1))
 747             if 'mediaData' not in vardict:
 748                 raise ExtractorError(u'Unable to extract media URL')
 749             mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
 750             if mobj is None:
 751                 raise ExtractorError(u'Unable to extract media URL')
 752             mediaURL = mobj.group('mediaURL').replace('\\/', '/')
 753             video_extension = mediaURL[-3:]
 754             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
 755
 756         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 757         if mobj is None:
 758             raise ExtractorError(u'Unable to extract title')
 759         video_title = mobj.group(1).decode('utf-8')
 760
 761         mobj = re.search(r'submitter=(.*?);', webpage)
 762         if mobj is None:
 763             raise ExtractorError(u'Unable to extract uploader nickname')
 764         video_uploader = mobj.group(1)
 765
 766         return [{
 767             'id':       video_id.decode('utf-8'),
 768             'url':      video_url.decode('utf-8'),
 769             'uploader': video_uploader.decode('utf-8'),
 770             'upload_date':  None,
 771             'title':    video_title,
 772             'ext':      video_extension.decode('utf-8'),
 773         }]
 774
 775 class DailymotionIE(InfoExtractor):
 776     """Information Extractor for Dailymotion"""
 777
 778     _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 779     IE_NAME = u'dailymotion'
 780
 781     def _real_extract(self, url):
 782         # Extract id and simplified title from URL
 783         mobj = re.match(self._VALID_URL, url)
 784         if mobj is None:
 785             raise ExtractorError(u'Invalid URL: %s' % url)
 786
 787         video_id = mobj.group(1).split('_')[0].split('?')[0]
 788
 789         video_extension = 'mp4'
 790
 791         # Retrieve video webpage to extract further information
 792         request = compat_urllib_request.Request(url)
 793         request.add_header('Cookie', 'family_filter=off')
 794         webpage = self._download_webpage(request, video_id)
 795
 796         # Extract URL, uploader and title from webpage
 797         self.report_extraction(video_id)
 798         mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 799         if mobj is None:
 800             raise ExtractorError(u'Unable to extract media URL')
 801         flashvars = compat_urllib_parse.unquote(mobj.group(1))
 802
 803         for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
 804             if key in flashvars:
 805                 max_quality = key
 806                 self.to_screen(u'Using %s' % key)
 807                 break
 808         else:
 809             raise ExtractorError(u'Unable to extract video URL')
 810
 811         mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 812         if mobj is None:
 813             raise ExtractorError(u'Unable to extract video URL')
 814
 815         video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
 816
 817         # TODO: support choosing qualities
 818
 819         mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 820         if mobj is None:
 821             raise ExtractorError(u'Unable to extract title')
 822         video_title = unescapeHTML(mobj.group('title'))
 823
 824         video_uploader = None
 825         mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
 826         if mobj is None:
 827             # lookin for official user
 828             mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
 829             if mobj_official is None:
 830                 self._downloader.report_warning(u'unable to extract uploader nickname')
 831             else:
 832                 video_uploader = mobj_official.group(1)
 833         else:
 834             video_uploader = mobj.group(1)
 835
 836         video_upload_date = None
 837         mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 838         if mobj is not None:
 839             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 840
 841         return [{
 842             'id':       video_id,
 843             'url':      video_url,
 844             'uploader': video_uploader,
 845             'upload_date':  video_upload_date,
 846             'title':    video_title,
 847             'ext':      video_extension,
 848         }]
 849
 850
 851 class PhotobucketIE(InfoExtractor):
 852     """Information extractor for photobucket.com."""
 853
 854     # TODO: the original _VALID_URL was:
 855     # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 856     # Check if it's necessary to keep the old extracion process
 857     _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
 858     IE_NAME = u'photobucket'
 859
 860     def _real_extract(self, url):
 861         # Extract id from URL
 862         mobj = re.match(self._VALID_URL, url)
 863         if mobj is None:
 864             raise ExtractorError(u'Invalid URL: %s' % url)
 865
 866         video_id = mobj.group('id')
 867
 868         video_extension = mobj.group('ext')
 869
 870         # Retrieve video webpage to extract further information
 871         webpage = self._download_webpage(url, video_id)
 872
 873         # Extract URL, uploader, and title from webpage
 874         self.report_extraction(video_id)
 875         # We try first by looking the javascript code:
 876         mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
 877         if mobj is not None:
 878             info = json.loads(mobj.group('json'))
 879             return [{
 880                 'id':       video_id,
 881                 'url':      info[u'downloadUrl'],
 882                 'uploader': info[u'username'],
 883                 'upload_date':  datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
 884                 'title':    info[u'title'],
 885                 'ext':      video_extension,
 886                 'thumbnail': info[u'thumbUrl'],
 887             }]
 888
 889         # We try looking in other parts of the webpage
 890         mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
 891         if mobj is None:
 892             raise ExtractorError(u'Unable to extract media URL')
 893         mediaURL = compat_urllib_parse.unquote(mobj.group(1))
 894
 895         video_url = mediaURL
 896
 897         mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
 898         if mobj is None:
 899             raise ExtractorError(u'Unable to extract title')
 900         video_title = mobj.group(1).decode('utf-8')
 901
 902         video_uploader = mobj.group(2).decode('utf-8')
 903
 904         return [{
 905             'id':       video_id.decode('utf-8'),
 906             'url':      video_url.decode('utf-8'),
 907             'uploader': video_uploader,
 908             'upload_date':  None,
 909             'title':    video_title,
 910             'ext':      video_extension.decode('utf-8'),
 911         }]
 912
 913
 914 class YahooIE(InfoExtractor):
 915     """Information extractor for screen.yahoo.com."""
 916     _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
 917
 918     def _real_extract(self, url):
 919         mobj = re.match(self._VALID_URL, url)
 920         if mobj is None:
 921             raise ExtractorError(u'Invalid URL: %s' % url)
 922         video_id = mobj.group('id')
 923         webpage = self._download_webpage(url, video_id)
 924         m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
 925
 926         if m_id is None:
 927             # TODO: Check which url parameters are required
 928             info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
 929             webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
 930             info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
 931                         <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
 932                         <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
 933                         <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
 934                         '''
 935             self.report_extraction(video_id)
 936             m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
 937             if m_info is None:
 938                 raise ExtractorError(u'Unable to extract video info')
 939             video_title = m_info.group('title')
 940             video_description = m_info.group('description')
 941             video_thumb = m_info.group('thumb')
 942             video_date = m_info.group('date')
 943             video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
 944
 945             # TODO: Find a way to get mp4 videos
 946             rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
 947             webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
 948             m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
 949             video_url = m_rest.group('url')
 950             video_path = m_rest.group('path')
 951             if m_rest is None:
 952                 raise ExtractorError(u'Unable to extract video url')
 953
 954         else: # We have to use a different method if another id is defined
 955             long_id = m_id.group('new_id')
 956             info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
 957             webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
 958             json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
 959             info = json.loads(json_str)
 960             res = info[u'query'][u'results'][u'mediaObj'][0]
 961             stream = res[u'streams'][0]
 962             video_path = stream[u'path']
 963             video_url = stream[u'host']
 964             meta = res[u'meta']
 965             video_title = meta[u'title']
 966             video_description = meta[u'description']
 967             video_thumb = meta[u'thumbnail']
 968             video_date = None # I can't find it
 969
 970         info_dict = {
 971                      'id': video_id,
 972                      'url': video_url,
 973                      'play_path': video_path,
 974                      'title':video_title,
 975                      'description': video_description,
 976                      'thumbnail': video_thumb,
 977                      'upload_date': video_date,
 978                      'ext': 'flv',
 979                      }
 980         return info_dict
 981
 982 class VimeoIE(InfoExtractor):
 983     """Information extractor for vimeo.com."""
 984
 985     # _VALID_URL matches Vimeo URLs
 986     _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
 987     IE_NAME = u'vimeo'
 988
 989     def _real_extract(self, url, new_video=True):
 990         # Extract ID from URL
 991         mobj = re.match(self._VALID_URL, url)
 992         if mobj is None:
 993             raise ExtractorError(u'Invalid URL: %s' % url)
 994
 995         video_id = mobj.group('id')
 996         if not mobj.group('proto'):
 997             url = 'https://' + url
 998         if mobj.group('direct_link'):
 999             url = 'https://vimeo.com/' + video_id
1000
1001         # Retrieve video webpage to extract further information
1002         request = compat_urllib_request.Request(url, None, std_headers)
1003         webpage = self._download_webpage(request, video_id)
1004
1005         # Now we begin extracting as much information as we can from what we
1006         # retrieved. First we extract the information common to all extractors,
1007         # and latter we extract those that are Vimeo specific.
1008         self.report_extraction(video_id)
1009
1010         # Extract the config JSON
1011         try:
1012             config = webpage.split(' = {config:')[1].split(',assets:')[0]
1013             config = json.loads(config)
1014         except:
1015             if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1016                 raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
1017             else:
1018                 raise ExtractorError(u'Unable to extract info section')
1019
1020         # Extract title
1021         video_title = config["video"]["title"]
1022
1023         # Extract uploader and uploader_id
1024         video_uploader = config["video"]["owner"]["name"]
1025         video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1026
1027         # Extract video thumbnail
1028         video_thumbnail = config["video"]["thumbnail"]
1029
1030         # Extract video description
1031         video_description = get_element_by_attribute("itemprop", "description", webpage)
1032         if video_description: video_description = clean_html(video_description)
1033         else: video_description = u''
1034
1035         # Extract upload date
1036         video_upload_date = None
1037         mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1038         if mobj is not None:
1039             video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1040
1041         # Vimeo specific: extract request signature and timestamp
1042         sig = config['request']['signature']
1043         timestamp = config['request']['timestamp']
1044
1045         # Vimeo specific: extract video codec and quality information
1046         # First consider quality, then codecs, then take everything
1047         # TODO bind to format param
1048         codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1049         files = { 'hd': [], 'sd': [], 'other': []}
1050         for codec_name, codec_extension in codecs:
1051             if codec_name in config["video"]["files"]:
1052                 if 'hd' in config["video"]["files"][codec_name]:
1053                     files['hd'].append((codec_name, codec_extension, 'hd'))
1054                 elif 'sd' in config["video"]["files"][codec_name]:
1055                     files['sd'].append((codec_name, codec_extension, 'sd'))
1056                 else:
1057                     files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1058
1059         for quality in ('hd', 'sd', 'other'):
1060             if len(files[quality]) > 0:
1061                 video_quality = files[quality][0][2]
1062                 video_codec = files[quality][0][0]
1063                 video_extension = files[quality][0][1]
1064                 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1065                 break
1066         else:
1067             raise ExtractorError(u'No known codec found')
1068
1069         video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1070                     %(video_id, sig, timestamp, video_quality, video_codec.upper())
1071
1072         return [{
1073             'id':       video_id,
1074             'url':      video_url,
1075             'uploader': video_uploader,
1076             'uploader_id': video_uploader_id,
1077             'upload_date':  video_upload_date,
1078             'title':    video_title,
1079             'ext':      video_extension,
1080             'thumbnail':    video_thumbnail,
1081             'description':  video_description,
1082         }]
1083
1084
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    URLs whose last path segment matches _LIVE_URL are treated as live
    streams; everything else goes through the "Plus7" extraction chain.
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download url and return the raw response body as read from urlopen.

        Raises ExtractorError on network errors or when urlopen rejects the
        URL with a ValueError.
        """
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError:
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Download url, search it with regex and collect the named groups.

        matchTuples is a sequence of (group_index, key, error_message); each
        group is stored under its key in the returned dictionary.

        Raises ExtractorError when the regex does not match at all, or with
        the tuple's own message when one of the requested groups is None.
        """
        page = self.fetch_webpage(url)
        # fetch_webpage hands back the undecoded response; the patterns used
        # by this class are text, and Python 3 refuses to match a text
        # pattern against bytes, so decode once here.
        if isinstance(page, bytes) and not isinstance(page, type(u'')):
            page = page.decode('utf-8', 'replace')
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Resolve the stream location of a live page.

        NOTE(review): the resulting URL is computed but never returned or
        stored, so this method currently yields None; kept as-is so callers
        see no change.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                r'(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the chain of reference documents and return an info dict.

        The page points to a videoref file, which points to a per-language
        video document, which finally holds id, title, date and the HD URL.
        """
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  unified_strdate(info.get('date')),
            # grep_webpage already returns decoded text, so no decode here
            'title':        info.get('title'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Return a one-element list for Plus7 videos, or None for live pages."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1204
1205
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download, warning that the generic extractor is in use.

        The warning is skipped when the downloader's 'test' parameter is set.
        """
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect to new_url is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url.

        Returns False when the final URL equals url, otherwise the new URL.
        """
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Body-related headers make no sense on the follow-up HEAD
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        """Return a one-element list: either a redirect result or a video info dict.

        Raises ExtractorError when the URL is invalid, when no media URL can
        be found in the page, or when the title or the uploader (the URL's
        host part) cannot be extracted.
        """
        new_url = self._test_redirect(url)
        if new_url:
            return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # Split the media file name into id (stem) and extension (no dot)
        video_id, dotted_extension = os.path.splitext(video_id)
        video_extension = dotted_extension[1:]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            raise ExtractorError(u'Unable to extract uploader')
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1337
1338
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Accepts pseudo-URLs of the form ``ytsearch:TERMS`` (first result only),
    ``ytsearchN:TERMS`` (first N results) and ``ytsearchall:TERMS`` (up to
    ``_max_youtube_results`` results).
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        # _real_extract encodes the search terms as UTF-8, so they must be
        # decoded with that same codec (not the locale's preferred encoding).
        if isinstance(query, bytes):
            query = query.decode('utf-8')
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a ``ytsearch[N|all]:TERMS`` query and fetch its results.

        Returns the playlist built by _get_n_results.
        Raises ExtractorError if the query is malformed or N is not positive.
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        # Split on the first colon only: the search terms may contain colons
        prefix, terms = query.split(':', 1)
        prefix = prefix[len('ytsearch'):]
        query = terms.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._max_youtube_results)

        # Keep the try body minimal so that a ValueError raised while
        # downloading/parsing results is not mistaken for a bad prefix.
        try:
            n = int(prefix)
        except ValueError: # parsing prefix as integer fails
            return self._get_n_results(query, 1)

        if n <= 0:
            raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, terms))
        elif n > self._max_youtube_results:
            self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        query is the UTF-8 encoded search string, n the maximum number of
        results wanted. Result pages are requested 50 entries at a time
        until n ids were collected or the API reports no more items.

        Returns a playlist result whose entries point at the YouTube watch
        pages. Raises ExtractorError if a page cannot be downloaded or
        contains no items.
        """

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            video_ids.extend(video['id'] for video in api_response['items'])

            # Never page past the number of results the API says exist
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may have pushed us past n; keep only the first n ids
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                  for video_id in video_ids[:n]]
        return self.playlist_result(videos, query)
1405
1406
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Accepts pseudo-URLs of the form ``gvsearch:TERMS`` (one result),
    ``gvsearchN:TERMS`` (N results) and ``gvsearchall:TERMS`` (up to
    ``_max_google_results`` results).
    """
    _VALID_URL = r'gvsearch(?P<prefix>|\d+|all):(?P<query>[\s\S]+)'
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def _real_extract(self, query):
        """Parse a ``gvsearch[N|all]:TERMS`` query and fetch its results.

        Returns the playlist dictionary built by _get_n_results.
        Raises ExtractorError if the query is malformed or N is not positive.
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._max_google_results)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_google_results:
                self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        Result pages are fetched in steps of 10 results, beginning with the
        first page (start=0), until at least n links were collected or the
        page has no "next" link. At most n entries are returned.

        Returns a dictionary of type 'playlist' whose entries are 'url'
        dictionaries, in the order the links appear in the result pages.
        """

        entries = []

        # pagenum is zero-based so that the first request asks for start=0
        for pagenum in itertools.count(0):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum + 1))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                entries.append({
                    '_type': 'url',
                    'url': mobj.group(1)
                })

            if len(entries) >= n or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                break

        return {
            '_type': 'playlist',
            'id': query,
            # A page can carry more links than still needed; cut down to n
            'entries': entries[:n]
        }
1455
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Accepts pseudo-URLs of the form ``yvsearch:TERMS`` (one result),
    ``yvsearchN:TERMS`` (N results) and ``yvsearchall:TERMS`` (up to
    ``_max_yahoo_results`` results).
    """

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'

    _max_yahoo_results = 1000
    IE_NAME = u'screen.yahoo:search'

    def _real_extract(self, query):
        """Parse a ``yvsearch[N|all]:TERMS`` query and fetch its results.

        Returns the playlist dictionary built by _get_n_results.
        Raises ExtractorError if the query is malformed or N is not positive.
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        # Split on the first colon only: the search terms may contain colons
        prefix, terms = query.split(':', 1)
        prefix = prefix[len('yvsearch'):]
        query = terms.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._max_yahoo_results)

        # Keep the try body minimal so that a ValueError raised while
        # downloading/parsing results is not mistaken for a bad prefix.
        try:
            n = int(prefix)
        except ValueError: # parsing prefix as integer fails
            return self._get_n_results(query, 1)

        if n <= 0:
            raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, terms))
        elif n > self._max_yahoo_results:
            self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        query is the UTF-8 encoded search string, n the maximum number of
        results wanted. Result pages are requested with an offset that grows
        by 30 per page.

        Returns a dictionary of type 'playlist' with at most n entries.
        """

        entries = []
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            # NOTE(review): 'm' looks like paging metadata ('last' shown,
            # 'total' available) - confirm against the service's response.
            m = info[u'm']
            results = info[u'results']

            for r in results:
                if len(entries) >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                if mobj is None:
                    # Result without a screen.yahoo.com link: nothing to queue
                    continue
                entries.append(self.url_result('http://' + mobj.group('url'), 'Yahoo'))

            # Stop when enough entries were collected, when the service says
            # this was the last page, or when a page came back empty (which
            # would otherwise keep the loop requesting pages forever).
            if not results or len(entries) >= n or m[u'last'] >= (m[u'total'] - 1):
                break

        return {
            '_type': 'playlist',
            'id': query,
            'entries': entries
        }
1514
1515
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Verbose pattern: must always be matched with the re.VERBOSE flag.
    # Group 1 holds the playlist id found in a youtube.com URL, group 2 a
    # bare playlist id given on its own.
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        matched = re.match(cls._VALID_URL, url, re.VERBOSE)
        return matched is not None

    def _real_extract(self, url):
        """Collect every video of the playlist via the paged GData feed.

        Returns a one-element list holding the playlist result; its entries
        are ordered by the position reported for each feed entry.
        Raises ExtractorError on an invalid URL or a malformed API response.
        """
        # Work out which playlist is meant
        id_match = re.match(self._VALID_URL, url, re.VERBOSE)
        if id_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = id_match.group(1) or id_match.group(2)

        # (position, video url) pairs gathered over all pages
        positioned = []

        for page_num in itertools.count(1):
            first_index = self._MAX_RESULTS * (page_num - 1) + 1
            api_url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, first_index)
            page = self._download_webpage(api_url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            feed = response['feed']
            playlist_title = feed['title']['$t']
            if 'entry' not in feed:
                # The previous page was completely full and was the last one
                break
            feed_entries = feed['entry']

            # Entries without 'content' carry no video URL and are left out
            for entry in feed_entries:
                if 'content' in entry:
                    positioned.append((entry['yt$position']['$t'], entry['content']['src']))

            # A page that is not full is the final page
            if len(feed_entries) < self._MAX_RESULTS:
                break

        positioned.sort()

        url_results = [self.url_result(video_url, 'Youtube') for _, video_url in positioned]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1581
1582
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from page, each once, in page order."""
        unique_ids = []
        for link in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = link.group(1)
            if video_id in unique_ids:
                continue
            unique_ids.append(video_id)
        return unique_ids

    def _real_extract(self, url):
        """Gather the videos of a channel, following its "load more" pages.

        Returns a one-element list holding the playlist result for the
        channel. Raises ExtractorError if url is not a channel URL.
        """
        id_match = re.match(self._VALID_URL, url)
        if id_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = id_match.group(1)

        # The first page is the regular HTML channel page
        pagenum = 1
        first_page = self._download_webpage(self._TEMPLATE_URL % (channel_id, pagenum),
                                            channel_id,
                                            u'Downloading page #%s' % pagenum)
        video_ids = list(self.extract_videos_from_page(first_page))

        # Later pages come from the json-based channel_ajax query, which is
        # repeated for as long as a page still announces a "load more" widget
        more_pages = self._MORE_PAGES_INDICATOR in first_page
        while more_pages:
            pagenum += 1
            ajax_page = self._download_webpage(self._MORE_PAGES_URL % (pagenum, channel_id),
                                               channel_id,
                                               u'Downloading page #%s' % pagenum)
            payload = json.loads(ajax_page)
            video_ids.extend(self.extract_videos_from_page(payload['content_html']))
            more_pages = self._MORE_PAGES_INDICATOR in payload['load_more_widget_html']

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                       for video_id in video_ids]
        return [self.playlist_result(url_entries, channel_id)]
1640
1641
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        """Collect the uploads of a user through the paged GData feed.

        Returns a one-element list holding a playlist result titled with
        the user name. Raises ExtractorError if url does not name a user.
        """
        user_match = re.match(self._VALID_URL, url)
        if user_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        username = user_match.group(1)

        # The YouTube Data API hands out a limited number of results per
        # request (_GDATA_PAGE_SIZE), so the uploads are fetched page by
        # page until a page turns out not to be full.
        video_ids = []
        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            progress_note = u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE)
            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username, progress_note)

            # Video ids of this page, each once, in order of appearance
            ids_in_page = []
            for found in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = found.group(1)
                if candidate not in ids_in_page:
                    ids_in_page.append(candidate)
            video_ids.extend(ids_in_page)

            # Fewer ids than a full page means there is nothing left to
            # fetch, which saves one more (empty) request.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

        url_results = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                       for video_id in video_ids]
        return [self.playlist_result(url_results, playlist_title = username)]
1698
1699
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Collect the episodes of a blip.tv user.

        Returns a one-element list holding a playlist result titled with
        the user name. Raises ExtractorError if url does not name a user or
        the numeric user id cannot be found on the user page.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # The "bliptvuser:NAME" shortcut is not a downloadable URL; turn it
        # into the user page URL that _VALID_URL accepts as well.
        if url.startswith('bliptvuser:'):
            url = 'http://blip.tv/' + username

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            raise ExtractorError(u'Unable to extract user id')
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # Unescape before the membership test so that the comparison
                # is made against what is actually stored in the list
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        url_entries = [self.url_result(u'http://blip.tv/%s' % video_id, 'BlipTV')
                       for video_id in video_ids]
        return [self.playlist_result(url_entries, playlist_title = username)]
1758
1759
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Find the direct download URL and the title of a deposited file.

        Returns a one-element list with the info dictionary of the file.
        Raises ExtractorError if the page cannot be retrieved, if the site
        reports a restriction, or if the URL or title cannot be found.
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        # POST data has to be bytes on Python 3; the urlencoded form is ASCII
        post_data = compat_urllib_parse.urlencode(free_download_indication).encode('ascii')
        request = compat_urllib_request.Request(url, post_data)
        try:
            self.report_download_webpage(file_id)
            # Decode once here so that all the searches below work on text
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if mobj is None:
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if mobj is not None:
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        file_title = mobj.group(1)

        # The id comes from the caller's URL, which may still be a byte string
        if isinstance(file_id, bytes):
            file_id = file_id.decode('utf-8')

        return [{
            'id':       file_id,
            'url':      file_url,
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension,
        }]
1807
1808
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in with the configured credentials, if there are any.

        Credentials come from the downloader's 'username'/'password'
        parameters or, when 'usenetrc' is set, from the .netrc entry for
        _NETRC_MACHINE. Nothing happens without a downloader or without
        credentials; a failed login is only reported as a warning.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        # POST data has to be bytes on Python 3; the urlencoded form is ASCII
        post_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, post_data)
        try:
            self.report_login()
            # Decode so that the text pattern below can be applied to it
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Being served the login form again means the login was rejected
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')

    def _real_extract(self, url):
        """Extract the video URL, title, duration and thumbnail of a video.

        Returns a one-element list with the info dictionary. The HD source
        is preferred over the SD one. Raises ExtractorError if the URL is
        invalid or the page does not contain the expected data.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player variables are a JSON list of pairs sitting between
        # these two fragments of the page's inline script
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # .get() for both sources, so that a missing source leads to the
        # ExtractorError below instead of a KeyError
        video_url = video_data.get('hd_src') or video_data.get('sd_src')
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
1905
1906
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report that the URL points directly at a video file."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract the video information for a blip.tv URL.

        '/play/' URLs are resolved through their redirect first. If the
        server answers with a video content type the URL is treated as a
        direct download and the open handle is handed on as 'urlhandle';
        otherwise the JSON description of the post is parsed.

        Returns a one-element list with the info dictionary.
        Raises ExtractorError if the URL is invalid or the information
        cannot be downloaded or parsed.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            try:
                redirecturl = response.geturl()
            finally:
                # Only the final URL is needed, not the response body
                response.close()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            # The file name is the last path component of the 'file'
            # parameter carried in the fragment of the redirect target
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # Only byte strings can (and need to) be decoded; text
                # strings are used as they are
                if isinstance(title, bytes):
                    title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
            finally:
                # The handle is not passed on in this branch, so release it
                urlh.close()

            try:
                json_data = json.loads(json_code)
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
2000
2001
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def _real_extract(self,url):
        """Build the info dictionary for a myvideo.de watch URL.

        The media URL is derived from the base of the thumbnail link found
        in the watch page; the title is taken from the page's <title>.
        Returns a one-element list with the info dictionary.
        Raises ExtractorError if the URL is invalid or either piece of
        information is missing from the page.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group(1)

        # Fetch the canonical watch page for this id
        webpage = self._download_webpage('http://www.myvideo.de/watch/%s' % video_id, video_id)

        self.report_extraction(video_id)
        thumb_match = re.search(
            r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
            webpage)
        if thumb_match is None:
            raise ExtractorError(u'Unable to extract media URL')

        title_match = re.search('<title>([^<]+)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')

        # The flv file lives next to the thumbs directory, named after the id
        media_base = thumb_match.group(1)
        info = {
            'id':       video_id,
            'url':      media_base + ('/%s.flv' % video_id),
            'uploader': None,
            'upload_date':  None,
            'title':    title_match.group(1),
            'ext':      u'flv',
        }
        return [info]
2040
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # Accepted inputs (the pattern is matched with re.VERBOSE):
    #   * abbreviations such as :tds, :thedailyshow, :cr or :colbert
    #   * full-episode URLs: <show>.com/full-episodes/<episode>
    #   * clip URLs such as
    #       http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #       http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #       http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Every known bitrate maps to the same container.
    _video_extensions = dict.fromkeys(_available_formats, 'mp4')
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Match objects are always truthy, so bool() is True exactly when
        # the pattern matched.
        return bool(re.match(cls._VALID_URL, url, re.VERBOSE))

    def _print_formats(self, formats):
        """Print one line per format id with its extension and dimensions."""
        print('Available formats:')
        for bitrate in formats:
            extension = self._video_extensions.get(bitrate, 'mp4')
            dimensions = self._video_dimensions.get(bitrate, '???')
            print('%s\t:\t%s\t[%s]' %(bitrate, extension, dimensions))

    def _real_extract(self, url):
        """Return one info dict per <item> of the show's MRSS index.

        Returns None (after printing the format table) when the
        'listformats' downloader parameter is set. Raises ExtractorError on
        an unrecognised URL, on a redirect that does not lead to a specific
        episode, when no player reference is found in the page, or when a
        rendition URL does not have the expected RTMP shape.
        """
        url_match = re.match(self._VALID_URL, url, re.VERBOSE)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        shortname = url_match.group('shortname')
        if shortname:
            # Abbreviations stand for the show's full-episodes landing page.
            if shortname in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            url_match = re.match(self._VALID_URL, url, re.VERBOSE)
            assert url_match is not None

        if url_match.group('clip'):
            download_newest = False
            if url_match.group('showname') == 'thedailyshow':
                title_group = 'tdstitle'
            else:
                title_group = 'cntitle'
            ep_title = url_match.group(title_group)
        else:
            # A full-episodes URL without an episode part means "whatever
            # the site redirects to".
            download_newest = not url_match.group('episode')
            if download_newest:
                ep_title = url_match.group('showname')
            else:
                ep_title = url_match.group('episode')

        self.report_extraction(ep_title)
        webpage, page_handle = self._download_webpage_handle(url, ep_title)
        if download_newest:
            # Re-parse the URL we were redirected to in order to learn the
            # actual episode.
            url = page_handle.geturl()
            url_match = re.match(self._VALID_URL, url, re.VERBOSE)
            if url_match is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if url_match.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            ep_title = url_match.group('episode')

        movie_params = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if not movie_params:
            # Fallback: the page may carry the media reference in a
            # data-mgid attribute without the URL prefix, so the prefix is
            # added here by hand.
            mgids = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if not mgids:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            movie_params = [("http://media.mtvnservices.com/" + mgids[0], mgids[0])]

        uri = movie_params[0][1]
        index_url = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        index_xml = self._download_webpage(index_url, ep_title,
                                           u'Downloading show index',
                                           u'unable to download episode index')

        results = []

        index_doc = xml.etree.ElementTree.fromstring(index_xml)
        for part_num, item in enumerate(index_doc.findall('.//item')):
            media_id = item.findall('./guid')[0].text
            id_fields = media_id.split(':')
            short_media_id = id_fields[-1]
            show_id = id_fields[-2].replace('.com', '')
            official_title = item.findall('./title')[0].text
            official_date = unified_strdate(item.findall('./pubDate')[0].text)

            config_url = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                          compat_urllib_parse.urlencode({'uri': media_id}))
            config_xml = self._download_webpage(config_url, ep_title,
                                                u'Downloading configuration for %s' % short_media_id)

            config_doc = xml.etree.ElementTree.fromstring(config_xml)
            # (bitrate, source URL) pairs in document order.
            renditions = [
                (node.attrib['bitrate'], node.findall('./src')[0].text)
                for node in config_doc.findall('.//rendition')]

            if not renditions:
                self._downloader.report_error(u'unable to download ' + media_id + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([pair[0] for pair in renditions])
                return

            # Default to the last rendition listed (treated as the highest
            # bitrate), unless the requested format is available.
            chosen_format, rtmp_video_url = renditions[-1]
            req_format = self._downloader.params.get('format', None)
            for bitrate, source in renditions:
                if bitrate == req_format:
                    chosen_format, rtmp_video_url = bitrate, source
                    break

            # Rewrite the RTMP location into a plain HTTP download URL.
            rtmp_match = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not rtmp_match:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + rtmp_match.group('finalid')

            results.append({
                'id': short_media_id,
                'url': video_url,
                'uploader': show_id,
                'upload_date': official_date,
                'title': show_id + u'-' + ep_title + u' part ' + compat_str(part_num+1),
                'ext': 'mp4',
                'format': chosen_format,
                'thumbnail': None,
                'description': official_title,
            })

        return results
2207
2208
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for an episode page.

        The page's og:video meta tag carries a player URL whose 'config='
        parameter points at a JavaScript configuration file; the media URL
        is read from the second entry of that file's 'playlist'.

        The description and thumbnail are optional and are set to None when
        the page does not provide them. Raises ExtractorError when the URL
        is not recognised, when the player or configuration URL cannot be
        found, when the configuration is not parseable, or when it does not
        contain the expected playlist entry.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = self._download_webpage(url, showName)

        # Optional metadata: a missing tag must not abort the extraction.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1)) if descMatch else None
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1)) if imgMatch else None

        # Mandatory: without the player URL there is no way to reach the media.
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            raise ExtractorError(u'Unable to extract player URL')
        playerUrl = unescapeHTML(playerUrlMatch.group(1))

        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            raise ExtractorError(u'Unable to extract configuration URL')
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        configJSON = self._download_webpage(configUrl, showName,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON: single quotes are turned
        # into double quotes so that the JSON parser accepts it.
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except ValueError as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        try:
            videoUrl = config['playlist'][1]['url']
        except (KeyError, IndexError, TypeError):
            raise ExtractorError(u'Unable to find video URL in configuration file')

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2262
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report that the XML manifest is being downloaded."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Return a one-element list with the info dict for a video URL.

        Two XML documents are fetched: the 'moogaloop' metadata for the
        video id (description, caption, thumbnail, manifest location) and
        the f4m manifest it points to. The final URL is assembled from the
        manifest's host, its <id> and the <media> element's 'url' attribute.

        Raises ExtractorError when the URL is not recognised, when either
        download fails, or when either document lacks an expected element
        or attribute.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')
        if not manifest_url:
            # An empty <file> element has no text to build the manifest
            # location from.
            raise ExtractorError(u'Invalid metadata XML file')

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download XML manifest: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            manifest_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except (IndexError, KeyError):
            # IndexError: element missing; KeyError: <media> has no 'url'.
            raise ExtractorError(u'Invalid manifest file')

        # The id reported under 'id' stays the one from the page URL; the
        # manifest id (minus its last two characters) is only a path part.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + manifest_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2324
2325
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for a video page.

        The media URL, the title and the thumbnail are all taken from the
        downloaded page. Raises ExtractorError when the URL is not
        recognised or when any of the three cannot be found.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        def find(pattern, failure):
            # Search the page; every field handled below is mandatory.
            found = re.search(pattern, webpage)
            if found is None:
                raise ExtractorError(failure)
            return found

        # The media URL is percent-encoded inside the page.
        flv_match = find(r'flv_url=(.+?)&', u'Unable to extract video url')
        media_url = compat_urllib_parse.unquote(flv_match.group(1))

        title_match = find(r'<title>(.*?)\s+-\s+XVID',
                           u'Unable to extract video title')

        # The thumbnail is the whole matched URL, not just the file name
        # captured by the group.
        thumb_match = find(
            r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            u'Unable to extract video thumbnail')

        return [{
            'id': video_id,
            'url': media_url,
            'uploader': None,
            'upload_date': None,
            'title': title_match.group(1),
            'ext': 'flv',
            'thumbnail': thumb_match.group(0),
            'description': None,
        }]
2375
2376
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com

       The track URL is first resolved to its metadata through the
       resolve.json API; the numeric track id found there is then used to
       ask for the stream definitions, from which the 128 kbit MP3 HTTP URL
       is taken.
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    # API key sent with every request made by this extractor.
    _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28'

    def report_resolve(self, video_id):
        """Report that the id is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Return a one-element list with the info dict for a track URL.

        Raises ExtractorError when the URL is not recognised, when the
        resolve call answers with an error payload, or when the stream
        definitions contain no 'http_mp3_128_url' entry.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both the uploader and the track slug come straight from the URL.
        uploader = mobj.group(1)
        slug_title = mobj.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Rebuild a canonical track URL so that extra path or query parts
        # of the input are not forwarded to the API.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + self._CLIENT_ID
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        if 'errors' in info:
            # The API may answer with a list of errors instead of a track.
            messages = u'; '.join(
                compat_str(err['error_message']) for err in info['errors'])
            raise ExtractorError(u'unable to download video webpage: %s' % messages)
        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=' + self._CLIENT_ID
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        try:
            mediaURL = streams['http_mp3_128_url']
        except KeyError:
            raise ExtractorError(u'unable to find an MP3 stream URL')
        upload_date = unified_strdate(info['created_at'])

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2433
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets

       The set URL is resolved to its metadata through the resolve.json
       API; for every track listed there the stream definitions are
       requested and the 128 kbit MP3 HTTP URL is taken from them.
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    # API key sent with every request made by this extractor.
    _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28'

    def report_resolve(self, video_id):
        """Report that the id is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Return a list with one info dict per downloadable track of a set.

        When the resolve call answers with an error payload, every error is
        reported through the downloader and None is returned. A track whose
        stream definitions contain no 'http_mp3_128_url' entry is reported
        and skipped, so the remaining tracks are still returned.

        Raises ExtractorError when the URL is not recognised.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both the uploader and the set slug come straight from the URL.
        uploader = mobj.group(1)
        slug_title = mobj.group(2)
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + self._CLIENT_ID
        info_json = self._download_webpage(resolv_url, full_title)

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=' + self._CLIENT_ID
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams.get('http_mp3_128_url')
            if mediaURL is None:
                # One unavailable track should not abort the whole set.
                self._downloader.report_error(u'unable to download track %s: no MP3 stream found' % compat_str(video_id))
                continue

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2496
2497
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for a presentation.

        The page stores the media path base64-encoded (and percent-encoded)
        in a 'jsclassref' JavaScript variable; the decoded path is appended
        to the RTMPE base URL. The video id and the extension are the two
        halves of the path's last component, split at its last dot.

        Raises ExtractorError when the URL is not recognised, when the
        media reference or the title cannot be found in the page, or when
        the media file name has no extension.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # No id is known yet, so the URL itself is used for progress output.
        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = mobj.group(1)

        # Extract description (optional, a placeholder is used otherwise)
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Split at the LAST dot only: a plain split('.') unpacked into two
        # names fails as soon as the file name holds another dot.
        video_filename = video_url.split('/')[-1]
        video_id, dot, extension = video_filename.rpartition('.')
        if not dot:
            raise ExtractorError(u'Unable to extract video id and extension')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2544
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json

        jsonData[fmt] is either a mapping from bitrate to a list of URLs or,
        when no bitrate information is present, directly a list of URLs.
        With a mapping, the requested bitrate is used if present; otherwise
        (or for None / 'best') the highest key is selected.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list (None if none can be opened)"""
        for url in url_list:
            try:
                handle = compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Not reachable: try the next candidate.
                continue
            # The connection was only a probe; do not leave it open.
            handle.close()
            return url

        return None

    def _print_formats(self, formats):
        """Print one line per format/bitrate pair with its file extension."""
        print('Available formats:')
        for fmt in formats:
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Return a one-element list with the info dict for a cloudcast.

        Returns None (after printing the format table) when the
        'listformats' downloader parameter is set. Raises ExtractorError
        when the URL is not recognised, when the API document cannot be
        retrieved, when the requested format does not exist, or when none
        of the candidate media URLs can be opened.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url (already text, no decoding)
        uploader = mobj.group(1)
        slug = mobj.group(2)
        file_id = uploader + "-" + slug

        # construct API request from the matched parts, so that a missing
        # trailing slash in the input does not shift the path components
        api_url = 'http://www.mixcloud.com/api/1/cloudcast/' + uploader + '/' + slug + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(api_url)
        try:
            self.report_download_json(api_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON (the response is raw bytes)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        file_url = None
        format_param = None
        if req_format is None or req_format == 'best':
            # Take the first format that has a reachable URL.
            for format_param in formats:
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        if file_url is None:
            raise ExtractorError(u'Unable to find a working media URL')

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (u'NA' if format_param is None else format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2649
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Return the info dicts for a video, a course or the whole site.

        Three kinds of URL are handled:
          * course and video given: the video's XML description is fetched
            and a one-element list is returned;
          * only course given: every VideoPage link of the course page is
            extracted in turn;
          * neither: every CoursePage link of the home page is extracted
            in turn.

        Raises ExtractorError when the URL is not recognised, when a
        download fails, or when a video's XML description lacks the title
        or the file name.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            # The extension is whatever follows the last dot of the URL.
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            # Fetched through _download_webpage, like the course page above,
            # so that the page is text and can be searched with the text
            # pattern below (a raw urlopen().read() yields bytes).
            # NOTE(review): the explicit report_download_webpage() call was
            # dropped on the assumption that _download_webpage reports the
            # download itself when no note is given -- confirm.
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            rootpage = self._download_webpage(rootURL, info['id'],
                                        errnote=u'Unable to download course info page')

            info['title'] = info['id']

            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
2750
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    @staticmethod
    def _to_text(value):
        """Return value as text, decoding it as ISO-8859-1 only if it is a byte string."""
        if isinstance(value, bytes):
            return value.decode('iso-8859-1')
        return value

    def _real_extract(self, url):
        """Extract the info dictionary for a single MTV.com video page.

        The song name, performer, mtvn_uri and content id are scraped from
        the video page; the media URL comes from the mediaGen XML document,
        of which the last listed rendition is used.

        Returns a one-element list with the info dictionary.
        Raises ExtractorError if the URL is invalid, a required page field
        is missing, the metadata cannot be downloaded, or no usable
        rendition is present.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract song name')
        # Only byte strings are decoded: calling .decode() on text fails on
        # Python 3 and breaks on non-ASCII characters on Python 2.
        song_name = unescapeHTML(self._to_text(mobj.group(1)))

        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract performer')
        performer = unescapeHTML(self._to_text(mobj.group(1)))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract content id')
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            # Without this guard an empty document surfaces as a bare IndexError.
            raise ExtractorError(u'Unable to find any rendition in video metadata')

        # For now, always pick the highest quality (the last rendition listed).
        rendition = renditions[-1]

        try:
            _, _, ext = rendition.attrib['type'].partition('/')
            video_format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except (KeyError, AttributeError):
            # KeyError: missing attribute; AttributeError: missing <src> child.
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': video_format,
        }

        return [info]
2819
2820
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com"""

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a session id from the current time in ms plus two random numbers."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000, 1998)
        random2 = random.randint(1000, 9999)

        return "%d%d%d" % (nowTime, random1, random2)

    def _get_file_ID_mix_string(self, seed):
        """Return the alphabet shuffled by the seed-driven generator, as a list of characters."""
        mixed = []
        # The backslash is written as "\\" explicitly; the former "\:" relied
        # on an invalid escape sequence that happens to yield the same text.
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890")
        seed = float(seed)
        for _ in range(len(source)):
            seed = (seed * 211 + 30031) % 65536
            index = int(math.floor(seed / 65536 * len(source)))
            # All characters are unique, so popping by index is equivalent to
            # removing by value.
            mixed.append(source.pop(index))
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated list of indices into the real file id string."""
        mixed = self._get_file_ID_mix_string(seed)
        return ''.join(mixed[int(ch)] for ch in fileId.split('*') if ch)

    def _real_extract(self, url):
        """Extract one info dictionary per video segment.

        Returns a list of info dictionaries, one for every segment key
        listed for the selected stream.
        Raises ExtractorError if the URL is invalid or the playlist JSON
        does not have the expected structure.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            video_data = json.loads(jsondata)['data'][0]

            video_title = video_data['title']
            seed = video_data['seed']
            streamfileids = video_data['streamfileids']

            req_format = self._downloader.params.get('format', None)
            if req_format is None or req_format == 'best':
                stream = 'hd2' if 'hd2' in streamfileids else 'flv'
                ext = u'flv'
            elif req_format == 'worst':
                stream = 'mp4'
                ext = u'mp4'
            else:
                stream = 'flv'
                ext = u'flv'

            fileid = streamfileids[stream]
            keys = [s['k'] for s in video_data['segs'][stream]]
        except (UnicodeDecodeError, ValueError, KeyError, IndexError, TypeError):
            # IndexError/TypeError cover an empty or null 'data' entry.
            raise ExtractorError(u'Unable to extract info section')

        files_info = []
        sid = self._gen_sid()
        try:
            fileid = self._get_file_id(fileid, seed)
        except (ValueError, IndexError, TypeError):
            raise ExtractorError(u'Unable to decode file id')

        # Characters 9 and 10 of fileid (indices 8 and 9) hold the segment
        # number as two hex digits; they are replaced for every segment.
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
2913
2914
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Scrape the video URL, title and thumbnail from an xnxx.com video page.

        Returns a one-element list with the info dictionary.
        Raises ExtractorError if the URL is invalid or any of the three
        fields cannot be found in the page.
        """
        url_match = re.match(self._VALID_URL, url)
        if not url_match:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group(1)

        page = self._download_webpage(url, video_id)

        # The media URL is percent-encoded inside the page.
        flv_match = re.search(self.VIDEO_URL_RE, page)
        if not flv_match:
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(flv_match.group(1))

        title_match = re.search(self.VIDEO_TITLE_RE, page)
        if not title_match:
            raise ExtractorError(u'Unable to extract video title')

        thumb_match = re.search(self.VIDEO_THUMB_RE, page)
        if not thumb_match:
            raise ExtractorError(u'Unable to extract video thumbnail')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': title_match.group(1),
            'ext': 'flv',
            'thumbnail': thumb_match.group(1),
            'description': None,
        }
        return [info]
2958
2959
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the entry's upload date"""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the entry's uploader"""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the entry's title"""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract the highest-resolution video attached to a Google+ post.

        Returns a one-element list with the info dictionary.
        Raises ExtractorError if the URL is invalid, or the video page URL
        or the video links cannot be found.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        # _VALID_URL makes the scheme optional; add it back so the download
        # request gets an absolute URL.
        if not post_url.startswith('https://'):
            post_url = 'https://' + post_url
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        # Extract upload date (optional field: an unparsable value yields None)
        upload_date = None
        mobj = re.search(r'title="Timestamp">(.*?)</a>', webpage)
        if mobj:
            try:
                # Convert timestring to a format suitable for filename
                upload_date = datetime.datetime.strptime(mobj.group(1), "%Y-%m-%d").strftime('%Y%m%d')
            except ValueError:
                upload_date = None
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        mobj = re.search(r'rel\="author".*?>(.*?)</a>', webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        mobj = re.search(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]', webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Simulate clicking the image box to launch video
        pattern = r'"(https://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video page URL')

        video_page = mobj.group(1)
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes from the video page; each match
        # is a (resolution, url) tuple of strings.
        pattern = r'\d+,\d+,(\d+),"(http://redirector\.googlevideo\.com.*?)"'
        links = re.findall(pattern, webpage)
        if not links:
            raise ExtractorError(u'Unable to extract video links')

        # Pick the highest resolution. The resolution is compared as an
        # integer: comparing the raw strings would rank "720" above "1080".
        video_url = max(links, key=lambda link: (int(link[0]), link[1]))[1]

        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3069
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages"""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Build the info dictionary for an nba.com video page.

        The media URL is derived from the path part of the page URL; title,
        date and description are scraped from the page when present.

        Returns a one-element list with the info dictionary.
        Raises ExtractorError if the URL is invalid.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The id is the URL path below /video, without a trailing /index.html.
        index_suffix = '/index.html'
        video_id = url_match.group(1)
        if video_id.endswith(index_suffix):
            video_id = video_id[:-len(index_suffix)]

        webpage = self._download_webpage(url, video_id)

        def _findProp(rexp, default=None):
            # First capture group of rexp in the page, HTML-unescaped, or default.
            found = re.search(rexp, webpage)
            if not found:
                return default
            return unescapeHTML(found.group(1))

        shortened_video_id = video_id.rpartition('/')[2]
        raw_title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id)

        return [{
            'id': shortened_video_id,
            'url': u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4',
            'ext': 'mp4',
            'title': raw_title.replace('NBA.com: ', ''),
            'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }]
3104
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    def _parse_page(self, url, video_id):
        """Download one page of the JSON API and convert it to info dictionaries.

        Returns a tuple (number of items in the response, list of info
        dictionaries for the items that have a video URL).
        Raises ExtractorError if the API response is not a list.
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if not isinstance(response, list):
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if not video_url:
                # Only clips with a downloadable file are valid items.
                continue
            video_uploader_id = clip.get('user_id', clip.get('channel_id'))
            # A separate name keeps the video_id parameter untouched.
            clip_id = clip['id']
            info.append({
                'id': clip_id,
                'url': video_url,
                'title': clip.get('title', clip_id),
                'uploader': clip.get('channel_name', video_uploader_id),
                'uploader_id': video_uploader_id,
                'upload_date': re.sub('-', '', clip['start_time'][:10]),
                'ext': os.path.splitext(video_url)[1][1:],
            })
        return (len(response), info)

    def _real_extract(self, url):
        """Extract a channel archive, a single broadcast or a chapter.

        Channel URLs are fetched page by page until a short page is
        returned; broadcast URLs need a single request; chapter URLs return
        the whole archive file that contains the chapter.

        Returns a list of info dictionaries.
        Raises ExtractorError on an invalid URL, an API error, or when the
        chapter's archive or its video URL cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)

            # findtext tolerates <archive> elements that lack an <id> child.
            archive = None
            for candidate in doc.findall('.//archive'):
                if candidate.findtext('./id') == archive_id:
                    archive = candidate
                    break
            if archive is None:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = archive.findtext('./video_file_url')
            if not video_url:
                raise ExtractorError(u'Could not find video URL in chapter information')
            # Same extension logic as _parse_page; falls back to flv when the
            # last path component has no extension.
            video_ext = os.path.splitext(video_url)[1][1:] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A page shorter than the limit is the last one.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3237
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Scrape the media URL, title and description from a video page.

        Returns a one-element list with the info dictionary.
        Raises ExtractorError if the URL is invalid or the media URL or the
        title cannot be found.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = url_match.group('id')
        webpage = self._download_webpage(url, video_id)

        # The media URL is the src of the second <source> inside <video>.
        source_match = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if source_match is None:
            raise ExtractorError(u'Unable to find video information')

        # Prefer the player heading; fall back to the document <title>.
        title_match = (
            re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
            or re.search(r'<title>(?P<title>[^<]+?)</title>', webpage))
        if title_match is None:
            raise ExtractorError(u'Cannot find video title')

        desc_match = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(desc_match.group('desc')) if desc_match else None

        return [{
            'id': video_id,
            'url': unescapeHTML(source_match.group('url')),
            'ext': 'mp4',
            'title': clean_html(title_match.group('title')),
            'description': desc,
        }]
3275
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com video and app pages"""

    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax, hence the explicit flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Extract all movies listed on a game's video page as one playlist.

        The page is requested through the age-check URL with a fixed birth
        date so that age-gated games are reachable.

        Returns a one-element list with the playlist result.
        Raises ExtractorError if the URL is invalid or the game title
        cannot be found.
        """
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)

        title_match = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage)
        if title_match is None:
            # Previously surfaced as an AttributeError on None.
            raise ExtractorError(u'Cannot find game title for %s' % gameID)
        game_title = title_match.group('game_title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # Movies, titles and thumbnails are paired by their order in the page.
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb,
            }
            videos.append(info)
        return [self.playlist_result(videos, gameID, game_title)]
3320
class UstreamIE(InfoExtractor):
    """Information extractor for recorded videos on ustream.tv"""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Build the info dictionary for a recorded ustream.tv video.

        The media URL is derived from the video id; title and uploader id
        are scraped from the page.

        Returns a one-element list with the info dictionary.
        Raises ExtractorError if the URL is invalid or the title cannot be
        found. A missing uploader is not fatal and yields None.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoID')
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'data-title="(?P<title>.+)"', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract title')
        title = m.group('title')

        # The uploader is an optional field, so its absence must not abort.
        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"', webpage)
        uploader = m.group('uploader') if m else None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }
        return [info]
3342
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com and worldstarcandy.com"""

    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Scrape the media URL, title and thumbnail from a video page.

        Returns a one-element list with the info dictionary.
        Raises ExtractorError if the media URL or the title cannot be found.
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        webpage_src = self._download_webpage(url, video_id)

        file_match = re.search(r'so\.addVariable\("file","(.*?)"\)', webpage_src)
        if file_match is None:
            raise ExtractorError(u'Cannot find video url for %s' % video_id)
        video_url = file_match.group(1)
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        title_match = re.search(r"<title>(.*)</title>", webpage_src)
        if title_match is None:
            raise ExtractorError(u'Cannot determine title')
        title = title_match.group(1)

        thumb_match = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
        if thumb_match is None:
            # No thumbnail: take the title from the "candytitles" element
            # instead, when the page has one (WSHH candy videos).
            thumbnail = None
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if candy_match is not None:
                title = candy_match.group(1)
        else:
            thumbnail = thumb_match.group(1)

        return [{
            'id': video_id,
            'url' : video_url,
            'title' : title,
            'thumbnail' : thumbnail,
            'ext' : ext,
        }]
3391
class RBMARadioIE(InfoExtractor):
    """Information extractor for shows on rbmaradio.com"""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Build the info dictionary from the show's embedded JSON metadata.

        Returns a one-element list with the info dictionary.
        Raises ExtractorError if the URL is invalid, the metadata is missing
        or not valid JSON, or it carries no stream URL.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)
        m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find metadata')
        json_data = m.group(1)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + compat_str(e))

        akamai_url = data.get('akamai_url')
        if not akamai_url:
            raise ExtractorError(u'Cannot find stream URL in metadata')
        video_url = akamai_url + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]

        # "or {}" also covers a JSON null, which a .get() default would not.
        host = data.get('host') or {}
        image = data.get('image') or {}
        info = {
                'id': video_id,
                'url': video_url,
                'ext': video_ext,
                'title': data['title'],
                'description': data.get('teaser_text'),
                'location': data.get('country_of_origin'),
                'uploader': host.get('name'),
                'uploader_id': host.get('slug'),
                'thumbnail': image.get('large_url_2x'),
                'duration': data.get('duration'),
        }
        return [info]
3426
3427
3428 class YouPornIE(InfoExtractor):
3429     """Information extractor for youporn.com."""
3430     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3431
3432     def _print_formats(self, formats):
3433         """Print all available formats"""
3434         print(u'Available formats:')
3435         print(u'ext\t\tformat')
3436         print(u'---------------------------------')
3437         for format in formats:
3438             print(u'%s\t\t%s'  % (format['ext'], format['format']))
3439
3440     def _specific(self, req_format, formats):
3441         for x in formats:
3442             if(x["format"]==req_format):
3443                 return x
3444         return None
3445
3446     def _real_extract(self, url):
3447         mobj = re.match(self._VALID_URL, url)
3448         if mobj is None:
3449             raise ExtractorError(u'Invalid URL: %s' % url)
3450
3451         video_id = mobj.group('videoid')
3452
3453         req = compat_urllib_request.Request(url)
3454         req.add_header('Cookie', 'age_verified=1')
3455         webpage = self._download_webpage(req, video_id)
3456
3457         # Get the video title
3458         result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3459         if result is None:
3460             raise ExtractorError(u'Unable to extract video title')
3461         video_title = result.group('title').strip()
3462
3463         # Get the video date
3464         result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3465         if result is None:
3466             self._downloader.report_warning(u'unable to extract video date')
3467             upload_date = None
3468         else:
3469             upload_date = unified_strdate(result.group('date').strip())
3470
3471         # Get the video uploader
3472         result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3473         if result is None:
3474             self._downloader.report_warning(u'unable to extract uploader')
3475             video_uploader = None
3476         else:
3477             video_uploader = result.group('uploader').strip()
3478             video_uploader = clean_html( video_uploader )
3479
3480         # Get all of the formats available
3481         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3482         result = re.search(DOWNLOAD_LIST_RE, webpage)
3483         if result is None:
3484             raise ExtractorError(u'Unable to extract download list')
3485         download_list_html = result.group('download_list').strip()
3486
3487         # Get all of the links from the page
3488         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3489         links = re.findall(LINK_RE, download_list_html)
3490         if(len(links) == 0):
3491             raise ExtractorError(u'ERROR: no known formats available for video')
3492
3493         self.to_screen(u'Links found: %d' % len(links))
3494
3495         formats = []
3496         for link in links:
3497
3498             # A link looks like this:
3499             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3500             # A path looks like this:
3501             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3502             video_url = unescapeHTML( link )
3503             path = compat_urllib_parse_urlparse( video_url ).path
3504             extension = os.path.splitext( path )[1][1:]
3505             format = path.split('/')[4].split('_')[:2]
3506             size = format[0]
3507             bitrate = format[1]
3508             format = "-".join( format )
3509             title = u'%s-%s-%s' % (video_title, size, bitrate)
3510
3511             formats.append({
3512                 'id': video_id,
3513                 'url': video_url,
3514                 'uploader': video_uploader,
3515                 'upload_date': upload_date,
3516                 'title': title,
3517                 'ext': extension,
3518                 'format': format,
3519                 'thumbnail': None,
3520                 'description': None,
3521                 'player_url': None
3522             })
3523
3524         if self._downloader.params.get('listformats', None):
3525             self._print_formats(formats)
3526             return
3527
3528         req_format = self._downloader.params.get('format', None)
3529         self.to_screen(u'Format: %s' % req_format)
3530
3531         if req_format is None or req_format == 'best':
3532             return [formats[0]]
3533         elif req_format == 'worst':
3534             return [formats[-1]]
3535         elif req_format in ('-1', 'all'):
3536             return formats
3537         else:
3538             format = self._specific( req_format, formats )
3539             if result is None:
3540                 raise ExtractorError(u'Requested format not available')
3541             return [format]
3542
3543
3544
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Extract the FLV stream described by a single video page.

        The video id and title are taken from the URL itself; the media URL
        and the upload date are scraped from the downloaded page.

        Returns a one-element list holding the info dictionary.
        Raises ExtractorError if the URL does not match _VALID_URL, or if the
        media URL or the upload date cannot be found in the page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL (dots are escaped so they only match literal dots)
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9]\.pornotube\.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the upload date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # This used to report a missing *title*, which was misleading.
            raise ExtractorError(u'Unable to extract upload date')
        upload_date = unified_strdate(result.group('date'))

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3583
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    # The dot before "html" is escaped so that only a literal ".html" matches.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+)\.html$'

    def _real_extract(self, url):
        """Extract the FLV stream of a video page via its embed page.

        The title is read from the video page; the numeric video id and the
        media URL come from the embed page that the video page links to.

        Returns a one-element list holding the info dictionary.
        Raises ExtractorError if the URL does not match _VALID_URL or if the
        title, the embed page link or the media URL cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Provisional id taken from the URL; replaced by the numeric id below
        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the embed page
        result = re.search(r'https?://www\.youjizz\.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # Get the video URL
        result = re.search(r'so\.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = result.group('source')

        info = {'id': video_id,
                'url': video_url,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv',
                'player_url': embed_page_url}

        return [info]
3628
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks mixes (one entry per track)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Walk the play/next API of a mix and collect every track.

        The mix description is read from the "PAGE.mix" JSON embedded in the
        page; the tracks are then requested one at a time until the API
        reports that the last track was reached.

        Returns the list of track info dictionaries.
        Raises ExtractorError on an invalid URL or a missing mix description.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = url_match.group('id')

        webpage = self._download_webpage(url, playlist_id)

        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # Random session token shared by every API request below
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']

        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        tracks = []
        track_number = 0
        while True:
            track_number += 1
            api_json = self._download_webpage(
                next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            tracks.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                return tracks
            # The following track is requested relative to the current one
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3672
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Extract a single keek.

        The media and thumbnail URLs are built directly from the video id;
        the title and the uploader are scraped from the page.

        Returns a one-element list holding the info dictionary.
        Raises ExtractorError if the URL does not match _VALID_URL or if the
        title cannot be found. A missing uploader only produces a warning.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group('videoID')

        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract video title')
        title = unescapeHTML(title_match.group('title'))

        # The uploader is optional metadata, so its absence is not fatal
        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        if uploader_match is None:
            self._downloader.report_warning(u'unable to extract uploader')
            uploader = None
        else:
            uploader = clean_html(uploader_match.group('uploader'))

        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumbnail,
                'uploader': uploader
        }
        return [info]
3696
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    # Verbose pattern: it must always be matched with the re.VERBOSE flag.
    _VALID_URL = r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Extract either a single talk or a whole playlist.

        Returns a one-element list: the talk info dictionary for a talk URL,
        or the playlist result for a playlist URL.
        Raises ExtractorError if the URL does not match _VALID_URL.
        """
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if m.group('type_talk'):
            return [self._talk_info(url)]

        playlist_id = m.group('playlist_id')
        name = m.group('name')
        self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id, name))
        return [self._playlist_videos_info(url, name, playlist_id)]

    def _talk_video_link(self, mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self, url, name, playlist_id=0):
        '''Returns the videos of the playlist

        Every talk found in the playlist page is returned as a URL result
        pointing back at this extractor.
        Raises ExtractorError if the playlist title cannot be found.
        '''
        video_RE = r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE = r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos = re.finditer(video_RE, webpage, re.VERBOSE)
        m_names = re.finditer(video_name_RE, webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        if m_playlist is None:
            raise ExtractorError(u'Unable to extract playlist title')
        playlist_title = m_playlist.group('playlist_title')

        playlist_entries = []
        # The video matches are still zipped in so that, as before, only names
        # that have a corresponding video item produce an entry.
        for _m_video, m_name in zip(m_videos, m_names):
            talk_url = 'http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url

        video_id is only used while downloading the page; the id stored in
        the result is the one read from the page's talkDetails script.
        Raises ExtractorError if the URL does not match _VALID_URL or if the
        title or the talk details cannot be found. The thumbnail is optional
        and is None when it cannot be found.
        """
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        videoName = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)

        # If the url includes the language we get the title translated
        title_RE = r'<span id="altHeadline" >(?P<title>.*)</span>'
        title_match = re.search(title_RE, webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract talk title')
        title = title_match.group('title')

        info_RE = r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE = r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match = re.search(thumb_RE, webpage)
        info_match = re.search(info_RE, webpage, re.VERBOSE)
        if info_match is None:
            raise ExtractorError(u'Unable to extract talk details')
        video_id = info_match.group('videoID')
        mediaSlug = info_match.group('mediaSlug')
        video_url = self._talk_video_link(mediaSlug)

        if thumb_match is None:
            thumbnail = None
        else:
            thumbnail = thumb_match.group('thumbnail')

        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumbnail
                }
        return info
3775
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Extract a video through the site's XML metadata endpoint.

        The video id is the last non-empty path element of the URL; every
        other field is read from the metadata document fetched for that id.

        Returns a one-element list holding the info dictionary.
        Raises ExtractorError if the metadata has no download URL or no title.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text

        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Fall back to the file extension. This branch used to read an
            # undefined name and crashed with NameError.
            video_format = extension
        else:
            video_format = format_id_el.text

        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None

        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
3829
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Extract a video using the page title and the per-video XML file.

        The last entry of the XML document supplies the file name and the
        duration.

        Returns a one-element list holding the info dictionary.
        Raises ExtractorError if the title cannot be found in the page.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if title_match is None:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        xml_code = self._download_webpage(
            u'http://video2.spiegel.de/flash/' + video_id + u'.xml',
            video_id,
            note=u'Downloading XML',
            errnote=u'Failed to download XML')

        # Only the last child of the document root is used
        variant = xml.etree.ElementTree.fromstring(xml_code)[-1]
        filename = variant.findall('./filename')[0].text
        duration = float(variant.findall('./duration')[0].text)

        _, _, video_ext = filename.rpartition('.')
        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }]
3862
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    # "https?" replaces the former "http?", which made the *p* optional
    # (accepting "htt://") and never matched https URLs.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>\w+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Extract a single video from its view page.

        Returns a one-element list holding the info dictionary.
        Raises ExtractorError if the URL does not match _VALID_URL or if the
        media URL or the title cannot be found. The description and the
        uploader are optional and are None when they cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            raise ExtractorError(u'Unable to find video url')
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            raise ExtractorError(u'Cannot find video title')
        # Drop the site name that the page puts in front of the title
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
3909
class ARDIE(InfoExtractor):
    """Information extractor for the ARD / Das Erste media libraries."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        """Extract the highest-quality stream of media type 0 from the page.

        Returns a one-element list holding the info dictionary; RTMP streams
        additionally carry a "play_path" entry.
        Raises ExtractorError if the page lists no media streams.
        """
        # Prefer the numeric documentId parameter over the last path element
        url_match = re.match(self._VALID_URL, url)
        numid = re.search(r'documentId=([0-9]+)', url)
        video_id = numid.group(1) if numid else url_match.group('video_id')

        # Title and media streams both come from the web page
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [found.groupdict()
                   for found in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # For now: default media type only, best quality among those
        default_streams = [s for s in streams if int(s["media_type"]) == 0]
        stream = max(default_streams, key=lambda s: int(s["quality"]))

        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if not stream['rtmp_url']:
            # Plain HTTP download
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
            return [info]

        # RTMP stream: the video URL is used as the play path
        self.to_screen(u'RTMP download detected')
        assert stream['video_url'].startswith('mp4:')
        info["url"] = stream["rtmp_url"]
        info["play_path"] = stream['video_url']
        return [info]
3948
class TumblrIE(InfoExtractor):
    """Information extractor for videos hosted on tumblr.com blogs."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Extract the video embedded in a post.

        Returns a one-element list holding the info dictionary, or an empty
        list when the post contains no video.
        Raises ExtractorError if the URL does not match _VALID_URL or if the
        page title cannot be found. The thumbnail is optional and is None
        when no poster can be found.
        """
        m_url = re.match(self._VALID_URL, url)
        if m_url is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The blog name comes from the URL and may contain regex
        # metacharacters, so it is escaped before going into the pattern.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (re.escape(blog), video_id)
        video = re.search(re_video, webpage)
        if video is None:
            self.to_screen("No video founded")
            return []
        video_url = video.group('video_url')
        ext = video.group('ext')

        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'  # We pick the first poster
        m_thumb = re.search(re_thumb, webpage)
        if m_thumb is None:
            thumb = None
        else:
            thumb = m_thumb.group('thumb').replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        re_title = r'<title>(?P<title>.*?)</title>'
        m_title = re.search(re_title, webpage, re.DOTALL)
        if m_title is None:
            raise ExtractorError(u'Unable to extract title')
        title = unescapeHTML(m_title.group('title'))

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'thumbnail': thumb,
                 'ext': ext
                 }]
3982
class BandcampIE(InfoExtractor):
    """Information extractor for freely downloadable bandcamp.com tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Extract the mp3-320 download of a free track.

        Returns a one-element list holding the track info dictionary.
        Raises ExtractorError if the page offers no free download.
        """
        title = re.match(self._VALID_URL, url).group('title')
        webpage = self._download_webpage(url, title)

        # Link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs founded')
        download_link = m_download.group(1)

        m_track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                               webpage, re.MULTILINE|re.DOTALL)
        track_id = m_track_id.group('id')

        download_webpage = self._download_webpage(
            download_link, track_id, 'Downloading free downloads page')

        # The track dictionary is embedded in some javascript code
        items_json = re.search(r'items: (.*?),$',
                               download_webpage, re.MULTILINE).group(1)
        track = json.loads(items_json)[0]

        # mp3-320 is picked for now, until format selection can be easily
        # implemented. Using this url directly says the link has expired.
        initial_url = track[u'downloads'][u'mp3-320'][u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)

        # Build the url used to request the final track url; Bandcamp builds
        # it in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')

        # If the .rand field could be generated correctly the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        return [{
            'id': track_id,
            'title': track[u'title'],
            'ext': 'mp3',
            'url': final_url,
            'thumbnail': track[u'thumb_url'],
            'uploader': track[u'artist'],
        }]
4028
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Extract the MP4 source and the title from a video page.

        Returns a one-element list holding the info dictionary.
        Raises ExtractorError if the URL does not match _VALID_URL or if the
        media URL or the title cannot be found.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group('id')

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        source_match = re.search(r'<source src="(.+)" type="video/mp4">', webpage)
        if source_match is None:
            raise ExtractorError(u'Unable to extract media URL')

        title_match = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')

        return [{
            'id':       video_id,
            'url':      source_match.group(1),
            'ext':      'mp4',
            'title':    title_match.group(1),
        }]
4059
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        """Extract a video through its MRSS notice.

        Returns a one-element list holding the info dictionary.
        Raises ExtractorError if the media URL or the title cannot be found
        in the notice.
        """
        video_id = re.match(self._VALID_URL, url).group('id')
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        notice = self._download_webpage(mrss_url, video_id)

        media_match = re.search(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', notice)
        if media_match is None:
            raise ExtractorError(u'Unable to extract media URL')
        video_url = media_match.group('mp4url')

        title_match = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', notice)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = title_match.group('titre')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
        }]
4088
def gen_extractors():
    """Build one fresh instance of each supported extractor.

    The sequence is significant: the first extractor that matches a URL is
    the one that handles it, so the order below must be preserved.
    """
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        GenericIE,
    )
    return [extractor_class() for extractor_class in extractor_classes]
4148
def get_info_extractor(ie_name):
    """Look up an info extractor class by name.

    ie_name is the class name without its "IE" suffix; the class is looked
    up among this module's globals, so an unknown name raises KeyError.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]