]>
Raphaël G. Git Repositories - youtubedl/blob - youtube-dl
   2 # -*- coding: utf-8 -*- 
   3 # Author: Ricardo Garcia Gonzalez 
   4 # Author: Danny Colligan 
   5 # Author: Benjamin Johnson 
   6 # License: Public domain code 
  24 # parse_qs was moved from the cgi module to the urlparse module recently. 
  26         from urlparse 
import parse_qs
 
  28         from cgi 
import parse_qs
 
  31         'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.11) Gecko/20101019 Firefox/3.6.11', 
  32         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 
  33         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 
  34         'Accept-Language': 'en-us,en;q=0.5', 
# Unicode set of characters considered "safe" for simplified titles:
# ASCII letters plus digits. The .decode('ascii') calls turn the Python 2
# byte strings from the string module into unicode objects so that later
# title processing stays entirely in unicode.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
  39 def preferredencoding(): 
  40         """Get preferred encoding. 
  42         Returns the best encoding scheme for the system, based on 
  43         locale.getpreferredencoding() and some further tweaks. 
  45         def yield_preferredencoding(): 
  47                         pref 
= locale
.getpreferredencoding() 
  53         return yield_preferredencoding().next() 
  55 def htmlentity_transform(matchobj
): 
  56         """Transforms an HTML entity to a Unicode character. 
  58         This function receives a match object and is intended to be used with 
  59         the re.sub() function. 
  61         entity 
= matchobj
.group(1) 
  63         # Known non-numeric HTML entity 
  64         if entity 
in htmlentitydefs
.name2codepoint
: 
  65                 return unichr(htmlentitydefs
.name2codepoint
[entity
]) 
  68         mobj 
= re
.match(ur
'(?u)#(x?\d+)', entity
) 
  70                 numstr 
= mobj
.group(1) 
  71                 if numstr
.startswith(u
'x'): 
  73                         numstr 
= u
'0%s' % numstr
 
  76                 return unichr(long(numstr
, base
)) 
  78         # Unknown entity in name, return its literal representation 
  79         return (u
'&%s;' % entity
) 
  81 def sanitize_title(utitle
): 
  82         """Sanitizes a video title so it could be used as part of a filename.""" 
  83         utitle 
= re
.sub(ur
'(?u)&(.+?);', htmlentity_transform
, utitle
) 
  84         return utitle
.replace(unicode(os
.sep
), u
'%') 
  86 def sanitize_open(filename
, open_mode
): 
  87         """Try to open the given filename, and slightly tweak it if this fails. 
  89         Attempts to open the given filename. If this fails, it tries to change 
  90         the filename slightly, step by step, until it's either able to open it 
  91         or it fails and raises a final exception, like the standard open() 
  94         It returns the tuple (stream, definitive_file_name). 
  98                         if sys
.platform 
== 'win32': 
 100                                 msvcrt
.setmode(sys
.stdout
.fileno(), os
.O_BINARY
) 
 101                         return (sys
.stdout
, filename
) 
 102                 stream 
= open(filename
, open_mode
) 
 103                 return (stream
, filename
) 
 104         except (IOError, OSError), err
: 
 105                 # In case of error, try to remove win32 forbidden chars 
 106                 filename 
= re
.sub(ur
'[/<>:"\|\?\*]', u
'#', filename
) 
 108                 # An exception here should be caught in the caller 
 109                 stream 
= open(filename
, open_mode
) 
 110                 return (stream
, filename
) 
 113 class DownloadError(Exception): 
 114         """Download Error exception. 
 116         This exception may be thrown by FileDownloader objects if they are not 
 117         configured to continue on errors. They will contain the appropriate 
 122 class SameFileError(Exception): 
 123         """Same File exception. 
 125         This exception will be thrown by FileDownloader objects if they detect 
 126         multiple files would have to be downloaded to the same file on disk. 
 130 class PostProcessingError(Exception): 
 131         """Post Processing exception. 
 133         This exception may be raised by PostProcessor's .run() method to 
 134         indicate an error in the postprocessing task. 
 138 class UnavailableVideoError(Exception): 
 139         """Unavailable Format exception. 
 141         This exception will be thrown when a video is requested 
 142         in a format that is not available for that video. 
 146 class ContentTooShortError(Exception): 
 147         """Content Too Short exception. 
 149         This exception may be raised by FileDownloader objects when a file they 
 150         download is too small for what the server announced first, indicating 
 151         the connection was probably interrupted. 
	def __init__(self, downloaded, expected):
		# downloaded: number of bytes actually received.
		# expected: number of bytes announced by the server
		# (Content-Length). Both are kept so the caller can report how
		# short the transfer was.
		self.downloaded = downloaded
		self.expected = expected
 161 class FileDownloader(object): 
 162         """File Downloader class. 
 164         File downloader objects are the ones responsible of downloading the 
 165         actual video file and writing it to disk if the user has requested 
 166         it, among some other tasks. In most cases there should be one per 
 167         program. As, given a video URL, the downloader doesn't know how to 
 168         extract all the needed information, task that InfoExtractors do, it 
 169         has to pass the URL to one of them. 
 171         For this, file downloader objects have a method that allows 
 172         InfoExtractors to be registered in a given order. When it is passed 
 173         a URL, the file downloader handles it to the first InfoExtractor it 
 174         finds that reports being able to handle it. The InfoExtractor extracts 
 175         all the information about the video or videos the URL refers to, and 
 176         asks the FileDownloader to process the video information, possibly 
 177         downloading the video. 
 179         File downloaders accept a lot of parameters. In order not to saturate 
 180         the object constructor with arguments, it receives a dictionary of 
 181         options instead. These options are available through the params 
 182         attribute for the InfoExtractors to use. The FileDownloader also 
 183         registers itself as the downloader in charge for the InfoExtractors 
 184         that are added to it, so this is a "mutual registration". 
 188         username:         Username for authentication purposes. 
 189         password:         Password for authentication purposes. 
 190         usenetrc:         Use netrc for authentication instead. 
 191         quiet:            Do not print messages to stdout. 
 192         forceurl:         Force printing final URL. 
 193         forcetitle:       Force printing title. 
 194         forcethumbnail:   Force printing thumbnail URL. 
 195         forcedescription: Force printing description. 
 196         simulate:         Do not download the video files. 
 197         format:           Video format code. 
 198         format_limit:     Highest quality format to try. 
 199         outtmpl:          Template for output names. 
 200         ignoreerrors:     Do not stop on download errors. 
 201         ratelimit:        Download speed limit, in bytes/sec. 
 202         nooverwrites:     Prevent overwriting files. 
 203         retries:          Number of times to retry for HTTP error 5xx 
 204         continuedl:       Try to continue downloads if possible. 
 205         noprogress:       Do not print the progress bar. 
 206         playliststart:    Playlist item to start at. 
 207         logtostderr:      Log messages to stderr instead of stdout. 
	# Exit status of the whole download() run: reset to 0 in __init__ and
	# set to 1 by trouble() when an error is ignored.
	_download_retcode = None
	# Ordinal of the current download, exposed to the output template as
	# the 'ord' field; bumped by increment_downloads().
	_num_downloads = None
 217         def __init__(self
, params
): 
 218                 """Create a FileDownloader object with the given options.""" 
 221                 self
._download
_retcode 
= 0 
 222                 self
._num
_downloads 
= 0 
 223                 self
._screen
_file 
= [sys
.stdout
, sys
.stderr
][params
.get('logtostderr', False)] 
 227         def pmkdir(filename
): 
 228                 """Create directory components in filename. Similar to Unix "mkdir -p".""" 
 229                 components 
= filename
.split(os
.sep
) 
 230                 aggregate 
= [os
.sep
.join(components
[0:x
]) for x 
in xrange(1, len(components
))] 
 231                 aggregate 
= ['%s%s' % (x
, os
.sep
) for x 
in aggregate
] # Finish names with separator 
 232                 for dir in aggregate
: 
 233                         if not os
.path
.exists(dir): 
 237         def format_bytes(bytes): 
 240                 if type(bytes) is str: 
 245                         exponent 
= long(math
.log(bytes, 1024.0)) 
 246                 suffix 
= 'bkMGTPEZY'[exponent
] 
 247                 converted 
= float(bytes) / float(1024**exponent
) 
 248                 return '%.2f%s' % (converted
, suffix
) 
 251         def calc_percent(byte_counter
, data_len
): 
 254                 return '%6s' % ('%3.1f%%' % (float(byte_counter
) / float(data_len
) * 100.0)) 
 257         def calc_eta(start
, now
, total
, current
): 
 261                 if current 
== 0 or dif 
< 0.001: # One millisecond 
 263                 rate 
= float(current
) / dif
 
 264                 eta 
= long((float(total
) - float(current
)) / rate
) 
 265                 (eta_mins
, eta_secs
) = divmod(eta
, 60) 
 268                 return '%02d:%02d' % (eta_mins
, eta_secs
) 
 271         def calc_speed(start
, now
, bytes): 
 273                 if bytes == 0 or dif 
< 0.001: # One millisecond 
 274                         return '%10s' % '---b/s' 
 275                 return '%10s' % ('%s/s' % FileDownloader
.format_bytes(float(bytes) / dif
)) 
 278         def best_block_size(elapsed_time
, bytes): 
 279                 new_min 
= max(bytes / 2.0, 1.0) 
 280                 new_max 
= min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB 
 281                 if elapsed_time 
< 0.001: 
 283                 rate 
= bytes / elapsed_time
 
 291         def parse_bytes(bytestr
): 
 292                 """Parse a string indicating a byte quantity into a long integer.""" 
 293                 matchobj 
= re
.match(r
'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr
) 
 296                 number 
= float(matchobj
.group(1)) 
 297                 multiplier 
= 1024.0 ** 'bkmgtpezy'.index(matchobj
.group(2).lower()) 
 298                 return long(round(number 
* multiplier
)) 
 300         def add_info_extractor(self
, ie
): 
 301                 """Add an InfoExtractor object to the end of the list.""" 
 303                 ie
.set_downloader(self
) 
 305         def add_post_processor(self
, pp
): 
 306                 """Add a PostProcessor object to the end of the chain.""" 
 308                 pp
.set_downloader(self
) 
 310         def to_screen(self
, message
, skip_eol
=False, ignore_encoding_errors
=False): 
 311                 """Print message to stdout if not in quiet mode.""" 
 313                         if not self
.params
.get('quiet', False): 
 314                                 terminator 
= [u
'\n', u
''][skip_eol
] 
 315                                 print >>self
._screen
_file
, (u
'%s%s' % (message
, terminator
)).encode(preferredencoding()), 
 316                         self
._screen
_file
.flush() 
 317                 except (UnicodeEncodeError), err
: 
 318                         if not ignore_encoding_errors
: 
	def to_stderr(self, message):
		"""Print message to stderr."""
		# Encode with the locale's preferred encoding so non-ASCII
		# messages do not blow up on terminals with a narrow charset.
		print >>sys.stderr, message.encode(preferredencoding())
	def fixed_template(self):
		"""Checks if the output template is fixed.

		Returns True when the 'outtmpl' parameter contains no %(...)s
		substitution fields, i.e. every download would be written to the
		same literal filename (used to refuse multi-URL downloads).
		"""
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		"""
		if message is not None:
			self.to_stderr(message)
		if self.params.get('ignoreerrors', False):
			# Errors are being ignored: record the failure in the exit
			# code instead of aborting the run.
			self._download_retcode = 1
			return
		raise DownloadError(message)
 342         def slow_down(self
, start_time
, byte_counter
): 
 343                 """Sleep if the download speed is over the rate limit.""" 
 344                 rate_limit 
= self
.params
.get('ratelimit', None) 
 345                 if rate_limit 
is None or byte_counter 
== 0: 
 348                 elapsed 
= now 
- start_time
 
 351                 speed 
= float(byte_counter
) / elapsed
 
 352                 if speed 
> rate_limit
: 
 353                         time
.sleep((byte_counter 
- rate_limit 
* (now 
- start_time
)) / rate_limit
) 
	def report_destination(self, filename):
		"""Report destination filename."""
		# Encoding problems in exotic filenames must never abort the
		# download, hence ignore_encoding_errors.
		message = u'[download] Destination: %s' % filename
		self.to_screen(message, ignore_encoding_errors=True)
 359         def report_progress(self
, percent_str
, data_len_str
, speed_str
, eta_str
): 
 360                 """Report download progress.""" 
 361                 if self
.params
.get('noprogress', False): 
 363                 self
.to_screen(u
'\r[download] %s of %s at %s ETA %s' % 
 364                                 (percent_str
, data_len_str
, speed_str
, eta_str
), skip_eol
=True) 
	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte.

		resume_len: size in bytes of the partially downloaded file,
		used as the starting offset of the HTTP Range request.
		"""
		self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
	def report_retry(self, count, retries):
		"""Report retry in case of HTTP error 5xx"""
		message = u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries)
		self.to_screen(message)
 374         def report_file_already_downloaded(self
, file_name
): 
 375                 """Report file has already been fully downloaded.""" 
 377                         self
.to_screen(u
'[download] %s has already been downloaded' % file_name
) 
 378                 except (UnicodeEncodeError), err
: 
 379                         self
.to_screen(u
'[download] The file has already been downloaded') 
	def report_unable_to_resume(self):
		"""Report it was impossible to resume download.

		Emitted when the server rejects the Range request and the
		download has to restart from the beginning.
		"""
		self.to_screen(u'[download] Unable to resume')
 385         def report_finish(self
): 
 386                 """Report download finished.""" 
 387                 if self
.params
.get('noprogress', False): 
 388                         self
.to_screen(u
'[download] Download completed') 
	def increment_downloads(self):
		"""Increment the ordinal that assigns a number to each file."""
		# The counter feeds the 'ord' field of the output template.
		self._num_downloads = self._num_downloads + 1
 396         def process_info(self
, info_dict
): 
 397                 """Process a single dictionary returned by an InfoExtractor.""" 
 398                 # Do nothing else if in simulate mode 
 399                 if self
.params
.get('simulate', False): 
 401                         if self
.params
.get('forcetitle', False): 
 402                                 print info_dict
['title'].encode(preferredencoding(), 'xmlcharrefreplace') 
 403                         if self
.params
.get('forceurl', False): 
 404                                 print info_dict
['url'].encode(preferredencoding(), 'xmlcharrefreplace') 
 405                         if self
.params
.get('forcethumbnail', False) and 'thumbnail' in info_dict
: 
 406                                 print info_dict
['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace') 
 407                         if self
.params
.get('forcedescription', False) and 'description' in info_dict
: 
 408                                 print info_dict
['description'].encode(preferredencoding(), 'xmlcharrefreplace') 
 413                         template_dict 
= dict(info_dict
) 
 414                         template_dict
['epoch'] = unicode(long(time
.time())) 
 415                         template_dict
['ord'] = unicode('%05d' % self
._num
_downloads
) 
 416                         filename 
= self
.params
['outtmpl'] % template_dict
 
 417                 except (ValueError, KeyError), err
: 
 418                         self
.trouble(u
'ERROR: invalid system charset or erroneous output template') 
 420                 if self
.params
.get('nooverwrites', False) and os
.path
.exists(filename
): 
 421                         self
.to_stderr(u
'WARNING: file exists and will be skipped') 
 425                         self
.pmkdir(filename
) 
 426                 except (OSError, IOError), err
: 
 427                         self
.trouble(u
'ERROR: unable to create directories: %s' % str(err
)) 
 431                         success 
= self
._do
_download
(filename
, info_dict
['url'].encode('utf-8'), info_dict
.get('player_url', None)) 
 432                 except (OSError, IOError), err
: 
 433                         raise UnavailableVideoError
 
 434                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
 435                         self
.trouble(u
'ERROR: unable to download video data: %s' % str(err
)) 
 437                 except (ContentTooShortError
, ), err
: 
 438                         self
.trouble(u
'ERROR: content too short (expected %s bytes and served %s)' % (err
.expected
, err
.downloaded
)) 
 443                                 self
.post_process(filename
, info_dict
) 
 444                         except (PostProcessingError
), err
: 
 445                                 self
.trouble(u
'ERROR: postprocessing: %s' % str(err
)) 
 448         def download(self
, url_list
): 
 449                 """Download a given list of URLs.""" 
 450                 if len(url_list
) > 1 and self
.fixed_template(): 
 451                         raise SameFileError(self
.params
['outtmpl']) 
 454                         suitable_found 
= False 
 456                                 # Go to next InfoExtractor if not suitable 
 457                                 if not ie
.suitable(url
): 
 460                                 # Suitable InfoExtractor found 
 461                                 suitable_found 
= True 
 463                                 # Extract information from URL and process it 
 466                                 # Suitable InfoExtractor had been found; go to next URL 
 469                         if not suitable_found
: 
 470                                 self
.trouble(u
'ERROR: no suitable InfoExtractor: %s' % url
) 
 472                 return self
._download
_retcode
 
 474         def post_process(self
, filename
, ie_info
): 
 475                 """Run the postprocessing chain on the given file.""" 
 477                 info
['filepath'] = filename
 
 483         def _download_with_rtmpdump(self
, filename
, url
, player_url
): 
 484                 self
.report_destination(filename
) 
 486                 # Check for rtmpdump first 
 488                         subprocess
.call(['rtmpdump', '-h'], stdout
=(file(os
.path
.devnull
, 'w')), stderr
=subprocess
.STDOUT
) 
 489                 except (OSError, IOError): 
 490                         self
.trouble(u
'ERROR: RTMP download detected but "rtmpdump" could not be run') 
 493                 # Download using rtmpdump. rtmpdump returns exit code 2 when 
 494                 # the connection was interrumpted and resuming appears to be 
 495                 # possible. This is part of rtmpdump's normal usage, AFAIK. 
 496                 basic_args 
= ['rtmpdump', '-q'] + [[], ['-W', player_url
]][player_url 
is not None] + ['-r', url
, '-o', filename
] 
 497                 retval 
= subprocess
.call(basic_args 
+ [[], ['-e', '-k', '1']][self
.params
.get('continuedl', False)]) 
 498                 while retval 
== 2 or retval 
== 1: 
 499                         prevsize 
= os
.path
.getsize(filename
) 
 500                         self
.to_screen(u
'\r[rtmpdump] %s bytes' % prevsize
, skip_eol
=True) 
 501                         time
.sleep(5.0) # This seems to be needed 
 502                         retval 
= subprocess
.call(basic_args 
+ ['-e'] + [[], ['-k', '1']][retval 
== 1]) 
 503                         cursize 
= os
.path
.getsize(filename
) 
 504                         if prevsize 
== cursize 
and retval 
== 1: 
 507                         self
.to_screen(u
'\r[rtmpdump] %s bytes' % os
.path
.getsize(filename
)) 
 510                         self
.trouble(u
'\nERROR: rtmpdump exited with code %d' % retval
) 
 513         def _do_download(self
, filename
, url
, player_url
): 
 514                 # Attempt to download using rtmpdump 
 515                 if url
.startswith('rtmp'): 
 516                         return self
._download
_with
_rtmpdump
(filename
, url
, player_url
) 
 520                 basic_request 
= urllib2
.Request(url
, None, std_headers
) 
 521                 request 
= urllib2
.Request(url
, None, std_headers
) 
 523                 # Establish possible resume length 
 524                 if os
.path
.isfile(filename
): 
 525                         resume_len 
= os
.path
.getsize(filename
) 
 529                 # Request parameters in case of being able to resume 
 530                 if self
.params
.get('continuedl', False) and resume_len 
!= 0: 
 531                         self
.report_resuming_byte(resume_len
) 
 532                         request
.add_header('Range','bytes=%d-' % resume_len
) 
 536                 retries 
= self
.params
.get('retries', 0) 
 537                 while count 
<= retries
: 
 538                         # Establish connection 
 540                                 data 
= urllib2
.urlopen(request
) 
 542                         except (urllib2
.HTTPError
, ), err
: 
 543                                 if (err
.code 
< 500 or err
.code 
>= 600) and err
.code 
!= 416: 
 544                                         # Unexpected HTTP error 
 546                                 elif err
.code 
== 416: 
 547                                         # Unable to resume (requested range not satisfiable) 
 549                                                 # Open the connection again without the range header 
 550                                                 data 
= urllib2
.urlopen(basic_request
) 
 551                                                 content_length 
= data
.info()['Content-Length'] 
 552                                         except (urllib2
.HTTPError
, ), err
: 
 553                                                 if err
.code 
< 500 or err
.code 
>= 600: 
 556                                                 # Examine the reported length 
 557                                                 if (content_length 
is not None and 
 558                                                     (resume_len 
- 100 < long(content_length
) < resume_len 
+ 100)): 
 559                                                         # The file had already been fully downloaded. 
 560                                                         # Explanation to the above condition: in issue #175 it was revealed that 
 561                                                         # YouTube sometimes adds or removes a few bytes from the end of the file, 
 562                                                         # changing the file size slightly and causing problems for some users. So 
 563                                                         # I decided to implement a suggested change and consider the file 
 564                                                         # completely downloaded if the file size differs less than 100 bytes from 
 565                                                         # the one in the hard drive. 
 566                                                         self
.report_file_already_downloaded(filename
) 
 569                                                         # The length does not match, we start the download over 
 570                                                         self
.report_unable_to_resume() 
 576                                 self
.report_retry(count
, retries
) 
 579                         self
.trouble(u
'ERROR: giving up after %s retries' % retries
) 
 582                 data_len 
= data
.info().get('Content-length', None) 
 583                 data_len_str 
= self
.format_bytes(data_len
) 
 590                         data_block 
= data
.read(block_size
) 
 592                         data_block_len 
= len(data_block
) 
 593                         if data_block_len 
== 0: 
 595                         byte_counter 
+= data_block_len
 
 597                         # Open file just in time 
 600                                         (stream
, filename
) = sanitize_open(filename
, open_mode
) 
 601                                         self
.report_destination(filename
) 
 602                                 except (OSError, IOError), err
: 
 603                                         self
.trouble(u
'ERROR: unable to open for writing: %s' % str(err
)) 
 606                                 stream
.write(data_block
) 
 607                         except (IOError, OSError), err
: 
 608                                 self
.trouble(u
'\nERROR: unable to write data: %s' % str(err
)) 
 610                         block_size 
= self
.best_block_size(after 
- before
, data_block_len
) 
 613                         percent_str 
= self
.calc_percent(byte_counter
, data_len
) 
 614                         eta_str 
= self
.calc_eta(start
, time
.time(), data_len
, byte_counter
) 
 615                         speed_str 
= self
.calc_speed(start
, time
.time(), byte_counter
) 
 616                         self
.report_progress(percent_str
, data_len_str
, speed_str
, eta_str
) 
 619                         self
.slow_down(start
, byte_counter
) 
 622                 if data_len 
is not None and str(byte_counter
) != data_len
: 
 623                         raise ContentTooShortError(byte_counter
, long(data_len
)) 
 626 class InfoExtractor(object): 
 627         """Information Extractor class. 
 629         Information extractors are the classes that, given a URL, extract 
 630         information from the video (or videos) the URL refers to. This 
 631         information includes the real video URL, the video title and simplified 
 632         title, author and others. The information is stored in a dictionary 
 633         which is then passed to the FileDownloader. The FileDownloader 
 634         processes this information possibly downloading the video to the file 
 635         system, among other possible outcomes. The dictionaries must include 
 636         the following fields: 
 638         id:             Video identifier. 
 639         url:            Final video URL. 
 640         uploader:       Nickname of the video uploader. 
 641         title:          Literal title. 
 642         stitle:         Simplified title. 
 643         ext:            Video filename extension. 
 644         format:         Video format. 
 645         player_url:     SWF Player URL (may be None). 
 647         The following fields are optional. Their primary purpose is to allow 
 648         youtube-dl to serve as the backend for a video search function, such 
 649         as the one in youtube2mp3.  They are only used when their respective 
 650         forced printing functions are called: 
 652         thumbnail:      Full URL to a video thumbnail image. 
 653         description:    One-line video description. 
 655         Subclasses of this one should re-define the _real_initialize() and 
 656         _real_extract() methods, as well as the suitable() static method. 
 657         Probably, they should also be instantiated and added to the main 
 664         def __init__(self
, downloader
=None): 
 665                 """Constructor. Receives an optional downloader.""" 
 667                 self
.set_downloader(downloader
) 
 671                 """Receives a URL and returns True if suitable for this IE.""" 
 674         def initialize(self
): 
 675                 """Initializes an instance (authentication, etc).""" 
 677                         self
._real
_initialize
() 
 680         def extract(self
, url
): 
 681                 """Extracts URL information and returns it in list of dicts.""" 
 683                 return self
._real
_extract
(url
) 
	def set_downloader(self, downloader):
		"""Sets the downloader for this IE.

		Called by FileDownloader.add_info_extractor() as part of the
		mutual registration between the downloader and its extractors.
		"""
		self._downloader = downloader
 689         def _real_initialize(self
): 
 690                 """Real initialization process. Redefine in subclasses.""" 
 693         def _real_extract(self
, url
): 
 694                 """Real extraction process. Redefine in subclasses.""" 
 697 class YoutubeIE(InfoExtractor
): 
 698         """Information extractor for youtube.com.""" 
 700         _VALID_URL 
= r
'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$' 
 701         _LANG_URL 
= r
'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' 
 702         _LOGIN_URL 
= 'https://www.youtube.com/signup?next=/&gl=US&hl=en' 
 703         _AGE_URL 
= 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' 
 704         _NETRC_MACHINE 
= 'youtube' 
 705         # Listed in order of quality 
 706         _available_formats 
= ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13'] 
 707         _video_extensions 
= { 
 713                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever 
 720                 return (re
.match(YoutubeIE
._VALID
_URL
, url
) is not None) 
	def report_lang(self):
		"""Report attempt to set language."""
		# Routed through the shared downloader so quiet/stderr settings apply.
		self._downloader.to_screen(u'[youtube] Setting language')
	def report_login(self):
		"""Report attempt to log in."""
		# Only an attempt notice; success/failure is reported separately.
		self._downloader.to_screen(u'[youtube] Logging in')
	def report_age_confirmation(self):
		"""Report attempt to confirm age.

		Printed before posting the age-verification form for
		age-restricted videos.
		"""
		self._downloader.to_screen(u'[youtube] Confirming age')
	def report_video_webpage_download(self, video_id):
		"""Report attempt to download video webpage.

		video_id: the YouTube video identifier being processed.
		"""
		self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage.

		video_id: the YouTube video identifier being processed.
		"""
		self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information.

		video_id: the YouTube video identifier being processed.
		"""
		self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
	def report_unavailable_format(self, video_id, format):
		"""Report that the requested format is not available for the video.

		(The original docstring, "Report extracted video URL", was a
		copy-paste error: this method reports format unavailability.)
		"""
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol.

		RTMP URLs are handed off to the external rtmpdump tool by
		FileDownloader._download_with_rtmpdump().
		"""
		self._downloader.to_screen(u'[youtube] RTMP download detected')
 754         def _real_initialize(self
): 
 755                 if self
._downloader 
is None: 
 760                 downloader_params 
= self
._downloader
.params
 
 762                 # Attempt to use provided username and password or .netrc data 
 763                 if downloader_params
.get('username', None) is not None: 
 764                         username 
= downloader_params
['username'] 
 765                         password 
= downloader_params
['password'] 
 766                 elif downloader_params
.get('usenetrc', False): 
 768                                 info 
= netrc
.netrc().authenticators(self
._NETRC
_MACHINE
) 
 773                                         raise netrc
.NetrcParseError('No authenticators for %s' % self
._NETRC
_MACHINE
) 
 774                         except (IOError, netrc
.NetrcParseError
), err
: 
 775                                 self
._downloader
.to_stderr(u
'WARNING: parsing .netrc: %s' % str(err
)) 
 779                 request 
= urllib2
.Request(self
._LANG
_URL
, None, std_headers
) 
 782                         urllib2
.urlopen(request
).read() 
 783                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
 784                         self
._downloader
.to_stderr(u
'WARNING: unable to set language: %s' % str(err
)) 
 787                 # No authentication to be performed 
 793                                 'current_form': 'loginForm', 
 795                                 'action_login': 'Log In', 
 796                                 'username':     username
, 
 797                                 'password':     password
, 
 799                 request 
= urllib2
.Request(self
._LOGIN
_URL
, urllib
.urlencode(login_form
), std_headers
) 
 802                         login_results 
= urllib2
.urlopen(request
).read() 
 803                         if re
.search(r
'(?i)<form[^>]* name="loginForm"', login_results
) is not None: 
 804                                 self
._downloader
.to_stderr(u
'WARNING: unable to log in: bad username or password') 
 806                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
 807                         self
._downloader
.to_stderr(u
'WARNING: unable to log in: %s' % str(err
)) 
 813                                 'action_confirm':       'Confirm', 
 815                 request 
= urllib2
.Request(self
._AGE
_URL
, urllib
.urlencode(age_form
), std_headers
) 
 817                         self
.report_age_confirmation() 
 818                         age_results 
= urllib2
.urlopen(request
).read() 
 819                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
 820                         self
._downloader
.trouble(u
'ERROR: unable to confirm age: %s' % str(err
)) 
 823         def _real_extract(self
, url
): 
 824                 # Extract video id from URL 
 825                 mobj 
= re
.match(self
._VALID
_URL
, url
) 
 827                         self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
) 
 829                 video_id 
= mobj
.group(2) 
 832                 self
.report_video_webpage_download(video_id
) 
 833                 request 
= urllib2
.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
, None, std_headers
) 
 835                         video_webpage 
= urllib2
.urlopen(request
).read() 
 836                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
 837                         self
._downloader
.trouble(u
'ERROR: unable to download video webpage: %s' % str(err
)) 
 840                 # Attempt to extract SWF player URL 
 841                 mobj 
= re
.search(r
'swfConfig.*"(http://.*?watch.*?-.*?\.swf)"', video_webpage
) 
 843                         player_url 
= mobj
.group(1) 
 848                 self
.report_video_info_webpage_download(video_id
) 
 849                 for el_type 
in ['&el=embedded', '&el=detailpage', '&el=vevo', '']: 
 850                         video_info_url 
= ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' 
 851                                            % (video_id
, el_type
)) 
 852                         request 
= urllib2
.Request(video_info_url
, None, std_headers
) 
 854                                 video_info_webpage 
= urllib2
.urlopen(request
).read() 
 855                                 video_info 
= parse_qs(video_info_webpage
) 
 856                                 if 'token' in video_info
: 
 858                         except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
 859                                 self
._downloader
.trouble(u
'ERROR: unable to download video info webpage: %s' % str(err
)) 
 861                 if 'token' not in video_info
: 
 862                         if 'reason' in video_info
: 
 863                                 self
._downloader
.trouble(u
'ERROR: YouTube said: %s' % video_info
['reason'][0].decode('utf-8')) 
 865                                 self
._downloader
.trouble(u
'ERROR: "token" parameter not in video info for unknown reason') 
 868                 # Start extracting information 
 869                 self
.report_information_extraction(video_id
) 
 872                 if 'author' not in video_info
: 
 873                         self
._downloader
.trouble(u
'ERROR: unable to extract uploader nickname') 
 875                 video_uploader 
= urllib
.unquote_plus(video_info
['author'][0]) 
 878                 if 'title' not in video_info
: 
 879                         self
._downloader
.trouble(u
'ERROR: unable to extract video title') 
 881                 video_title 
= urllib
.unquote_plus(video_info
['title'][0]) 
 882                 video_title 
= video_title
.decode('utf-8') 
 883                 video_title 
= sanitize_title(video_title
) 
 886                 simple_title 
= re
.sub(ur
'(?u)([^%s]+)' % simple_title_chars
, ur
'_', video_title
) 
 887                 simple_title 
= simple_title
.strip(ur
'_') 
 890                 if 'thumbnail_url' not in video_info
: 
 891                         self
._downloader
.trouble(u
'WARNING: unable to extract video thumbnail') 
 893                 else:   # don't panic if we can't find it 
 894                         video_thumbnail 
= urllib
.unquote_plus(video_info
['thumbnail_url'][0]) 
 897                 video_description 
= 'No description available.' 
 898                 if self
._downloader
.params
.get('forcedescription', False): 
 899                         mobj 
= re
.search(r
'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage
) 
 901                                 video_description 
= mobj
.group(1) 
 904                 video_token 
= urllib
.unquote_plus(video_info
['token'][0]) 
 906                 # Decide which formats to download 
 907                 requested_format 
= self
._downloader
.params
.get('format', None) 
 908                 get_video_template 
= 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=&ps=&asv=&fmt=%%s' % (video_id
, video_token
) 
 910                 if 'fmt_url_map' in video_info
: 
 911                         url_map 
= dict(tuple(pair
.split('|')) for pair 
in video_info
['fmt_url_map'][0].split(',')) 
 912                         format_limit 
= self
._downloader
.params
.get('format_limit', None) 
 913                         if format_limit 
is not None and format_limit 
in self
._available
_formats
: 
 914                                 format_list 
= self
._available
_formats
[self
._available
_formats
.index(format_limit
):] 
 916                                 format_list 
= self
._available
_formats
 
 917                         existing_formats 
= [x 
for x 
in format_list 
if x 
in url_map
] 
 918                         if len(existing_formats
) == 0: 
 919                                 self
._downloader
.trouble(u
'ERROR: no known formats available for video') 
 921                         if requested_format 
is None: 
 922                                 video_url_list 
= [(existing_formats
[0], get_video_template 
% existing_formats
[0])] # Best quality 
 923                         elif requested_format 
== '-1': 
 924                                 video_url_list 
= [(f
, get_video_template 
% f
) for f 
in existing_formats
] # All formats 
 926                                 video_url_list 
= [(requested_format
, get_video_template 
% requested_format
)] # Specific format 
 928                 elif 'conn' in video_info 
and video_info
['conn'][0].startswith('rtmp'): 
 929                         self
.report_rtmp_download() 
 930                         video_url_list 
= [(None, video_info
['conn'][0])] 
 933                         self
._downloader
.trouble(u
'ERROR: no fmt_url_map or conn information found in video info') 
 936                 for format_param
, video_real_url 
in video_url_list
: 
 937                         # At this point we have a new video 
 938                         self
._downloader
.increment_downloads() 
 941                         video_extension 
= self
._video
_extensions
.get(format_param
, 'flv') 
 943                         # Find the video URL in fmt_url_map or conn paramters 
 945                                 # Process video information 
 946                                 self
._downloader
.process_info({ 
 947                                         'id':           video_id
.decode('utf-8'), 
 948                                         'url':          video_real_url
.decode('utf-8'), 
 949                                         'uploader':     video_uploader
.decode('utf-8'), 
 950                                         'title':        video_title
, 
 951                                         'stitle':       simple_title
, 
 952                                         'ext':          video_extension
.decode('utf-8'), 
 953                                         'format':       (format_param 
is None and u
'NA' or format_param
.decode('utf-8')), 
 954                                         'thumbnail':    video_thumbnail
.decode('utf-8'), 
 955                                         'description':  video_description
.decode('utf-8'), 
 956                                         'player_url':   player_url
, 
 958                         except UnavailableVideoError
, err
: 
 959                                 self
._downloader
.trouble(u
'ERROR: unable to download video (format may not be available)') 
class MetacafeIE(InfoExtractor):
        """Information Extractor for metacafe.com."""

        # Recognizes metacafe watch URLs; group(1) is the video id and
        # group(2) the title slug used as the simplified title.
        _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
        # Family-filter disclaimer page retrieved during initialization.
        _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
        # Endpoint POSTed to in order to confirm age / disable the family filter.
        _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
 970         def __init__(self
, youtube_ie
, downloader
=None): 
 971                 InfoExtractor
.__init
__(self
, downloader
) 
 972                 self
._youtube
_ie 
= youtube_ie
 
 976                 return (re
.match(MetacafeIE
._VALID
_URL
, url
) is not None) 
 978         def report_disclaimer(self
): 
 979                 """Report disclaimer retrieval.""" 
 980                 self
._downloader
.to_screen(u
'[metacafe] Retrieving disclaimer') 
 982         def report_age_confirmation(self
): 
 983                 """Report attempt to confirm age.""" 
 984                 self
._downloader
.to_screen(u
'[metacafe] Confirming age') 
 986         def report_download_webpage(self
, video_id
): 
 987                 """Report webpage download.""" 
 988                 self
._downloader
.to_screen(u
'[metacafe] %s: Downloading webpage' % video_id
) 
 990         def report_extraction(self
, video_id
): 
 991                 """Report information extraction.""" 
 992                 self
._downloader
.to_screen(u
'[metacafe] %s: Extracting information' % video_id
) 
 994         def _real_initialize(self
): 
 995                 # Retrieve disclaimer 
 996                 request 
= urllib2
.Request(self
._DISCLAIMER
, None, std_headers
) 
 998                         self
.report_disclaimer() 
 999                         disclaimer 
= urllib2
.urlopen(request
).read() 
1000                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
1001                         self
._downloader
.trouble(u
'ERROR: unable to retrieve disclaimer: %s' % str(err
)) 
1007                         'submit': "Continue - I'm over 18", 
1009                 request 
= urllib2
.Request(self
._FILTER
_POST
, urllib
.urlencode(disclaimer_form
), std_headers
) 
1011                         self
.report_age_confirmation() 
1012                         disclaimer 
= urllib2
.urlopen(request
).read() 
1013                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
1014                         self
._downloader
.trouble(u
'ERROR: unable to confirm age: %s' % str(err
)) 
1017         def _real_extract(self
, url
): 
1018                 # Extract id and simplified title from URL 
1019                 mobj 
= re
.match(self
._VALID
_URL
, url
) 
1021                         self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
) 
1024                 video_id 
= mobj
.group(1) 
1026                 # Check if video comes from YouTube 
1027                 mobj2 
= re
.match(r
'^yt-(.*)$', video_id
) 
1028                 if mobj2 
is not None: 
1029                         self
._youtube
_ie
.extract('http://www.youtube.com/watch?v=%s' % mobj2
.group(1)) 
1032                 # At this point we have a new video 
1033                 self
._downloader
.increment_downloads() 
1035                 simple_title 
= mobj
.group(2).decode('utf-8') 
1037                 # Retrieve video webpage to extract further information 
1038                 request 
= urllib2
.Request('http://www.metacafe.com/watch/%s/' % video_id
) 
1040                         self
.report_download_webpage(video_id
) 
1041                         webpage 
= urllib2
.urlopen(request
).read() 
1042                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
1043                         self
._downloader
.trouble(u
'ERROR: unable retrieve video webpage: %s' % str(err
)) 
1046                 # Extract URL, uploader and title from webpage 
1047                 self
.report_extraction(video_id
) 
1048                 mobj 
= re
.search(r
'(?m)&mediaURL=([^&]+)', webpage
) 
1049                 if mobj 
is not None: 
1050                         mediaURL 
= urllib
.unquote(mobj
.group(1)) 
1051                         video_extension 
= mediaURL
[-3:] 
1053                         # Extract gdaKey if available 
1054                         mobj 
= re
.search(r
'(?m)&gdaKey=(.*?)&', webpage
) 
1056                                 video_url 
= mediaURL
 
1058                                 gdaKey 
= mobj
.group(1) 
1059                                 video_url 
= '%s?__gda__=%s' % (mediaURL
, gdaKey
) 
1061                         mobj 
= re
.search(r
' name="flashvars" value="(.*?)"', webpage
) 
1063                                 self
._downloader
.trouble(u
'ERROR: unable to extract media URL') 
1065                         vardict 
= parse_qs(mobj
.group(1)) 
1066                         if 'mediaData' not in vardict
: 
1067                                 self
._downloader
.trouble(u
'ERROR: unable to extract media URL') 
1069                         mobj 
= re
.search(r
'"mediaURL":"(http.*?)","key":"(.*?)"', vardict
['mediaData'][0]) 
1071                                 self
._downloader
.trouble(u
'ERROR: unable to extract media URL') 
1073                         mediaURL 
= mobj
.group(1).replace('\\/', '/') 
1074                         video_extension 
= mediaURL
[-3:] 
1075                         video_url 
= '%s?__gda__=%s' % (mediaURL
, mobj
.group(2)) 
1077                 mobj 
= re
.search(r
'(?im)<title>(.*) - Video</title>', webpage
) 
1079                         self
._downloader
.trouble(u
'ERROR: unable to extract title') 
1081                 video_title 
= mobj
.group(1).decode('utf-8') 
1082                 video_title 
= sanitize_title(video_title
) 
1084                 mobj 
= re
.search(r
'(?ms)By:\s*<a .*?>(.+?)<', webpage
) 
1086                         self
._downloader
.trouble(u
'ERROR: unable to extract uploader nickname') 
1088                 video_uploader 
= mobj
.group(1) 
1091                         # Process video information 
1092                         self
._downloader
.process_info({ 
1093                                 'id':           video_id
.decode('utf-8'), 
1094                                 'url':          video_url
.decode('utf-8'), 
1095                                 'uploader':     video_uploader
.decode('utf-8'), 
1096                                 'title':        video_title
, 
1097                                 'stitle':       simple_title
, 
1098                                 'ext':          video_extension
.decode('utf-8'), 
1102                 except UnavailableVideoError
: 
1103                         self
._downloader
.trouble(u
'ERROR: unable to download video') 
class DailymotionIE(InfoExtractor):
        """Information Extractor for Dailymotion"""

        # Matches dailymotion video URLs on any two- or three-letter national
        # domain; group(1) is the video id, group(2) the title slug.
        _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
        def __init__(self, downloader=None):
                """Delegate initialization to the InfoExtractor base class."""
                InfoExtractor.__init__(self, downloader)
1116                 return (re
.match(DailymotionIE
._VALID
_URL
, url
) is not None) 
1118         def report_download_webpage(self
, video_id
): 
1119                 """Report webpage download.""" 
1120                 self
._downloader
.to_screen(u
'[dailymotion] %s: Downloading webpage' % video_id
) 
1122         def report_extraction(self
, video_id
): 
1123                 """Report information extraction.""" 
1124                 self
._downloader
.to_screen(u
'[dailymotion] %s: Extracting information' % video_id
) 
1126         def _real_initialize(self
): 
1129         def _real_extract(self
, url
): 
1130                 # Extract id and simplified title from URL 
1131                 mobj 
= re
.match(self
._VALID
_URL
, url
) 
1133                         self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
) 
1136                 # At this point we have a new video 
1137                 self
._downloader
.increment_downloads() 
1138                 video_id 
= mobj
.group(1) 
1140                 simple_title 
= mobj
.group(2).decode('utf-8') 
1141                 video_extension 
= 'flv' 
1143                 # Retrieve video webpage to extract further information 
1144                 request 
= urllib2
.Request(url
) 
1146                         self
.report_download_webpage(video_id
) 
1147                         webpage 
= urllib2
.urlopen(request
).read() 
1148                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
1149                         self
._downloader
.trouble(u
'ERROR: unable retrieve video webpage: %s' % str(err
)) 
1152                 # Extract URL, uploader and title from webpage 
1153                 self
.report_extraction(video_id
) 
1154                 mobj 
= re
.search(r
'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage
) 
1156                         self
._downloader
.trouble(u
'ERROR: unable to extract media URL') 
1158                 mediaURL 
= urllib
.unquote(mobj
.group(1)) 
1160                 # if needed add http://www.dailymotion.com/ if relative URL 
1162                 video_url 
= mediaURL
 
1164                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>' 
1165                 mobj 
= re
.search(r
'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage
) 
1167                         self
._downloader
.trouble(u
'ERROR: unable to extract title') 
1169                 video_title 
= mobj
.group(1).decode('utf-8') 
1170                 video_title 
= sanitize_title(video_title
) 
1172                 mobj 
= re
.search(r
'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a>', webpage
) 
1174                         self
._downloader
.trouble(u
'ERROR: unable to extract uploader nickname') 
1176                 video_uploader 
= mobj
.group(1) 
1179                         # Process video information 
1180                         self
._downloader
.process_info({ 
1181                                 'id':           video_id
.decode('utf-8'), 
1182                                 'url':          video_url
.decode('utf-8'), 
1183                                 'uploader':     video_uploader
.decode('utf-8'), 
1184                                 'title':        video_title
, 
1185                                 'stitle':       simple_title
, 
1186                                 'ext':          video_extension
.decode('utf-8'), 
1190                 except UnavailableVideoError
: 
1191                         self
._downloader
.trouble(u
'ERROR: unable to download video') 
class GoogleIE(InfoExtractor):
        """Information extractor for video.google.com."""

        # Matches Google Video play pages on the listed national domains;
        # group(1) is the "docid" video identifier.
        _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
        def __init__(self, downloader=None):
                """Delegate initialization to the InfoExtractor base class."""
                InfoExtractor.__init__(self, downloader)
1203                 return (re
.match(GoogleIE
._VALID
_URL
, url
) is not None) 
1205         def report_download_webpage(self
, video_id
): 
1206                 """Report webpage download.""" 
1207                 self
._downloader
.to_screen(u
'[video.google] %s: Downloading webpage' % video_id
) 
1209         def report_extraction(self
, video_id
): 
1210                 """Report information extraction.""" 
1211                 self
._downloader
.to_screen(u
'[video.google] %s: Extracting information' % video_id
) 
1213         def _real_initialize(self
): 
1216         def _real_extract(self
, url
): 
1217                 # Extract id from URL 
1218                 mobj 
= re
.match(self
._VALID
_URL
, url
) 
1220                         self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
) 
1223                 # At this point we have a new video 
1224                 self
._downloader
.increment_downloads() 
1225                 video_id 
= mobj
.group(1) 
1227                 video_extension 
= 'mp4' 
1229                 # Retrieve video webpage to extract further information 
1230                 request 
= urllib2
.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id
) 
1232                         self
.report_download_webpage(video_id
) 
1233                         webpage 
= urllib2
.urlopen(request
).read() 
1234                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
1235                         self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
)) 
1238                 # Extract URL, uploader, and title from webpage 
1239                 self
.report_extraction(video_id
) 
1240                 mobj 
= re
.search(r
"download_url:'([^']+)'", webpage
) 
1242                         video_extension 
= 'flv' 
1243                         mobj 
= re
.search(r
"(?i)videoUrl\\x3d(.+?)\\x26", webpage
) 
1245                         self
._downloader
.trouble(u
'ERROR: unable to extract media URL') 
1247                 mediaURL 
= urllib
.unquote(mobj
.group(1)) 
1248                 mediaURL 
= mediaURL
.replace('\\x3d', '\x3d') 
1249                 mediaURL 
= mediaURL
.replace('\\x26', '\x26') 
1251                 video_url 
= mediaURL
 
1253                 mobj 
= re
.search(r
'<title>(.*)</title>', webpage
) 
1255                         self
._downloader
.trouble(u
'ERROR: unable to extract title') 
1257                 video_title 
= mobj
.group(1).decode('utf-8') 
1258                 video_title 
= sanitize_title(video_title
) 
1259                 simple_title 
= re
.sub(ur
'(?u)([^%s]+)' % simple_title_chars
, ur
'_', video_title
) 
1261                 # Extract video description 
1262                 mobj 
= re
.search(r
'<span id=short-desc-content>([^<]*)</span>', webpage
) 
1264                         self
._downloader
.trouble(u
'ERROR: unable to extract video description') 
1266                 video_description 
= mobj
.group(1).decode('utf-8') 
1267                 if not video_description
: 
1268                         video_description 
= 'No description available.' 
1270                 # Extract video thumbnail 
1271                 if self
._downloader
.params
.get('forcethumbnail', False): 
1272                         request 
= urllib2
.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id
))) 
1274                                 webpage 
= urllib2
.urlopen(request
).read() 
1275                         except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
1276                                 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
)) 
1278                         mobj 
= re
.search(r
'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage
) 
1280                                 self
._downloader
.trouble(u
'ERROR: unable to extract video thumbnail') 
1282                         video_thumbnail 
= mobj
.group(1) 
1283                 else:   # we need something to pass to process_info 
1284                         video_thumbnail 
= '' 
1288                         # Process video information 
1289                         self
._downloader
.process_info({ 
1290                                 'id':           video_id
.decode('utf-8'), 
1291                                 'url':          video_url
.decode('utf-8'), 
1293                                 'title':        video_title
, 
1294                                 'stitle':       simple_title
, 
1295                                 'ext':          video_extension
.decode('utf-8'), 
1299                 except UnavailableVideoError
: 
1300                         self
._downloader
.trouble(u
'ERROR: unable to download video') 
class PhotobucketIE(InfoExtractor):
        """Information extractor for photobucket.com."""

        # Matches photobucket URLs whose "current" query parameter names a
        # .flv file; group(1) is that file name, used as the video id.
        _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
        def __init__(self, downloader=None):
                """Delegate initialization to the InfoExtractor base class."""
                InfoExtractor.__init__(self, downloader)
1313                 return (re
.match(PhotobucketIE
._VALID
_URL
, url
) is not None) 
1315         def report_download_webpage(self
, video_id
): 
1316                 """Report webpage download.""" 
1317                 self
._downloader
.to_screen(u
'[photobucket] %s: Downloading webpage' % video_id
) 
1319         def report_extraction(self
, video_id
): 
1320                 """Report information extraction.""" 
1321                 self
._downloader
.to_screen(u
'[photobucket] %s: Extracting information' % video_id
) 
1323         def _real_initialize(self
): 
1326         def _real_extract(self
, url
): 
1327                 # Extract id from URL 
1328                 mobj 
= re
.match(self
._VALID
_URL
, url
) 
1330                         self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
) 
1333                 # At this point we have a new video 
1334                 self
._downloader
.increment_downloads() 
1335                 video_id 
= mobj
.group(1) 
1337                 video_extension 
= 'flv' 
1339                 # Retrieve video webpage to extract further information 
1340                 request 
= urllib2
.Request(url
) 
1342                         self
.report_download_webpage(video_id
) 
1343                         webpage 
= urllib2
.urlopen(request
).read() 
1344                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
1345                         self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
)) 
1348                 # Extract URL, uploader, and title from webpage 
1349                 self
.report_extraction(video_id
) 
1350                 mobj 
= re
.search(r
'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage
) 
1352                         self
._downloader
.trouble(u
'ERROR: unable to extract media URL') 
1354                 mediaURL 
= urllib
.unquote(mobj
.group(1)) 
1356                 video_url 
= mediaURL
 
1358                 mobj 
= re
.search(r
'<title>(.*) video by (.*) - Photobucket</title>', webpage
) 
1360                         self
._downloader
.trouble(u
'ERROR: unable to extract title') 
1362                 video_title 
= mobj
.group(1).decode('utf-8') 
1363                 video_title 
= sanitize_title(video_title
) 
1364                 simple_title 
= re
.sub(ur
'(?u)([^%s]+)' % simple_title_chars
, ur
'_', video_title
) 
1366                 video_uploader 
= mobj
.group(2).decode('utf-8') 
1369                         # Process video information 
1370                         self
._downloader
.process_info({ 
1371                                 'id':           video_id
.decode('utf-8'), 
1372                                 'url':          video_url
.decode('utf-8'), 
1373                                 'uploader':     video_uploader
, 
1374                                 'title':        video_title
, 
1375                                 'stitle':       simple_title
, 
1376                                 'ext':          video_extension
.decode('utf-8'), 
1380                 except UnavailableVideoError
: 
1381                         self
._downloader
.trouble(u
'ERROR: unable to download video') 
class YahooIE(InfoExtractor):
        """Information extractor for video.yahoo.com."""

        # _VALID_URL matches all Yahoo! Video URLs; non-'/watch/' URLs are
        # rewritten to the extractable form during extraction.
        # _VPAGE_URL matches only the extractable '/watch/' URLs.
        _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
        _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
        def __init__(self, downloader=None):
                """Delegate initialization to the InfoExtractor base class."""
                InfoExtractor.__init__(self, downloader)
1397                 return (re
.match(YahooIE
._VALID
_URL
, url
) is not None) 
1399         def report_download_webpage(self
, video_id
): 
1400                 """Report webpage download.""" 
1401                 self
._downloader
.to_screen(u
'[video.yahoo] %s: Downloading webpage' % video_id
) 
1403         def report_extraction(self
, video_id
): 
1404                 """Report information extraction.""" 
1405                 self
._downloader
.to_screen(u
'[video.yahoo] %s: Extracting information' % video_id
) 
1407         def _real_initialize(self
): 
1410         def _real_extract(self
, url
, new_video
=True): 
1411                 # Extract ID from URL 
1412                 mobj 
= re
.match(self
._VALID
_URL
, url
) 
1414                         self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
) 
1417                 # At this point we have a new video 
1418                 self
._downloader
.increment_downloads() 
1419                 video_id 
= mobj
.group(2) 
1420                 video_extension 
= 'flv' 
1422                 # Rewrite valid but non-extractable URLs as 
1423                 # extractable English language /watch/ URLs 
1424                 if re
.match(self
._VPAGE
_URL
, url
) is None: 
1425                         request 
= urllib2
.Request(url
) 
1427                                 webpage 
= urllib2
.urlopen(request
).read() 
1428                         except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
1429                                 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
)) 
1432                         mobj 
= re
.search(r
'\("id", "([0-9]+)"\);', webpage
) 
1434                                 self
._downloader
.trouble(u
'ERROR: Unable to extract id field') 
1436                         yahoo_id 
= mobj
.group(1) 
1438                         mobj 
= re
.search(r
'\("vid", "([0-9]+)"\);', webpage
) 
1440                                 self
._downloader
.trouble(u
'ERROR: Unable to extract vid field') 
1442                         yahoo_vid 
= mobj
.group(1) 
1444                         url 
= 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid
, yahoo_id
) 
1445                         return self
._real
_extract
(url
, new_video
=False) 
1447                 # Retrieve video webpage to extract further information 
1448                 request 
= urllib2
.Request(url
) 
1450                         self
.report_download_webpage(video_id
) 
1451                         webpage 
= urllib2
.urlopen(request
).read() 
1452                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
1453                         self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
)) 
1456                 # Extract uploader and title from webpage 
1457                 self
.report_extraction(video_id
) 
1458                 mobj 
= re
.search(r
'<meta name="title" content="(.*)" />', webpage
) 
1460                         self
._downloader
.trouble(u
'ERROR: unable to extract video title') 
1462                 video_title 
= mobj
.group(1).decode('utf-8') 
1463                 simple_title 
= re
.sub(ur
'(?u)([^%s]+)' % simple_title_chars
, ur
'_', video_title
) 
1465                 mobj 
= re
.search(r
'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage
) 
1467                         self
._downloader
.trouble(u
'ERROR: unable to extract video uploader') 
1469                 video_uploader 
= mobj
.group(1).decode('utf-8') 
1471                 # Extract video thumbnail 
1472                 mobj 
= re
.search(r
'<link rel="image_src" href="(.*)" />', webpage
) 
1474                         self
._downloader
.trouble(u
'ERROR: unable to extract video thumbnail') 
1476                 video_thumbnail 
= mobj
.group(1).decode('utf-8') 
1478                 # Extract video description 
1479                 mobj 
= re
.search(r
'<meta name="description" content="(.*)" />', webpage
) 
1481                         self
._downloader
.trouble(u
'ERROR: unable to extract video description') 
1483                 video_description 
= mobj
.group(1).decode('utf-8') 
1484                 if not video_description
: video_description 
= 'No description available.' 
1486                 # Extract video height and width 
1487                 mobj 
= re
.search(r
'<meta name="video_height" content="([0-9]+)" />', webpage
) 
1489                         self
._downloader
.trouble(u
'ERROR: unable to extract video height') 
1491                 yv_video_height 
= mobj
.group(1) 
1493                 mobj 
= re
.search(r
'<meta name="video_width" content="([0-9]+)" />', webpage
) 
1495                         self
._downloader
.trouble(u
'ERROR: unable to extract video width') 
1497                 yv_video_width 
= mobj
.group(1) 
1499                 # Retrieve video playlist to extract media URL 
1500                 # I'm not completely sure what all these options are, but we 
1501                 # seem to need most of them, otherwise the server sends a 401. 
1502                 yv_lg 
= 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents 
1503                 yv_bitrate 
= '700'  # according to Wikipedia this is hard-coded 
1504                 request 
= urllib2
.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id 
+ 
1505                                           '&tech=flash&mode=playlist&lg=' + yv_lg 
+ '&bitrate=' + yv_bitrate 
+ '&vidH=' + yv_video_height 
+ 
1506                                           '&vidW=' + yv_video_width 
+ '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797') 
1508                         self
.report_download_webpage(video_id
) 
1509                         webpage 
= urllib2
.urlopen(request
).read() 
1510                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
1511                         self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
)) 
1514                 # Extract media URL from playlist XML 
1515                 mobj 
= re
.search(r
'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage
) 
1517                         self
._downloader
.trouble(u
'ERROR: Unable to extract media URL') 
1519                 video_url 
= urllib
.unquote(mobj
.group(1) + mobj
.group(2)).decode('utf-8') 
1520                 video_url 
= re
.sub(r
'(?u)&(.+?);', htmlentity_transform
, video_url
) 
1523                         # Process video information 
1524                         self
._downloader
.process_info({ 
1525                                 'id':           video_id
.decode('utf-8'), 
1527                                 'uploader':     video_uploader
, 
1528                                 'title':        video_title
, 
1529                                 'stitle':       simple_title
, 
1530                                 'ext':          video_extension
.decode('utf-8'), 
1531                                 'thumbnail':    video_thumbnail
.decode('utf-8'), 
1532                                 'description':  video_description
, 
1533                                 'thumbnail':    video_thumbnail
, 
1534                                 'description':  video_description
, 
1537                 except UnavailableVideoError
: 
1538                         self
._downloader
.trouble(u
'ERROR: unable to download video') 
1541 class GenericIE(InfoExtractor
): 
1542         """Generic last-resort information extractor.""" 
1544         def __init__(self
, downloader
=None): 
1545                 InfoExtractor
.__init
__(self
, downloader
) 
1551         def report_download_webpage(self
, video_id
): 
1552                 """Report webpage download.""" 
1553                 self
._downloader
.to_screen(u
'WARNING: Falling back on generic information extractor.') 
1554                 self
._downloader
.to_screen(u
'[generic] %s: Downloading webpage' % video_id
) 
1556         def report_extraction(self
, video_id
): 
1557                 """Report information extraction.""" 
1558                 self
._downloader
.to_screen(u
'[generic] %s: Extracting information' % video_id
) 
1560         def _real_initialize(self
): 
1563         def _real_extract(self
, url
): 
1564                 # At this point we have a new video 
1565                 self
._downloader
.increment_downloads() 
1567                 video_id 
= url
.split('/')[-1] 
1568                 request 
= urllib2
.Request(url
) 
1570                         self
.report_download_webpage(video_id
) 
1571                         webpage 
= urllib2
.urlopen(request
).read() 
1572                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
1573                         self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
)) 
1575                 except ValueError, err
: 
1576                         # since this is the last-resort InfoExtractor, if 
1577                         # this error is thrown, it'll be thrown here 
1578                         self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
) 
1581                 # Start with something easy: JW Player in SWFObject 
1582                 mobj 
= re
.search(r
'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) 
1584                         # Broaden the search a little bit 
1585                         mobj = re.search(r'[^A
-Za
-z0
-9]?
(?
:file|source
)=(http
[^
\'"&]*)', webpage) 
1587                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) 
1590                 # It's possible that one of the regexes 
1591                 # matched, but returned an empty group: 
1592                 if mobj.group(1) is None: 
1593                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) 
1596                 video_url = urllib.unquote(mobj.group(1)) 
1597                 video_id  = os.path.basename(video_url) 
1599                 # here's a fun little line of code for you: 
1600                 video_extension = os.path.splitext(video_id)[1][1:] 
1601                 video_id        = os.path.splitext(video_id)[0] 
1603                 # it's tempting to parse this further, but you would 
1604                 # have to take into account all the variations like 
1605                 #   Video Title - Site Name 
1606                 #   Site Name | Video Title 
1607                 #   Video Title - Tagline | Site Name 
1608                 # and so on and so forth; it's just not practical 
1609                 mobj = re.search(r'<title>(.*)</title>', webpage) 
1611                         self._downloader.trouble(u'ERROR: unable to extract title') 
1613                 video_title = mobj.group(1).decode('utf-8') 
1614                 video_title = sanitize_title(video_title) 
1615                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title) 
1617                 # video uploader is domain name 
1618                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url) 
1620                         self._downloader.trouble(u'ERROR: unable to extract title') 
1622                 video_uploader = mobj.group(1).decode('utf-8') 
1625                         # Process video information 
1626                         self._downloader.process_info({ 
1627                                 'id':           video_id.decode('utf-8'), 
1628                                 'url':          video_url.decode('utf-8'), 
1629                                 'uploader':     video_uploader, 
1630                                 'title':        video_title, 
1631                                 'stitle':       simple_title, 
1632                                 'ext':          video_extension.decode('utf-8'), 
1636                 except UnavailableVideoError, err: 
1637                         self._downloader.trouble(u'ERROR: unable to download video') 
1640 class YoutubeSearchIE(InfoExtractor): 
1641         """Information Extractor for YouTube search queries.""" 
1642         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+' 
1643         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en' 
1644         _VIDEO_INDICATOR = r'href="/watch
\?v
=.+?
"' 
1645         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>' 
1647         _max_youtube_results = 1000 
1649         def __init__(self, youtube_ie, downloader=None): 
1650                 InfoExtractor.__init__(self, downloader) 
1651                 self._youtube_ie = youtube_ie 
1655                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None) 
1657         def report_download_page(self, query, pagenum): 
1658                 """Report attempt to download playlist page with given number.""" 
1659                 query = query.decode(preferredencoding()) 
1660                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum)) 
1662         def _real_initialize(self): 
1663                 self._youtube_ie.initialize() 
1665         def _real_extract(self, query): 
1666                 mobj = re.match(self._VALID_QUERY, query) 
1668                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query) 
1671                 prefix, query = query.split(':') 
1673                 query  = query.encode('utf-8') 
1675                         self._download_n_results(query, 1) 
1677                 elif prefix == 'all': 
1678                         self._download_n_results(query, self._max_youtube_results) 
1684                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query)) 
1686                                 elif n > self._max_youtube_results: 
1687                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n)) 
1688                                         n = self._max_youtube_results 
1689                                 self._download_n_results(query, n) 
1691                         except ValueError: # parsing prefix as integer fails 
1692                                 self._download_n_results(query, 1) 
1695         def _download_n_results(self, query, n): 
1696                 """Downloads a specified number of results for a query""" 
1699                 already_seen = set() 
1703                         self.report_download_page(query, pagenum) 
1704                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum) 
1705                         request = urllib2.Request(result_url, None, std_headers) 
1707                                 page = urllib2.urlopen(request).read() 
1708                         except (urllib2.URLError, httplib.HTTPException, socket.error), err: 
1709                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err)) 
1712                         # Extract video identifiers 
1713                         for mobj in re.finditer(self._VIDEO_INDICATOR, page): 
1714                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1] 
1715                                 if video_id not in already_seen: 
1716                                         video_ids.append(video_id) 
1717                                         already_seen.add(video_id) 
1718                                         if len(video_ids) == n: 
1719                                                 # Specified n videos reached 
1720                                                 for id in video_ids: 
1721                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id) 
1724                         if re.search(self._MORE_PAGES_INDICATOR, page) is None: 
1725                                 for id in video_ids: 
1726                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id) 
1729                         pagenum = pagenum + 1 
1731 class GoogleSearchIE(InfoExtractor): 
1732         """Information Extractor for Google Video search queries.""" 
1733         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+' 
1734         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en' 
1735         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&' 
1736         _MORE_PAGES_INDICATOR = r'<span>Next</span>' 
1738         _max_google_results = 1000 
1740         def __init__(self, google_ie, downloader=None): 
1741                 InfoExtractor.__init__(self, downloader) 
1742                 self._google_ie = google_ie 
1746                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None) 
1748         def report_download_page(self, query, pagenum): 
1749                 """Report attempt to download playlist page with given number.""" 
1750                 query = query.decode(preferredencoding()) 
1751                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum)) 
1753         def _real_initialize(self): 
1754                 self._google_ie.initialize() 
1756         def _real_extract(self, query): 
1757                 mobj = re.match(self._VALID_QUERY, query) 
1759                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query) 
1762                 prefix, query = query.split(':') 
1764                 query  = query.encode('utf-8') 
1766                         self._download_n_results(query, 1) 
1768                 elif prefix == 'all': 
1769                         self._download_n_results(query, self._max_google_results) 
1775                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query)) 
1777                                 elif n > self._max_google_results: 
1778                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n)) 
1779                                         n = self._max_google_results 
1780                                 self._download_n_results(query, n) 
1782                         except ValueError: # parsing prefix as integer fails 
1783                                 self._download_n_results(query, 1) 
1786         def _download_n_results(self, query, n): 
1787                 """Downloads a specified number of results for a query""" 
1790                 already_seen = set() 
1794                         self.report_download_page(query, pagenum) 
1795                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum) 
1796                         request = urllib2.Request(result_url, None, std_headers) 
1798                                 page = urllib2.urlopen(request).read() 
1799                         except (urllib2.URLError, httplib.HTTPException, socket.error), err: 
1800                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err)) 
1803                         # Extract video identifiers 
1804                         for mobj in re.finditer(self._VIDEO_INDICATOR, page): 
1805                                 video_id = mobj.group(1) 
1806                                 if video_id not in already_seen: 
1807                                         video_ids.append(video_id) 
1808                                         already_seen.add(video_id) 
1809                                         if len(video_ids) == n: 
1810                                                 # Specified n videos reached 
1811                                                 for id in video_ids: 
1812                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id) 
1815                         if re.search(self._MORE_PAGES_INDICATOR, page) is None: 
1816                                 for id in video_ids: 
1817                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id) 
1820                         pagenum = pagenum + 1 
1822 class YahooSearchIE(InfoExtractor): 
1823         """Information Extractor for Yahoo! Video search queries.""" 
1824         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+' 
1825         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s' 
1826         _VIDEO_INDICATOR = r'href="http
://video\
.yahoo\
.com
/watch
/([0-9]+/[0-9]+)"' 
1827         _MORE_PAGES_INDICATOR = r'\s*Next' 
1829         _max_yahoo_results = 1000 
1831         def __init__(self, yahoo_ie, downloader=None): 
1832                 InfoExtractor.__init__(self, downloader) 
1833                 self._yahoo_ie = yahoo_ie 
1837                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None) 
1839         def report_download_page(self, query, pagenum): 
1840                 """Report attempt to download playlist page with given number.""" 
1841                 query = query.decode(preferredencoding()) 
1842                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum)) 
1844         def _real_initialize(self): 
1845                 self._yahoo_ie.initialize() 
1847         def _real_extract(self, query): 
1848                 mobj = re.match(self._VALID_QUERY, query) 
1850                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query) 
1853                 prefix, query = query.split(':') 
1855                 query  = query.encode('utf-8') 
1857                         self._download_n_results(query, 1) 
1859                 elif prefix == 'all': 
1860                         self._download_n_results(query, self._max_yahoo_results) 
1866                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query)) 
1868                                 elif n > self._max_yahoo_results: 
1869                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n)) 
1870                                         n = self._max_yahoo_results 
1871                                 self._download_n_results(query, n) 
1873                         except ValueError: # parsing prefix as integer fails 
1874                                 self._download_n_results(query, 1) 
1877         def _download_n_results(self, query, n): 
1878                 """Downloads a specified number of results for a query""" 
1881                 already_seen = set() 
1885                         self.report_download_page(query, pagenum) 
1886                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum) 
1887                         request = urllib2.Request(result_url, None, std_headers) 
1889                                 page = urllib2.urlopen(request).read() 
1890                         except (urllib2.URLError, httplib.HTTPException, socket.error), err: 
1891                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err)) 
1894                         # Extract video identifiers 
1895                         for mobj in re.finditer(self._VIDEO_INDICATOR, page): 
1896                                 video_id = mobj.group(1) 
1897                                 if video_id not in already_seen: 
1898                                         video_ids.append(video_id) 
1899                                         already_seen.add(video_id) 
1900                                         if len(video_ids) == n: 
1901                                                 # Specified n videos reached 
1902                                                 for id in video_ids: 
1903                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id) 
1906                         if re.search(self._MORE_PAGES_INDICATOR, page) is None: 
1907                                 for id in video_ids: 
1908                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id) 
1911                         pagenum = pagenum + 1 
1913 class YoutubePlaylistIE(InfoExtractor): 
1914         """Information Extractor for YouTube playlists.""" 
1916         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*' 
1917         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en' 
1918         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&' 
1919         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>' 
1922         def __init__(self, youtube_ie, downloader=None): 
1923                 InfoExtractor.__init__(self, downloader) 
1924                 self._youtube_ie = youtube_ie 
1928                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None) 
1930         def report_download_page(self, playlist_id, pagenum): 
1931                 """Report attempt to download playlist page with given number.""" 
1932                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum)) 
1934         def _real_initialize(self): 
1935                 self._youtube_ie.initialize() 
1937         def _real_extract(self, url): 
1938                 # Extract playlist id 
1939                 mobj = re.match(self._VALID_URL, url) 
1941                         self._downloader.trouble(u'ERROR: invalid url: %s' % url) 
1944                 # Download playlist pages 
1945                 playlist_id = mobj.group(1) 
1950                         self.report_download_page(playlist_id, pagenum) 
1951                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers) 
1953                                 page = urllib2.urlopen(request).read() 
1954                         except (urllib2.URLError, httplib.HTTPException, socket.error), err: 
1955                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err)) 
1958                         # Extract video identifiers 
1960                         for mobj in re.finditer(self._VIDEO_INDICATOR, page): 
1961                                 if mobj.group(1) not in ids_in_page: 
1962                                         ids_in_page.append(mobj.group(1)) 
1963                         video_ids.extend(ids_in_page) 
1965                         if re.search(self._MORE_PAGES_INDICATOR, page) is None: 
1967                         pagenum = pagenum + 1 
1969                 playliststart = self._downloader.params.get('playliststart', 1) 
1970                 playliststart -= 1 #our arrays are zero-based but the playlist is 1-based 
1971                 if playliststart > 0: 
1972                         video_ids = video_ids[playliststart:] 
1974                 for id in video_ids: 
1975                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id) 
1978 class YoutubeUserIE(InfoExtractor): 
1979         """Information Extractor for YouTube users.""" 
1981         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)' 
1982         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s' 
1983         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this. 
1986         def __init__(self, youtube_ie, downloader=None): 
1987                 InfoExtractor.__init__(self, downloader) 
1988                 self._youtube_ie = youtube_ie 
1992                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None) 
1994         def report_download_page(self, username): 
1995                 """Report attempt to download user page.""" 
1996                 self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username)) 
1998         def _real_initialize(self): 
1999                 self._youtube_ie.initialize() 
2001         def _real_extract(self, url): 
2003                 mobj = re.match(self._VALID_URL, url) 
2005                         self._downloader.trouble(u'ERROR: invalid url: %s' % url) 
2008                 # Download user page 
2009                 username = mobj.group(1) 
2013                 self.report_download_page(username) 
2014                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers) 
2016                         page = urllib2.urlopen(request).read() 
2017                 except (urllib2.URLError, httplib.HTTPException, socket.error), err: 
2018                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err)) 
2021                 # Extract video identifiers 
2024                 for mobj in re.finditer(self._VIDEO_INDICATOR, page): 
2025                         if mobj.group(1) not in ids_in_page: 
2026                                 ids_in_page.append(mobj.group(1)) 
2027                 video_ids.extend(ids_in_page) 
2029                 playliststart = self._downloader.params.get('playliststart', 1) 
2030                 playliststart = playliststart-1 #our arrays are zero-based but the playlist is 1-based 
2031                 if playliststart > 0: 
2032                         video_ids = video_ids[playliststart:]    
2034                 for id in video_ids: 
2035                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id) 
class PostProcessor(object):
	"""Post Processor class.

	PostProcessor objects can be added to downloaders with their
	add_post_processor() method. When the downloader has finished a
	successful download, it will take its internal chain of PostProcessors
	and start calling the run() method on each one of them, first with
	an initial argument and then with the returned value of the previous
	PostProcessor in the chain.

	The chain will be stopped if one of them ever returns None or the end
	of the chain is reached.

	PostProcessor objects follow a "mutual registration" process similar
	to InfoExtractor objects.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		# Reuse the setter so construction and re-binding share one path.
		self.set_downloader(downloader)

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		The "information" argument is a dictionary like the ones
		composed by InfoExtractors. The only difference is that this
		one has an extra field called "filepath" that points to the
		downloaded file.

		When this method returns None, the postprocessing chain is
		stopped. However, this method may return an information
		dictionary that will be passed to the next postprocessing
		object in the chain. It can be the one it received after
		changing some fields.

		In addition, this method may raise a PostProcessingError
		exception that will be taken into account by the downloader.
		"""
		return information # by default, do nothing
2084 ### MAIN PROGRAM ### 
2085 if __name__ == '__main__': 
2087                 # Modules needed only when running the main program 
2091                 # Function to update the program file with the latest version from bitbucket.org 
2092                 def update_self(downloader, filename): 
2093                         # Note: downloader only used for options 
2094                         if not os.access (filename, os.W_OK): 
2095                                 sys.exit('ERROR: no write permissions on %s' % filename) 
2097                         downloader.to_screen('Updating to latest stable version...') 
2098                         latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION' 
2099                         latest_version = urllib.urlopen(latest_url).read().strip() 
2100                         prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version 
2101                         newcontent = urllib.urlopen(prog_url).read() 
2102                         stream = open(filename, 'w') 
2103                         stream.write(newcontent) 
2105                         downloader.to_screen('Updated to version %s' % latest_version) 
2107                 # Parse command line 
2108                 parser = optparse.OptionParser( 
2109                         usage='Usage: %prog [options] url...', 
2110                         version='2010.10.24', 
2111                         conflict_handler='resolve', 
2114                 parser.add_option('-h', '--help', 
2115                                 action='help', help='print this help text and exit') 
2116                 parser.add_option('-v', '--version', 
2117                                 action='version', help='print program version and exit') 
2118                 parser.add_option('-U', '--update', 
2119                                 action='store_true', dest='update_self', help='update this program to latest stable version') 
2120                 parser.add_option('-i', '--ignore-errors', 
2121                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False) 
2122                 parser.add_option('-r', '--rate-limit', 
2123                                 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)') 
2124                 parser.add_option('-R', '--retries', 
2125                                 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10) 
2126                 parser.add_option('--playlist-start', 
2127                                 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1) 
2129                 authentication = optparse.OptionGroup(parser, 'Authentication Options') 
2130                 authentication.add_option('-u', '--username', 
2131                                 dest='username', metavar='USERNAME', help='account username') 
2132                 authentication.add_option('-p', '--password', 
2133                                 dest='password', metavar='PASSWORD', help='account password') 
2134                 authentication.add_option('-n', '--netrc', 
2135                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False) 
2136                 parser.add_option_group(authentication) 
2138                 video_format = optparse.OptionGroup(parser, 'Video Format Options') 
2139                 video_format.add_option('-f', '--format', 
2140                                 action='store', dest='format', metavar='FORMAT', help='video format code') 
2141                 video_format.add_option('-m', '--mobile-version', 
2142                                 action='store_const', dest='format', help='alias for -f 17', const='17') 
2143                 video_format.add_option('--all-formats', 
2144                                 action='store_const', dest='format', help='download all available video formats', const='-1') 
2145                 video_format.add_option('--max-quality', 
2146                                 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download') 
2147                 video_format.add_option('-b', '--best-quality', 
2148                                 action='store_true', dest='bestquality', help='download the best video quality (DEPRECATED)') 
2149                 parser.add_option_group(video_format) 
2151                 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options') 
2152                 verbosity.add_option('-q', '--quiet', 
2153                                 action='store_true', dest='quiet', help='activates quiet mode', default=False) 
2154                 verbosity.add_option('-s', '--simulate', 
2155                                 action='store_true', dest='simulate', help='do not download video', default=False) 
2156                 verbosity.add_option('-g', '--get-url', 
2157                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False) 
2158                 verbosity.add_option('-e', '--get-title', 
2159                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False) 
2160                 verbosity.add_option('--get-thumbnail', 
2161                                 action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False) 
2162                 verbosity.add_option('--get-description', 
2163                                 action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False) 
2164                 verbosity.add_option('--no-progress', 
2165                                 action='store_true', dest='noprogress', help='do not print progress bar', default=False) 
2166                 parser.add_option_group(verbosity) 
2168                 filesystem = optparse.OptionGroup(parser, 'Filesystem Options') 
2169                 filesystem.add_option('-t', '--title', 
2170                                 action='store_true', dest='usetitle', help='use title in file name', default=False) 
2171                 filesystem.add_option('-l', '--literal', 
2172                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False) 
2173                 filesystem.add_option('-o', '--output', 
2174                                 dest='outtmpl', metavar='TEMPLATE', help='output filename template') 
2175                 filesystem.add_option('-a', '--batch-file', 
2176                                 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)') 
2177                 filesystem.add_option('-w', '--no-overwrites', 
2178                                 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False) 
2179                 filesystem.add_option('-c', '--continue', 
2180                                 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False) 
2181                 filesystem.add_option('--cookies', 
2182                                 dest='cookiefile', metavar='FILE', help='file to dump cookie jar to') 
2183                 parser.add_option_group(filesystem) 
2185                 (opts, args) = parser.parse_args() 
2187                 # Open appropriate CookieJar 
2188                 if opts.cookiefile is None: 
2189                         jar = cookielib.CookieJar() 
2192                                 jar = cookielib.MozillaCookieJar(opts.cookiefile) 
2193                                 if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK): 
2195                         except (IOError, OSError), err: 
2196                                 sys.exit(u'ERROR: unable to open cookie file') 
2198                 # General configuration 
2199                 cookie_processor = urllib2.HTTPCookieProcessor(jar) 
2200                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler())) 
2201                 urllib2.install_opener(urllib2.build_opener(cookie_processor)) 
2202                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words) 
2204                 # Batch file verification 
2206                 if opts.batchfile is not None: 
2208                                 if opts.batchfile == '-': 
2211                                         batchfd = open(opts.batchfile, 'r') 
2212                                 batchurls = batchfd.readlines() 
2213                                 batchurls = [x.strip() for x in batchurls] 
2214                                 batchurls = [x for x in batchurls if len(x) > 0] 
2216                                 sys.exit(u'ERROR: batch file could not be read') 
2217                 all_urls = batchurls + args 
2219                 # Conflicting, missing and erroneous options 
2220                 if opts.bestquality: 
2221                         print >>sys.stderr, u'\nWARNING: -b/--best-quality IS DEPRECATED AS IT IS THE DEFAULT BEHAVIOR NOW\n' 
2222                 if opts.usenetrc and (opts.username is not None or opts.password is not None): 
2223                         parser.error(u'using .netrc conflicts with giving username/password') 
2224                 if opts.password is not None and opts.username is None: 
2225                         parser.error(u'account username missing') 
2226                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle): 
2227                         parser.error(u'using output template conflicts with using title or literal title') 
2228                 if opts.usetitle and opts.useliteral: 
2229                         parser.error(u'using title conflicts with using literal title') 
2230                 if opts.username is not None and opts.password is None: 
2231                         opts.password = getpass.getpass(u'Type account password and press return:') 
2232                 if opts.ratelimit is not None: 
2233                         numeric_limit = FileDownloader.parse_bytes(opts.ratelimit) 
2234                         if numeric_limit is None: 
2235                                 parser.error(u'invalid rate limit specified') 
2236                         opts.ratelimit = numeric_limit 
2237                 if opts.retries is not None: 
2239                                 opts.retries = long(opts.retries) 
2240                         except (TypeError, ValueError), err: 
2241                                 parser.error(u'invalid retry count specified') 
2242                 if opts.playliststart is not None: 
2244                                 opts.playliststart = long(opts.playliststart) 
2245                         except (TypeError, ValueError), err: 
2246                                 parser.error(u'invalid playlist page specified') 
2248                 # Information extractors 
2249                 youtube_ie = YoutubeIE() 
2250                 metacafe_ie = MetacafeIE(youtube_ie) 
2251                 dailymotion_ie = DailymotionIE() 
2252                 youtube_pl_ie = YoutubePlaylistIE(youtube_ie) 
2253                 youtube_user_ie = YoutubeUserIE(youtube_ie) 
2254                 youtube_search_ie = YoutubeSearchIE(youtube_ie) 
2255                 google_ie = GoogleIE() 
2256                 google_search_ie = GoogleSearchIE(google_ie) 
2257                 photobucket_ie = PhotobucketIE() 
2258                 yahoo_ie = YahooIE() 
2259                 yahoo_search_ie = YahooSearchIE(yahoo_ie) 
2260                 generic_ie = GenericIE() 
2263                 fd = FileDownloader({ 
2264                         'usenetrc': opts.usenetrc, 
2265                         'username': opts.username, 
2266                         'password': opts.password, 
2267                         'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription), 
2268                         'forceurl': opts.geturl, 
2269                         'forcetitle': opts.gettitle, 
2270                         'forcethumbnail': opts.getthumbnail, 
2271                         'forcedescription': opts.getdescription, 
2272                         'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription), 
2273                         'format': opts.format, 
2274                         'format_limit': opts.format_limit, 
2275                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding())) 
2276                                 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s') 
2277                                 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s') 
2278                                 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s') 
2279                                 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s') 
2280                                 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s') 
2281                                 or u'%(id)s.%(ext)s'), 
2282                         'ignoreerrors': opts.ignoreerrors, 
2283                         'ratelimit': opts.ratelimit, 
2284                         'nooverwrites': opts.nooverwrites, 
2285                         'retries': opts.retries, 
2286                         'continuedl': opts.continue_dl, 
2287                         'noprogress': opts.noprogress, 
2288                         'playliststart': opts.playliststart, 
2289                         'logtostderr': opts.outtmpl == '-', 
2291                 fd.add_info_extractor(youtube_search_ie) 
2292                 fd.add_info_extractor(youtube_pl_ie) 
2293                 fd.add_info_extractor(youtube_user_ie) 
2294                 fd.add_info_extractor(metacafe_ie) 
2295                 fd.add_info_extractor(dailymotion_ie) 
2296                 fd.add_info_extractor(youtube_ie) 
2297                 fd.add_info_extractor(google_ie) 
2298                 fd.add_info_extractor(google_search_ie) 
2299                 fd.add_info_extractor(photobucket_ie) 
2300                 fd.add_info_extractor(yahoo_ie) 
2301                 fd.add_info_extractor(yahoo_search_ie) 
2303                 # This must come last since it's the 
2304                 # fallback if none of the others work 
2305                 fd.add_info_extractor(generic_ie) 
2308                 if opts.update_self: 
2309                         update_self(fd, sys.argv[0]) 
2312                 if len(all_urls) < 1: 
2313                         if not opts.update_self: 
2314                                 parser.error(u'you must provide at least one URL') 
2317                 retcode = fd.download(all_urls) 
2319                 # Dump cookie jar if requested 
2320                 if opts.cookiefile is not None: 
2323                         except (IOError, OSError), err: 
2324                                 sys.exit(u'ERROR: unable to save cookie jar') 
2328         except DownloadError: 
2330         except SameFileError: 
2331                 sys.exit(u'ERROR: fixed output name but more than one file to download') 
2332         except KeyboardInterrupt: 
2333                 sys.exit(u'\nERROR: Interrupted by user')