]>
Raphaël G. Git Repositories - youtubedl/blob - youtube-dl
   2 # -*- coding: utf-8 -*- 
   3 # Author: Ricardo Garcia Gonzalez 
   4 # Author: Danny Colligan 
   5 # Author: Benjamin Johnson 
   6 # License: Public domain code 
  23 # parse_qs was moved from the cgi module to the urlparse module recently. 
  25         from urlparse 
import parse_qs
 
  27         from cgi 
import parse_qs
 
  30         'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.8) Gecko/20100723 Firefox/3.6.8', 
  31         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 
  32         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 
  33         'Accept-Language': 'en-us,en;q=0.5', 
  36 simple_title_chars 
= string
.ascii_letters
.decode('ascii') + string
.digits
.decode('ascii') 
  38 def preferredencoding(): 
  39         """Get preferred encoding. 
  41         Returns the best encoding scheme for the system, based on 
  42         locale.getpreferredencoding() and some further tweaks. 
  44         def yield_preferredencoding(): 
  46                         pref 
= locale
.getpreferredencoding() 
  52         return yield_preferredencoding().next() 
  54 def htmlentity_transform(matchobj
): 
  55         """Transforms an HTML entity to a Unicode character. 
  57         This function receives a match object and is intended to be used with 
  58         the re.sub() function. 
  60         entity 
= matchobj
.group(1) 
  62         # Known non-numeric HTML entity 
  63         if entity 
in htmlentitydefs
.name2codepoint
: 
  64                 return unichr(htmlentitydefs
.name2codepoint
[entity
]) 
  67         mobj 
= re
.match(ur
'(?u)#(x?\d+)', entity
) 
  69                 numstr 
= mobj
.group(1) 
  70                 if numstr
.startswith(u
'x'): 
  72                         numstr 
= u
'0%s' % numstr
 
  75                 return unichr(long(numstr
, base
)) 
  77         # Unknown entity in name, return its literal representation 
  78         return (u
'&%s;' % entity
) 
  80 def sanitize_title(utitle
): 
  81         """Sanitizes a video title so it could be used as part of a filename.""" 
  82         utitle 
= re
.sub(ur
'(?u)&(.+?);', htmlentity_transform
, utitle
) 
  83         return utitle
.replace(unicode(os
.sep
), u
'%') 
  85 def sanitize_open(filename
, open_mode
): 
  86         """Try to open the given filename, and slightly tweak it if this fails. 
  88         Attempts to open the given filename. If this fails, it tries to change 
  89         the filename slightly, step by step, until it's either able to open it 
  90         or it fails and raises a final exception, like the standard open() 
  93         It returns the tuple (stream, definitive_file_name). 
  97                         return (sys
.stdout
, filename
) 
  98                 stream 
= open(filename
, open_mode
) 
  99                 return (stream
, filename
) 
 100         except (IOError, OSError), err
: 
 101                 # In case of error, try to remove win32 forbidden chars 
 102                 filename 
= re
.sub(ur
'[/<>:"\|\?\*]', u
'#', filename
) 
 104                 # An exception here should be caught in the caller 
 105                 stream 
= open(filename
, open_mode
) 
 106                 return (stream
, filename
) 
 109 class DownloadError(Exception): 
 110         """Download Error exception. 
 112         This exception may be thrown by FileDownloader objects if they are not 
 113         configured to continue on errors. They will contain the appropriate 
 118 class SameFileError(Exception): 
 119         """Same File exception. 
 121         This exception will be thrown by FileDownloader objects if they detect 
 122         multiple files would have to be downloaded to the same file on disk. 
 126 class PostProcessingError(Exception): 
 127         """Post Processing exception. 
 129         This exception may be raised by PostProcessor's .run() method to 
 130         indicate an error in the postprocessing task. 
 134 class UnavailableVideoError(Exception): 
 135         """Unavailable Format exception. 
 137         This exception will be thrown when a video is requested 
 138         in a format that is not available for that video. 
 142 class ContentTooShortError(Exception): 
 143         """Content Too Short exception. 
 145         This exception may be raised by FileDownloader objects when a file they 
 146         download is too small for what the server announced first, indicating 
 147         the connection was probably interrupted. 
 153         def __init__(self
, downloaded
, expected
): 
 154                 self
.downloaded 
= downloaded
 
 155                 self
.expected 
= expected
 
 157 class FileDownloader(object): 
 158         """File Downloader class. 
 160         File downloader objects are the ones responsible of downloading the 
 161         actual video file and writing it to disk if the user has requested 
 162         it, among some other tasks. In most cases there should be one per 
 163         program. As, given a video URL, the downloader doesn't know how to 
 164         extract all the needed information, task that InfoExtractors do, it 
 165         has to pass the URL to one of them. 
 167         For this, file downloader objects have a method that allows 
 168         InfoExtractors to be registered in a given order. When it is passed 
 169         a URL, the file downloader handles it to the first InfoExtractor it 
 170         finds that reports being able to handle it. The InfoExtractor extracts 
 171         all the information about the video or videos the URL refers to, and 
 172         asks the FileDownloader to process the video information, possibly 
 173         downloading the video. 
 175         File downloaders accept a lot of parameters. In order not to saturate 
 176         the object constructor with arguments, it receives a dictionary of 
 177         options instead. These options are available through the params 
 178         attribute for the InfoExtractors to use. The FileDownloader also 
 179         registers itself as the downloader in charge for the InfoExtractors 
 180         that are added to it, so this is a "mutual registration". 
 184         username:       Username for authentication purposes. 
 185         password:       Password for authentication purposes. 
 186         usenetrc:       Use netrc for authentication instead. 
 187         quiet:          Do not print messages to stdout. 
 188         forceurl:       Force printing final URL. 
 189         forcetitle:     Force printing title. 
 190         simulate:       Do not download the video files. 
 191         format:         Video format code. 
 192         format_limit:   Highest quality format to try. 
 193         outtmpl:        Template for output names. 
 194         ignoreerrors:   Do not stop on download errors. 
 195         ratelimit:      Download speed limit, in bytes/sec. 
 196         nooverwrites:   Prevent overwriting files. 
 197         retries:        Number of times to retry for HTTP error 503 
 198         continuedl:     Try to continue downloads if possible. 
 199         noprogress:     Do not print the progress bar. 
 205         _download_retcode 
= None 
 206         _num_downloads 
= None 
 208         def __init__(self
, params
): 
 209                 """Create a FileDownloader object with the given options.""" 
 212                 self
._download
_retcode 
= 0 
 213                 self
._num
_downloads 
= 0 
 217         def pmkdir(filename
): 
 218                 """Create directory components in filename. Similar to Unix "mkdir -p".""" 
 219                 components 
= filename
.split(os
.sep
) 
 220                 aggregate 
= [os
.sep
.join(components
[0:x
]) for x 
in xrange(1, len(components
))] 
 221                 aggregate 
= ['%s%s' % (x
, os
.sep
) for x 
in aggregate
] # Finish names with separator 
 222                 for dir in aggregate
: 
 223                         if not os
.path
.exists(dir): 
 227         def format_bytes(bytes): 
 230                 if type(bytes) is str: 
 235                         exponent 
= long(math
.log(bytes, 1024.0)) 
 236                 suffix 
= 'bkMGTPEZY'[exponent
] 
 237                 converted 
= float(bytes) / float(1024**exponent
) 
 238                 return '%.2f%s' % (converted
, suffix
) 
 241         def calc_percent(byte_counter
, data_len
): 
 244                 return '%6s' % ('%3.1f%%' % (float(byte_counter
) / float(data_len
) * 100.0)) 
 247         def calc_eta(start
, now
, total
, current
): 
 251                 if current 
== 0 or dif 
< 0.001: # One millisecond 
 253                 rate 
= float(current
) / dif
 
 254                 eta 
= long((float(total
) - float(current
)) / rate
) 
 255                 (eta_mins
, eta_secs
) = divmod(eta
, 60) 
 258                 return '%02d:%02d' % (eta_mins
, eta_secs
) 
 261         def calc_speed(start
, now
, bytes): 
 263                 if bytes == 0 or dif 
< 0.001: # One millisecond 
 264                         return '%10s' % '---b/s' 
 265                 return '%10s' % ('%s/s' % FileDownloader
.format_bytes(float(bytes) / dif
)) 
 268         def best_block_size(elapsed_time
, bytes): 
 269                 new_min 
= max(bytes / 2.0, 1.0) 
 270                 new_max 
= min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB 
 271                 if elapsed_time 
< 0.001: 
 273                 rate 
= bytes / elapsed_time
 
 281         def parse_bytes(bytestr
): 
 282                 """Parse a string indicating a byte quantity into a long integer.""" 
 283                 matchobj 
= re
.match(r
'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr
) 
 286                 number 
= float(matchobj
.group(1)) 
 287                 multiplier 
= 1024.0 ** 'bkmgtpezy'.index(matchobj
.group(2).lower()) 
 288                 return long(round(number 
* multiplier
)) 
 290         def add_info_extractor(self
, ie
): 
 291                 """Add an InfoExtractor object to the end of the list.""" 
 293                 ie
.set_downloader(self
) 
 295         def add_post_processor(self
, pp
): 
 296                 """Add a PostProcessor object to the end of the chain.""" 
 298                 pp
.set_downloader(self
) 
 300         def to_stdout(self
, message
, skip_eol
=False, ignore_encoding_errors
=False): 
 301                 """Print message to stdout if not in quiet mode.""" 
 303                         if not self
.params
.get('quiet', False): 
 304                                 print (u
'%s%s' % (message
, [u
'\n', u
''][skip_eol
])).encode(preferredencoding()), 
 306                 except (UnicodeEncodeError), err
: 
 307                         if not ignore_encoding_errors
: 
 310         def to_stderr(self
, message
): 
 311                 """Print message to stderr.""" 
 312                 print >>sys
.stderr
, message
.encode(preferredencoding()) 
 314         def fixed_template(self
): 
 315                 """Checks if the output template is fixed.""" 
 316                 return (re
.search(ur
'(?u)%\(.+?\)s', self
.params
['outtmpl']) is None) 
 318         def trouble(self
, message
=None): 
 319                 """Determine action to take when a download problem appears. 
 321                 Depending on if the downloader has been configured to ignore 
 322                 download errors or not, this method may throw an exception or 
 323                 not when errors are found, after printing the message. 
 325                 if message 
is not None: 
 326                         self
.to_stderr(message
) 
 327                 if not self
.params
.get('ignoreerrors', False): 
 328                         raise DownloadError(message
) 
 329                 self
._download
_retcode 
= 1 
 331         def slow_down(self
, start_time
, byte_counter
): 
 332                 """Sleep if the download speed is over the rate limit.""" 
 333                 rate_limit 
= self
.params
.get('ratelimit', None) 
 334                 if rate_limit 
is None or byte_counter 
== 0: 
 337                 elapsed 
= now 
- start_time
 
 340                 speed 
= float(byte_counter
) / elapsed
 
 341                 if speed 
> rate_limit
: 
 342                         time
.sleep((byte_counter 
- rate_limit 
* (now 
- start_time
)) / rate_limit
) 
 344         def report_destination(self
, filename
): 
 345                 """Report destination filename.""" 
 346                 self
.to_stdout(u
'[download] Destination: %s' % filename
, ignore_encoding_errors
=True) 
 348         def report_progress(self
, percent_str
, data_len_str
, speed_str
, eta_str
): 
 349                 """Report download progress.""" 
 350                 if self
.params
.get('noprogress', False): 
 352                 self
.to_stdout(u
'\r[download] %s of %s at %s ETA %s' % 
 353                                 (percent_str
, data_len_str
, speed_str
, eta_str
), skip_eol
=True) 
 355         def report_resuming_byte(self
, resume_len
): 
 356                 """Report attemtp to resume at given byte.""" 
 357                 self
.to_stdout(u
'[download] Resuming download at byte %s' % resume_len
) 
 359         def report_retry(self
, count
, retries
): 
 360                 """Report retry in case of HTTP error 503""" 
 361                 self
.to_stdout(u
'[download] Got HTTP error 503. Retrying (attempt %d of %d)...' % (count
, retries
)) 
 363         def report_file_already_downloaded(self
, file_name
): 
 364                 """Report file has already been fully downloaded.""" 
 366                         self
.to_stdout(u
'[download] %s has already been downloaded' % file_name
) 
 367                 except (UnicodeEncodeError), err
: 
 368                         self
.to_stdout(u
'[download] The file has already been downloaded') 
 370         def report_unable_to_resume(self
): 
 371                 """Report it was impossible to resume download.""" 
 372                 self
.to_stdout(u
'[download] Unable to resume') 
 374         def report_finish(self
): 
 375                 """Report download finished.""" 
 376                 if self
.params
.get('noprogress', False): 
 377                         self
.to_stdout(u
'[download] Download completed') 
 381         def increment_downloads(self
): 
 382                 """Increment the ordinal that assigns a number to each file.""" 
 383                 self
._num
_downloads 
+= 1 
 385         def process_info(self
, info_dict
): 
 386                 """Process a single dictionary returned by an InfoExtractor.""" 
 387                 # Do nothing else if in simulate mode 
 388                 if self
.params
.get('simulate', False): 
 390                         if self
.params
.get('forcetitle', False): 
 391                                 print info_dict
['title'].encode(preferredencoding(), 'xmlcharrefreplace') 
 392                         if self
.params
.get('forceurl', False): 
 393                                 print info_dict
['url'].encode(preferredencoding(), 'xmlcharrefreplace') 
 394                         if self
.params
.get('forcethumbnail', False) and 'thumbnail' in info_dict
: 
 395                                 print info_dict
['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace') 
 396                         if self
.params
.get('forcedescription', False) and 'description' in info_dict
: 
 397                                 print info_dict
['description'].encode(preferredencoding(), 'xmlcharrefreplace') 
 402                         template_dict 
= dict(info_dict
) 
 403                         template_dict
['epoch'] = unicode(long(time
.time())) 
 404                         template_dict
['ord'] = unicode('%05d' % self
._num
_downloads
) 
 405                         filename 
= self
.params
['outtmpl'] % template_dict
 
 406                 except (ValueError, KeyError), err
: 
 407                         self
.trouble('ERROR: invalid output template or system charset: %s' % str(err
)) 
 408                 if self
.params
.get('nooverwrites', False) and os
.path
.exists(filename
): 
 409                         self
.to_stderr(u
'WARNING: file exists: %s; skipping' % filename
) 
 413                         self
.pmkdir(filename
) 
 414                 except (OSError, IOError), err
: 
 415                         self
.trouble('ERROR: unable to create directories: %s' % str(err
)) 
 419                         success 
= self
._do
_download
(filename
, info_dict
['url'].encode('utf-8'), info_dict
.get('player_url', None)) 
 420                 except (OSError, IOError), err
: 
 421                         raise UnavailableVideoError
 
 422                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
 423                         self
.trouble('ERROR: unable to download video data: %s' % str(err
)) 
 425                 except (ContentTooShortError
, ), err
: 
 426                         self
.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err
.expected
, err
.downloaded
)) 
 431                                 self
.post_process(filename
, info_dict
) 
 432                         except (PostProcessingError
), err
: 
 433                                 self
.trouble('ERROR: postprocessing: %s' % str(err
)) 
 436         def download(self
, url_list
): 
 437                 """Download a given list of URLs.""" 
 438                 if len(url_list
) > 1 and self
.fixed_template(): 
 439                         raise SameFileError(self
.params
['outtmpl']) 
 442                         suitable_found 
= False 
 444                                 # Go to next InfoExtractor if not suitable 
 445                                 if not ie
.suitable(url
): 
 448                                 # Suitable InfoExtractor found 
 449                                 suitable_found 
= True 
 451                                 # Extract information from URL and process it 
 454                                 # Suitable InfoExtractor had been found; go to next URL 
 457                         if not suitable_found
: 
 458                                 self
.trouble('ERROR: no suitable InfoExtractor: %s' % url
) 
 460                 return self
._download
_retcode
 
 462         def post_process(self
, filename
, ie_info
): 
 463                 """Run the postprocessing chain on the given file.""" 
 465                 info
['filepath'] = filename
 
 471         def _download_with_rtmpdump(self
, filename
, url
, player_url
): 
 472                 self
.report_destination(filename
) 
 474                 # Check for rtmpdump first 
 476                         subprocess
.call(['rtmpdump', '-h'], stdout
=(file(os
.path
.devnull
, 'w')), stderr
=subprocess
.STDOUT
) 
 477                 except (OSError, IOError): 
 478                         self
.trouble(u
'ERROR: RTMP download detected but "rtmpdump" could not be run') 
 481                 # Download using rtmpdump. rtmpdump returns exit code 2 when 
 482                 # the connection was interrumpted and resuming appears to be 
 483                 # possible. This is part of rtmpdump's normal usage, AFAIK. 
 484                 basic_args 
= ['rtmpdump', '-q'] + [[], ['-W', player_url
]][player_url 
is not None] + ['-r', url
, '-o', filename
] 
 485                 retval 
= subprocess
.call(basic_args 
+ [[], ['-e', '-k', '1']][self
.params
.get('continuedl', False)]) 
 486                 while retval 
== 2 or retval 
== 1: 
 487                         prevsize 
= os
.path
.getsize(filename
) 
 488                         self
.to_stdout(u
'\r[rtmpdump] %s bytes' % prevsize
, skip_eol
=True) 
 489                         time
.sleep(5.0) # This seems to be needed 
 490                         retval 
= subprocess
.call(basic_args 
+ ['-e'] + [[], ['-k', '1']][retval 
== 1]) 
 491                         cursize 
= os
.path
.getsize(filename
) 
 492                         if prevsize 
== cursize 
and retval 
== 1: 
 495                         self
.to_stdout(u
'\r[rtmpdump] %s bytes' % os
.path
.getsize(filename
)) 
 498                         self
.trouble('\nERROR: rtmpdump exited with code %d' % retval
) 
 501         def _do_download(self
, filename
, url
, player_url
): 
 502                 # Attempt to download using rtmpdump 
 503                 if url
.startswith('rtmp'): 
 504                         return self
._download
_with
_rtmpdump
(filename
, url
, player_url
) 
 508                 basic_request 
= urllib2
.Request(url
, None, std_headers
) 
 509                 request 
= urllib2
.Request(url
, None, std_headers
) 
 511                 # Establish possible resume length 
 512                 if os
.path
.isfile(filename
): 
 513                         resume_len 
= os
.path
.getsize(filename
) 
 517                 # Request parameters in case of being able to resume 
 518                 if self
.params
.get('continuedl', False) and resume_len 
!= 0: 
 519                         self
.report_resuming_byte(resume_len
) 
 520                         request
.add_header('Range','bytes=%d-' % resume_len
) 
 524                 retries 
= self
.params
.get('retries', 0) 
 525                 while count 
<= retries
: 
 526                         # Establish connection 
 528                                 data 
= urllib2
.urlopen(request
) 
 530                         except (urllib2
.HTTPError
, ), err
: 
 531                                 if err
.code 
!= 503 and err
.code 
!= 416: 
 532                                         # Unexpected HTTP error 
 534                                 elif err
.code 
== 416: 
 535                                         # Unable to resume (requested range not satisfiable) 
 537                                                 # Open the connection again without the range header 
 538                                                 data 
= urllib2
.urlopen(basic_request
) 
 539                                                 content_length 
= data
.info()['Content-Length'] 
 540                                         except (urllib2
.HTTPError
, ), err
: 
 544                                                 # Examine the reported length 
 545                                                 if (content_length 
is not None and 
 546                                                     (resume_len 
- 100 < long(content_length
) < resume_len 
+ 100)): 
 547                                                         # The file had already been fully downloaded. 
 548                                                         # Explanation to the above condition: in issue #175 it was revealed that 
 549                                                         # YouTube sometimes adds or removes a few bytes from the end of the file, 
 550                                                         # changing the file size slightly and causing problems for some users. So 
 551                                                         # I decided to implement a suggested change and consider the file 
 552                                                         # completely downloaded if the file size differs less than 100 bytes from 
 553                                                         # the one in the hard drive. 
 554                                                         self
.report_file_already_downloaded(filename
) 
 557                                                         # The length does not match, we start the download over 
 558                                                         self
.report_unable_to_resume() 
 564                                 self
.report_retry(count
, retries
) 
 567                         self
.trouble(u
'ERROR: giving up after %s retries' % retries
) 
 570                 data_len 
= data
.info().get('Content-length', None) 
 571                 data_len_str 
= self
.format_bytes(data_len
) 
 578                         data_block 
= data
.read(block_size
) 
 580                         data_block_len 
= len(data_block
) 
 581                         if data_block_len 
== 0: 
 583                         byte_counter 
+= data_block_len
 
 585                         # Open file just in time 
 588                                         (stream
, filename
) = sanitize_open(filename
, open_mode
) 
 589                                         self
.report_destination(filename
) 
 590                                 except (OSError, IOError), err
: 
 591                                         self
.trouble('ERROR: unable to open for writing: %s' % str(err
)) 
 594                                 stream
.write(data_block
) 
 595                         except (IOError, OSError), err
: 
 596                                 self
.trouble('\nERROR: unable to write data: %s' % str(err
)) 
 597                         block_size 
= self
.best_block_size(after 
- before
, data_block_len
) 
 600                         percent_str 
= self
.calc_percent(byte_counter
, data_len
) 
 601                         eta_str 
= self
.calc_eta(start
, time
.time(), data_len
, byte_counter
) 
 602                         speed_str 
= self
.calc_speed(start
, time
.time(), byte_counter
) 
 603                         self
.report_progress(percent_str
, data_len_str
, speed_str
, eta_str
) 
 606                         self
.slow_down(start
, byte_counter
) 
 609                 if data_len 
is not None and str(byte_counter
) != data_len
: 
 610                         raise ContentTooShortError(byte_counter
, long(data_len
)) 
 613 class InfoExtractor(object): 
 614         """Information Extractor class. 
 616         Information extractors are the classes that, given a URL, extract 
 617         information from the video (or videos) the URL refers to. This 
 618         information includes the real video URL, the video title and simplified 
 619         title, author and others. The information is stored in a dictionary 
 620         which is then passed to the FileDownloader. The FileDownloader 
 621         processes this information possibly downloading the video to the file 
 622         system, among other possible outcomes. The dictionaries must include 
 623         the following fields: 
 625         id:             Video identifier. 
 626         url:            Final video URL. 
 627         uploader:       Nickname of the video uploader. 
 628         title:          Literal title. 
 629         stitle:         Simplified title. 
 630         ext:            Video filename extension. 
 631         format:         Video format. 
 632         player_url:     SWF Player URL (may be None). 
 634         The following fields are optional. Their primary purpose is to allow 
 635         youtube-dl to serve as the backend for a video search function, such 
 636         as the one in youtube2mp3.  They are only used when their respective 
 637         forced printing functions are called: 
 639         thumbnail:      Full URL to a video thumbnail image. 
 640         description:    One-line video description. 
 642         Subclasses of this one should re-define the _real_initialize() and 
 643         _real_extract() methods, as well as the suitable() static method. 
 644         Probably, they should also be instantiated and added to the main 
 651         def __init__(self
, downloader
=None): 
 652                 """Constructor. Receives an optional downloader.""" 
 654                 self
.set_downloader(downloader
) 
 658                 """Receives a URL and returns True if suitable for this IE.""" 
 661         def initialize(self
): 
 662                 """Initializes an instance (authentication, etc).""" 
 664                         self
._real
_initialize
() 
 667         def extract(self
, url
): 
 668                 """Extracts URL information and returns it in list of dicts.""" 
 670                 return self
._real
_extract
(url
) 
 672         def set_downloader(self
, downloader
): 
 673                 """Sets the downloader for this IE.""" 
 674                 self
._downloader 
= downloader
 
 676         def _real_initialize(self
): 
 677                 """Real initialization process. Redefine in subclasses.""" 
 680         def _real_extract(self
, url
): 
 681                 """Real extraction process. Redefine in subclasses.""" 
 684 class YoutubeIE(InfoExtractor
): 
 685         """Information extractor for youtube.com.""" 
 687         _VALID_URL 
= r
'^((?:http://)?(?:youtu\.be/|(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?[\?#](?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$' 
 688         _LANG_URL 
= r
'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' 
 689         _LOGIN_URL 
= 'http://www.youtube.com/signup?next=/&gl=US&hl=en' 
 690         _AGE_URL 
= 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' 
 691         _NETRC_MACHINE 
= 'youtube' 
 692         # Listed in order of quality 
 693         _available_formats 
= ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13'] 
 694         _video_extensions 
= { 
 700                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever 
 707                 return (re
.match(YoutubeIE
._VALID
_URL
, url
) is not None) 
 709         def report_lang(self
): 
 710                 """Report attempt to set language.""" 
 711                 self
._downloader
.to_stdout(u
'[youtube] Setting language') 
 713         def report_login(self
): 
 714                 """Report attempt to log in.""" 
 715                 self
._downloader
.to_stdout(u
'[youtube] Logging in') 
 717         def report_age_confirmation(self
): 
 718                 """Report attempt to confirm age.""" 
 719                 self
._downloader
.to_stdout(u
'[youtube] Confirming age') 
 721         def report_video_webpage_download(self
, video_id
): 
 722                 """Report attempt to download video webpage.""" 
 723                 self
._downloader
.to_stdout(u
'[youtube] %s: Downloading video webpage' % video_id
) 
 725         def report_video_info_webpage_download(self
, video_id
): 
 726                 """Report attempt to download video info webpage.""" 
 727                 self
._downloader
.to_stdout(u
'[youtube] %s: Downloading video info webpage' % video_id
) 
 729         def report_information_extraction(self
, video_id
): 
 730                 """Report attempt to extract video information.""" 
 731                 self
._downloader
.to_stdout(u
'[youtube] %s: Extracting video information' % video_id
) 
 733         def report_unavailable_format(self
, video_id
, format
): 
 734                 """Report extracted video URL.""" 
 735                 self
._downloader
.to_stdout(u
'[youtube] %s: Format %s not available' % (video_id
, format
)) 
 737         def report_rtmp_download(self
): 
 738                 """Indicate the download will use the RTMP protocol.""" 
 739                 self
._downloader
.to_stdout(u
'[youtube] RTMP download detected') 
 741         def _real_initialize(self
): 
 742                 if self
._downloader 
is None: 
 747                 downloader_params 
= self
._downloader
.params
 
 749                 # Attempt to use provided username and password or .netrc data 
 750                 if downloader_params
.get('username', None) is not None: 
 751                         username 
= downloader_params
['username'] 
 752                         password 
= downloader_params
['password'] 
 753                 elif downloader_params
.get('usenetrc', False): 
 755                                 info 
= netrc
.netrc().authenticators(self
._NETRC
_MACHINE
) 
 760                                         raise netrc
.NetrcParseError('No authenticators for %s' % self
._NETRC
_MACHINE
) 
 761                         except (IOError, netrc
.NetrcParseError
), err
: 
 762                                 self
._downloader
.to_stderr(u
'WARNING: parsing .netrc: %s' % str(err
)) 
 766                 request 
= urllib2
.Request(self
._LANG
_URL
, None, std_headers
) 
 769                         urllib2
.urlopen(request
).read() 
 770                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
 771                         self
._downloader
.to_stderr(u
'WARNING: unable to set language: %s' % str(err
)) 
 774                 # No authentication to be performed 
 780                                 'current_form': 'loginForm', 
 782                                 'action_login': 'Log In', 
 783                                 'username':     username
, 
 784                                 'password':     password
, 
 786                 request 
= urllib2
.Request(self
._LOGIN
_URL
, urllib
.urlencode(login_form
), std_headers
) 
 789                         login_results 
= urllib2
.urlopen(request
).read() 
 790                         if re
.search(r
'(?i)<form[^>]* name="loginForm"', login_results
) is not None: 
 791                                 self
._downloader
.to_stderr(u
'WARNING: unable to log in: bad username or password') 
 793                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
 794                         self
._downloader
.to_stderr(u
'WARNING: unable to log in: %s' % str(err
)) 
 800                                 'action_confirm':       'Confirm', 
 802                 request 
= urllib2
.Request(self
._AGE
_URL
, urllib
.urlencode(age_form
), std_headers
) 
 804                         self
.report_age_confirmation() 
 805                         age_results 
= urllib2
.urlopen(request
).read() 
 806                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
 807                         self
._downloader
.trouble(u
'ERROR: unable to confirm age: %s' % str(err
)) 
 810         def _real_extract(self
, url
): 
 811                 # Extract video id from URL 
 812                 mobj 
= re
.match(self
._VALID
_URL
, url
) 
 814                         self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
) 
 816                 video_id 
= mobj
.group(2) 
 819                 self
.report_video_webpage_download(video_id
) 
 820                 request 
= urllib2
.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
, None, std_headers
) 
 822                         video_webpage 
= urllib2
.urlopen(request
).read() 
 823                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
 824                         self
._downloader
.trouble(u
'ERROR: unable to download video webpage: %s' % str(err
)) 
 827                 # Attempt to extract SWF player URL 
 828                 mobj 
= re
.search(r
'swfConfig.*"(http://.*?watch.*?-.*?\.swf)"', video_webpage
) 
 830                         player_url 
= mobj
.group(1) 
 835                 self
.report_video_info_webpage_download(video_id
) 
 836                 for el_type 
in ['&el=embedded', '&el=detailpage', '&el=vevo', '']: 
 837                         video_info_url 
= ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' 
 838                                            % (video_id
, el_type
)) 
 839                         request 
= urllib2
.Request(video_info_url
, None, std_headers
) 
 841                                 video_info_webpage 
= urllib2
.urlopen(request
).read() 
 842                                 video_info 
= parse_qs(video_info_webpage
) 
 843                                 if 'token' in video_info
: 
 845                         except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
 846                                 self
._downloader
.trouble(u
'ERROR: unable to download video info webpage: %s' % str(err
)) 
 848                 if 'token' not in video_info
: 
 849                         if 'reason' in video_info
: 
 850                                 self
._downloader
.trouble(u
'ERROR: YouTube said: %s' % video_info
['reason'][0]) 
 852                                 self
._downloader
.trouble(u
'ERROR: "token" parameter not in video info for unknown reason') 
 855                 # Start extracting information 
 856                 self
.report_information_extraction(video_id
) 
 859                 if 'author' not in video_info
: 
 860                         self
._downloader
.trouble(u
'ERROR: unable to extract uploader nickname') 
 862                 video_uploader 
= urllib
.unquote_plus(video_info
['author'][0]) 
 865                 if 'title' not in video_info
: 
 866                         self
._downloader
.trouble(u
'ERROR: unable to extract video title') 
 868                 video_title 
= urllib
.unquote_plus(video_info
['title'][0]) 
 869                 video_title 
= video_title
.decode('utf-8') 
 870                 video_title 
= sanitize_title(video_title
) 
 873                 simple_title 
= re
.sub(ur
'(?u)([^%s]+)' % simple_title_chars
, ur
'_', video_title
) 
 874                 simple_title 
= simple_title
.strip(ur
'_') 
 877                 if 'thumbnail_url' not in video_info
: 
 878                         self
._downloader
.trouble(u
'WARNING: unable to extract video thumbnail') 
 880                 else:   # don't panic if we can't find it 
 881                         video_thumbnail 
= urllib
.unquote_plus(video_info
['thumbnail_url'][0]) 
 884                 video_description 
= 'No description available.' 
 885                 if self
._downloader
.params
.get('forcedescription', False): 
 886                         mobj 
= re
.search(r
'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage
) 
 888                                 video_description 
= mobj
.group(1) 
 891                 video_token 
= urllib
.unquote_plus(video_info
['token'][0]) 
 893                 # Decide which formats to download 
 894                 requested_format 
= self
._downloader
.params
.get('format', None) 
 895                 get_video_template 
= 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=&ps=&asv=&fmt=%%s' % (video_id
, video_token
) 
 897                 if 'fmt_url_map' in video_info
: 
 898                         url_map 
= dict(tuple(pair
.split('|')) for pair 
in video_info
['fmt_url_map'][0].split(',')) 
 899                         format_limit 
= self
._downloader
.params
.get('format_limit', None) 
 900                         if format_limit 
is not None and format_limit 
in self
._available
_formats
: 
 901                                 format_list 
= self
._available
_formats
[self
._available
_formats
.index(format_limit
):] 
 903                                 format_list 
= self
._available
_formats
 
 904                         existing_formats 
= [x 
for x 
in format_list 
if x 
in url_map
] 
 905                         if len(existing_formats
) == 0: 
 906                                 self
._downloader
.trouble(u
'ERROR: no known formats available for video') 
 908                         if requested_format 
is None: 
 909                                 video_url_list 
= [(existing_formats
[0], get_video_template 
% existing_formats
[0])] # Best quality 
 910                         elif requested_format 
== '-1': 
 911                                 video_url_list 
= [(f
, get_video_template 
% f
) for f 
in existing_formats
] # All formats 
 913                                 video_url_list 
= [(requested_format
, get_video_template 
% requested_format
)] # Specific format 
 915                 elif 'conn' in video_info 
and video_info
['conn'][0].startswith('rtmp'): 
 916                         self
.report_rtmp_download() 
 917                         video_url_list 
= [(None, video_info
['conn'][0])] 
 920                         self
._downloader
.trouble(u
'ERROR: no fmt_url_map or conn information found in video info') 
 923                 for format_param
, video_real_url 
in video_url_list
: 
 924                         # At this point we have a new video 
 925                         self
._downloader
.increment_downloads() 
 928                         video_extension 
= self
._video
_extensions
.get(format_param
, 'flv') 
 930                         # Find the video URL in fmt_url_map or conn paramters 
 932                                 # Process video information 
 933                                 self
._downloader
.process_info({ 
 934                                         'id':           video_id
.decode('utf-8'), 
 935                                         'url':          video_real_url
.decode('utf-8'), 
 936                                         'uploader':     video_uploader
.decode('utf-8'), 
 937                                         'title':        video_title
, 
 938                                         'stitle':       simple_title
, 
 939                                         'ext':          video_extension
.decode('utf-8'), 
 940                                         'format':       (format_param 
is None and u
'NA' or format_param
.decode('utf-8')), 
 941                                         'thumbnail':    video_thumbnail
.decode('utf-8'), 
 942                                         'description':  video_description
.decode('utf-8'), 
 943                                         'player_url':   player_url
, 
 945                         except UnavailableVideoError
, err
: 
 946                                 self
._downloader
.trouble(u
'ERROR: unable to download video (format may not be available)') 
 949 class MetacafeIE(InfoExtractor
): 
 950         """Information Extractor for metacafe.com.""" 
 952         _VALID_URL 
= r
'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*' 
 953         _DISCLAIMER 
= 'http://www.metacafe.com/family_filter/' 
 954         _FILTER_POST 
= 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user' 
 957         def __init__(self
, youtube_ie
, downloader
=None): 
 958                 InfoExtractor
.__init
__(self
, downloader
) 
 959                 self
._youtube
_ie 
= youtube_ie
 
 963                 return (re
.match(MetacafeIE
._VALID
_URL
, url
) is not None) 
 965         def report_disclaimer(self
): 
 966                 """Report disclaimer retrieval.""" 
 967                 self
._downloader
.to_stdout(u
'[metacafe] Retrieving disclaimer') 
 969         def report_age_confirmation(self
): 
 970                 """Report attempt to confirm age.""" 
 971                 self
._downloader
.to_stdout(u
'[metacafe] Confirming age') 
 973         def report_download_webpage(self
, video_id
): 
 974                 """Report webpage download.""" 
 975                 self
._downloader
.to_stdout(u
'[metacafe] %s: Downloading webpage' % video_id
) 
 977         def report_extraction(self
, video_id
): 
 978                 """Report information extraction.""" 
 979                 self
._downloader
.to_stdout(u
'[metacafe] %s: Extracting information' % video_id
) 
 981         def _real_initialize(self
): 
 982                 # Retrieve disclaimer 
 983                 request 
= urllib2
.Request(self
._DISCLAIMER
, None, std_headers
) 
 985                         self
.report_disclaimer() 
 986                         disclaimer 
= urllib2
.urlopen(request
).read() 
 987                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
 988                         self
._downloader
.trouble(u
'ERROR: unable to retrieve disclaimer: %s' % str(err
)) 
 994                         'submit': "Continue - I'm over 18", 
 996                 request 
= urllib2
.Request(self
._FILTER
_POST
, urllib
.urlencode(disclaimer_form
), std_headers
) 
 998                         self
.report_age_confirmation() 
 999                         disclaimer 
= urllib2
.urlopen(request
).read() 
1000                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
1001                         self
._downloader
.trouble(u
'ERROR: unable to confirm age: %s' % str(err
)) 
1004         def _real_extract(self
, url
): 
1005                 # Extract id and simplified title from URL 
1006                 mobj 
= re
.match(self
._VALID
_URL
, url
) 
1008                         self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
) 
1011                 video_id 
= mobj
.group(1) 
1013                 # Check if video comes from YouTube 
1014                 mobj2 
= re
.match(r
'^yt-(.*)$', video_id
) 
1015                 if mobj2 
is not None: 
1016                         self
._youtube
_ie
.extract('http://www.youtube.com/watch?v=%s' % mobj2
.group(1)) 
1019                 # At this point we have a new video 
1020                 self
._downloader
.increment_downloads() 
1022                 simple_title 
= mobj
.group(2).decode('utf-8') 
1023                 video_extension 
= 'flv' 
1025                 # Retrieve video webpage to extract further information 
1026                 request 
= urllib2
.Request('http://www.metacafe.com/watch/%s/' % video_id
) 
1028                         self
.report_download_webpage(video_id
) 
1029                         webpage 
= urllib2
.urlopen(request
).read() 
1030                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
1031                         self
._downloader
.trouble(u
'ERROR: unable retrieve video webpage: %s' % str(err
)) 
1034                 # Extract URL, uploader and title from webpage 
1035                 self
.report_extraction(video_id
) 
1036                 mobj 
= re
.search(r
'(?m)&mediaURL=([^&]+)', webpage
) 
1038                         self
._downloader
.trouble(u
'ERROR: unable to extract media URL') 
1040                 mediaURL 
= urllib
.unquote(mobj
.group(1)) 
1042                 # Extract gdaKey if available 
1043                 mobj 
= re
.search(r
'(?m)&gdaKey=(.*?)&', webpage
) 
1045                         video_url 
= mediaURL
 
1046                         #self._downloader.trouble(u'ERROR: unable to extract gdaKey') 
1049                         gdaKey 
= mobj
.group(1) 
1050                         video_url 
= '%s?__gda__=%s' % (mediaURL
, gdaKey
) 
1052                 mobj 
= re
.search(r
'(?im)<title>(.*) - Video</title>', webpage
) 
1054                         self
._downloader
.trouble(u
'ERROR: unable to extract title') 
1056                 video_title 
= mobj
.group(1).decode('utf-8') 
1057                 video_title 
= sanitize_title(video_title
) 
1059                 mobj 
= re
.search(r
'(?ms)By:\s*<a .*?>(.+?)<', webpage
) 
1061                         self
._downloader
.trouble(u
'ERROR: unable to extract uploader nickname') 
1063                 video_uploader 
= mobj
.group(1) 
1066                         # Process video information 
1067                         self
._downloader
.process_info({ 
1068                                 'id':           video_id
.decode('utf-8'), 
1069                                 'url':          video_url
.decode('utf-8'), 
1070                                 'uploader':     video_uploader
.decode('utf-8'), 
1071                                 'title':        video_title
, 
1072                                 'stitle':       simple_title
, 
1073                                 'ext':          video_extension
.decode('utf-8'), 
1077                 except UnavailableVideoError
: 
1078                         self
._downloader
.trouble(u
'ERROR: unable to download video') 
1081 class DailymotionIE(InfoExtractor
): 
1082         """Information Extractor for Dailymotion""" 
1084         _VALID_URL 
= r
'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)' 
        def __init__(self, downloader=None):
                """Initialize the extractor, optionally attaching *downloader*."""
                InfoExtractor.__init__(self, downloader)
1091                 return (re
.match(DailymotionIE
._VALID
_URL
, url
) is not None) 
1093         def report_download_webpage(self
, video_id
): 
1094                 """Report webpage download.""" 
1095                 self
._downloader
.to_stdout(u
'[dailymotion] %s: Downloading webpage' % video_id
) 
1097         def report_extraction(self
, video_id
): 
1098                 """Report information extraction.""" 
1099                 self
._downloader
.to_stdout(u
'[dailymotion] %s: Extracting information' % video_id
) 
1101         def _real_initialize(self
): 
1104         def _real_extract(self
, url
): 
1105                 # Extract id and simplified title from URL 
1106                 mobj 
= re
.match(self
._VALID
_URL
, url
) 
1108                         self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
) 
1111                 # At this point we have a new video 
1112                 self
._downloader
.increment_downloads() 
1113                 video_id 
= mobj
.group(1) 
1115                 simple_title 
= mobj
.group(2).decode('utf-8') 
1116                 video_extension 
= 'flv' 
1118                 # Retrieve video webpage to extract further information 
1119                 request 
= urllib2
.Request(url
) 
1121                         self
.report_download_webpage(video_id
) 
1122                         webpage 
= urllib2
.urlopen(request
).read() 
1123                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
1124                         self
._downloader
.trouble(u
'ERROR: unable retrieve video webpage: %s' % str(err
)) 
1127                 # Extract URL, uploader and title from webpage 
1128                 self
.report_extraction(video_id
) 
1129                 mobj 
= re
.search(r
'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage
) 
1131                         self
._downloader
.trouble(u
'ERROR: unable to extract media URL') 
1133                 mediaURL 
= urllib
.unquote(mobj
.group(1)) 
1135                 # if needed add http://www.dailymotion.com/ if relative URL 
1137                 video_url 
= mediaURL
 
1139                 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>' 
1140                 mobj 
= re
.search(r
'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage
) 
1142                         self
._downloader
.trouble(u
'ERROR: unable to extract title') 
1144                 video_title 
= mobj
.group(1).decode('utf-8') 
1145                 video_title 
= sanitize_title(video_title
) 
1147                 mobj 
= re
.search(r
'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a></div>', webpage
) 
1149                         self
._downloader
.trouble(u
'ERROR: unable to extract uploader nickname') 
1151                 video_uploader 
= mobj
.group(1) 
1154                         # Process video information 
1155                         self
._downloader
.process_info({ 
1156                                 'id':           video_id
.decode('utf-8'), 
1157                                 'url':          video_url
.decode('utf-8'), 
1158                                 'uploader':     video_uploader
.decode('utf-8'), 
1159                                 'title':        video_title
, 
1160                                 'stitle':       simple_title
, 
1161                                 'ext':          video_extension
.decode('utf-8'), 
1165                 except UnavailableVideoError
: 
1166                         self
._downloader
.trouble(u
'ERROR: unable to download video') 
1168 class GoogleIE(InfoExtractor
): 
1169         """Information extractor for video.google.com.""" 
1171         _VALID_URL 
= r
'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*' 
        def __init__(self, downloader=None):
                """Initialize the extractor, optionally attaching *downloader*."""
                InfoExtractor.__init__(self, downloader)
1178                 return (re
.match(GoogleIE
._VALID
_URL
, url
) is not None) 
1180         def report_download_webpage(self
, video_id
): 
1181                 """Report webpage download.""" 
1182                 self
._downloader
.to_stdout(u
'[video.google] %s: Downloading webpage' % video_id
) 
1184         def report_extraction(self
, video_id
): 
1185                 """Report information extraction.""" 
1186                 self
._downloader
.to_stdout(u
'[video.google] %s: Extracting information' % video_id
) 
1188         def _real_initialize(self
): 
1191         def _real_extract(self
, url
): 
1192                 # Extract id from URL 
1193                 mobj 
= re
.match(self
._VALID
_URL
, url
) 
1195                         self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
) 
1198                 # At this point we have a new video 
1199                 self
._downloader
.increment_downloads() 
1200                 video_id 
= mobj
.group(1) 
1202                 video_extension 
= 'mp4' 
1204                 # Retrieve video webpage to extract further information 
1205                 request 
= urllib2
.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id
) 
1207                         self
.report_download_webpage(video_id
) 
1208                         webpage 
= urllib2
.urlopen(request
).read() 
1209                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
1210                         self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
)) 
1213                 # Extract URL, uploader, and title from webpage 
1214                 self
.report_extraction(video_id
) 
1215                 mobj 
= re
.search(r
"download_url:'([^']+)'", webpage
) 
1217                         video_extension 
= 'flv' 
1218                         mobj 
= re
.search(r
"(?i)videoUrl\\x3d(.+?)\\x26", webpage
) 
1220                         self
._downloader
.trouble(u
'ERROR: unable to extract media URL') 
1222                 mediaURL 
= urllib
.unquote(mobj
.group(1)) 
1223                 mediaURL 
= mediaURL
.replace('\\x3d', '\x3d') 
1224                 mediaURL 
= mediaURL
.replace('\\x26', '\x26') 
1226                 video_url 
= mediaURL
 
1228                 mobj 
= re
.search(r
'<title>(.*)</title>', webpage
) 
1230                         self
._downloader
.trouble(u
'ERROR: unable to extract title') 
1232                 video_title 
= mobj
.group(1).decode('utf-8') 
1233                 video_title 
= sanitize_title(video_title
) 
1234                 simple_title 
= re
.sub(ur
'(?u)([^%s]+)' % simple_title_chars
, ur
'_', video_title
) 
1236                 # Extract video description 
1237                 mobj 
= re
.search(r
'<span id=short-desc-content>([^<]*)</span>', webpage
) 
1239                         self
._downloader
.trouble(u
'ERROR: unable to extract video description') 
1241                 video_description 
= mobj
.group(1).decode('utf-8') 
1242                 if not video_description
: 
1243                         video_description 
= 'No description available.' 
1245                 # Extract video thumbnail 
1246                 if self
._downloader
.params
.get('forcethumbnail', False): 
1247                         request 
= urllib2
.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id
))) 
1249                                 webpage 
= urllib2
.urlopen(request
).read() 
1250                         except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
1251                                 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
)) 
1253                         mobj 
= re
.search(r
'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage
) 
1255                                 self
._downloader
.trouble(u
'ERROR: unable to extract video thumbnail') 
1257                         video_thumbnail 
= mobj
.group(1) 
1258                 else:   # we need something to pass to process_info 
1259                         video_thumbnail 
= '' 
1263                         # Process video information 
1264                         self
._downloader
.process_info({ 
1265                                 'id':           video_id
.decode('utf-8'), 
1266                                 'url':          video_url
.decode('utf-8'), 
1268                                 'title':        video_title
, 
1269                                 'stitle':       simple_title
, 
1270                                 'ext':          video_extension
.decode('utf-8'), 
1274                 except UnavailableVideoError
: 
1275                         self
._downloader
.trouble(u
'ERROR: unable to download video') 
1278 class PhotobucketIE(InfoExtractor
): 
1279         """Information extractor for photobucket.com.""" 
1281         _VALID_URL 
= r
'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)' 
        def __init__(self, downloader=None):
                """Initialize the extractor, optionally attaching *downloader*."""
                InfoExtractor.__init__(self, downloader)
1288                 return (re
.match(PhotobucketIE
._VALID
_URL
, url
) is not None) 
1290         def report_download_webpage(self
, video_id
): 
1291                 """Report webpage download.""" 
1292                 self
._downloader
.to_stdout(u
'[photobucket] %s: Downloading webpage' % video_id
) 
1294         def report_extraction(self
, video_id
): 
1295                 """Report information extraction.""" 
1296                 self
._downloader
.to_stdout(u
'[photobucket] %s: Extracting information' % video_id
) 
1298         def _real_initialize(self
): 
1301         def _real_extract(self
, url
): 
1302                 # Extract id from URL 
1303                 mobj 
= re
.match(self
._VALID
_URL
, url
) 
1305                         self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
) 
1308                 # At this point we have a new video 
1309                 self
._downloader
.increment_downloads() 
1310                 video_id 
= mobj
.group(1) 
1312                 video_extension 
= 'flv' 
1314                 # Retrieve video webpage to extract further information 
1315                 request 
= urllib2
.Request(url
) 
1317                         self
.report_download_webpage(video_id
) 
1318                         webpage 
= urllib2
.urlopen(request
).read() 
1319                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
1320                         self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
)) 
1323                 # Extract URL, uploader, and title from webpage 
1324                 self
.report_extraction(video_id
) 
1325                 mobj 
= re
.search(r
'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage
) 
1327                         self
._downloader
.trouble(u
'ERROR: unable to extract media URL') 
1329                 mediaURL 
= urllib
.unquote(mobj
.group(1)) 
1331                 video_url 
= mediaURL
 
1333                 mobj 
= re
.search(r
'<title>(.*) video by (.*) - Photobucket</title>', webpage
) 
1335                         self
._downloader
.trouble(u
'ERROR: unable to extract title') 
1337                 video_title 
= mobj
.group(1).decode('utf-8') 
1338                 video_title 
= sanitize_title(video_title
) 
1339                 simple_title 
= re
.sub(ur
'(?u)([^%s]+)' % simple_title_chars
, ur
'_', video_title
) 
1341                 video_uploader 
= mobj
.group(2).decode('utf-8') 
1344                         # Process video information 
1345                         self
._downloader
.process_info({ 
1346                                 'id':           video_id
.decode('utf-8'), 
1347                                 'url':          video_url
.decode('utf-8'), 
1348                                 'uploader':     video_uploader
, 
1349                                 'title':        video_title
, 
1350                                 'stitle':       simple_title
, 
1351                                 'ext':          video_extension
.decode('utf-8'), 
1355                 except UnavailableVideoError
: 
1356                         self
._downloader
.trouble(u
'ERROR: unable to download video') 
1359 class YahooIE(InfoExtractor
): 
1360         """Information extractor for video.yahoo.com.""" 
1362         # _VALID_URL matches all Yahoo! Video URLs 
1363         # _VPAGE_URL matches only the extractable '/watch/' URLs 
1364         _VALID_URL 
= r
'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?' 
1365         _VPAGE_URL 
= r
'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?' 
        def __init__(self, downloader=None):
                """Initialize the extractor, optionally attaching *downloader*."""
                InfoExtractor.__init__(self, downloader)
1372                 return (re
.match(YahooIE
._VALID
_URL
, url
) is not None) 
1374         def report_download_webpage(self
, video_id
): 
1375                 """Report webpage download.""" 
1376                 self
._downloader
.to_stdout(u
'[video.yahoo] %s: Downloading webpage' % video_id
) 
1378         def report_extraction(self
, video_id
): 
1379                 """Report information extraction.""" 
1380                 self
._downloader
.to_stdout(u
'[video.yahoo] %s: Extracting information' % video_id
) 
1382         def _real_initialize(self
): 
1385         def _real_extract(self
, url
, new_video
=True): 
1386                 # Extract ID from URL 
1387                 mobj 
= re
.match(self
._VALID
_URL
, url
) 
1389                         self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
) 
1392                 # At this point we have a new video 
1393                 self
._downloader
.increment_downloads() 
1394                 video_id 
= mobj
.group(2) 
1395                 video_extension 
= 'flv' 
1397                 # Rewrite valid but non-extractable URLs as 
1398                 # extractable English language /watch/ URLs 
1399                 if re
.match(self
._VPAGE
_URL
, url
) is None: 
1400                         request 
= urllib2
.Request(url
) 
1402                                 webpage 
= urllib2
.urlopen(request
).read() 
1403                         except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
1404                                 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
)) 
1407                         mobj 
= re
.search(r
'\("id", "([0-9]+)"\);', webpage
) 
1409                                 self
._downloader
.trouble(u
'ERROR: Unable to extract id field') 
1411                         yahoo_id 
= mobj
.group(1) 
1413                         mobj 
= re
.search(r
'\("vid", "([0-9]+)"\);', webpage
) 
1415                                 self
._downloader
.trouble(u
'ERROR: Unable to extract vid field') 
1417                         yahoo_vid 
= mobj
.group(1) 
1419                         url 
= 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid
, yahoo_id
) 
1420                         return self
._real
_extract
(url
, new_video
=False) 
1422                 # Retrieve video webpage to extract further information 
1423                 request 
= urllib2
.Request(url
) 
1425                         self
.report_download_webpage(video_id
) 
1426                         webpage 
= urllib2
.urlopen(request
).read() 
1427                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
1428                         self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
)) 
1431                 # Extract uploader and title from webpage 
1432                 self
.report_extraction(video_id
) 
1433                 mobj 
= re
.search(r
'<meta name="title" content="(.*)" />', webpage
) 
1435                         self
._downloader
.trouble(u
'ERROR: unable to extract video title') 
1437                 video_title 
= mobj
.group(1).decode('utf-8') 
1438                 simple_title 
= re
.sub(ur
'(?u)([^%s]+)' % simple_title_chars
, ur
'_', video_title
) 
1440                 mobj 
= re
.search(r
'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage
) 
1442                         self
._downloader
.trouble(u
'ERROR: unable to extract video uploader') 
1444                 video_uploader 
= mobj
.group(1).decode('utf-8') 
1446                 # Extract video thumbnail 
1447                 mobj 
= re
.search(r
'<link rel="image_src" href="(.*)" />', webpage
) 
1449                         self
._downloader
.trouble(u
'ERROR: unable to extract video thumbnail') 
1451                 video_thumbnail 
= mobj
.group(1).decode('utf-8') 
1453                 # Extract video description 
1454                 mobj 
= re
.search(r
'<meta name="description" content="(.*)" />', webpage
) 
1456                         self
._downloader
.trouble(u
'ERROR: unable to extract video description') 
1458                 video_description 
= mobj
.group(1).decode('utf-8') 
1459                 if not video_description
: video_description 
= 'No description available.' 
1461                 # Extract video height and width 
1462                 mobj 
= re
.search(r
'<meta name="video_height" content="([0-9]+)" />', webpage
) 
1464                         self
._downloader
.trouble(u
'ERROR: unable to extract video height') 
1466                 yv_video_height 
= mobj
.group(1) 
1468                 mobj 
= re
.search(r
'<meta name="video_width" content="([0-9]+)" />', webpage
) 
1470                         self
._downloader
.trouble(u
'ERROR: unable to extract video width') 
1472                 yv_video_width 
= mobj
.group(1) 
1474                 # Retrieve video playlist to extract media URL 
1475                 # I'm not completely sure what all these options are, but we 
1476                 # seem to need most of them, otherwise the server sends a 401. 
1477                 yv_lg 
= 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents 
1478                 yv_bitrate 
= '700'  # according to Wikipedia this is hard-coded 
1479                 request 
= urllib2
.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id 
+ 
1480                                           '&tech=flash&mode=playlist&lg=' + yv_lg 
+ '&bitrate=' + yv_bitrate 
+ '&vidH=' + yv_video_height 
+ 
1481                                           '&vidW=' + yv_video_width 
+ '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797') 
1483                         self
.report_download_webpage(video_id
) 
1484                         webpage 
= urllib2
.urlopen(request
).read() 
1485                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
1486                         self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
)) 
1489                 # Extract media URL from playlist XML 
1490                 mobj 
= re
.search(r
'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage
) 
1492                         self
._downloader
.trouble(u
'ERROR: Unable to extract media URL') 
1494                 video_url 
= urllib
.unquote(mobj
.group(1) + mobj
.group(2)).decode('utf-8') 
1495                 video_url 
= re
.sub(r
'(?u)&(.+?);', htmlentity_transform
, video_url
) 
1498                         # Process video information 
1499                         self
._downloader
.process_info({ 
1500                                 'id':           video_id
.decode('utf-8'), 
1502                                 'uploader':     video_uploader
, 
1503                                 'title':        video_title
, 
1504                                 'stitle':       simple_title
, 
1505                                 'ext':          video_extension
.decode('utf-8'), 
1506                                 'thumbnail':    video_thumbnail
.decode('utf-8'), 
1507                                 'description':  video_description
, 
1508                                 'thumbnail':    video_thumbnail
, 
1509                                 'description':  video_description
, 
1512                 except UnavailableVideoError
: 
1513                         self
._downloader
.trouble(u
'ERROR: unable to download video') 
1516 class GenericIE(InfoExtractor
): 
1517         """Generic last-resort information extractor.""" 
1519         def __init__(self
, downloader
=None): 
1520                 InfoExtractor
.__init
__(self
, downloader
) 
1526         def report_download_webpage(self
, video_id
): 
1527                 """Report webpage download.""" 
1528                 self
._downloader
.to_stdout(u
'WARNING: Falling back on generic information extractor.') 
1529                 self
._downloader
.to_stdout(u
'[generic] %s: Downloading webpage' % video_id
) 
1531         def report_extraction(self
, video_id
): 
1532                 """Report information extraction.""" 
1533                 self
._downloader
.to_stdout(u
'[generic] %s: Extracting information' % video_id
) 
1535         def _real_initialize(self
): 
1538         def _real_extract(self
, url
): 
1539                 # At this point we have a new video 
1540                 self
._downloader
.increment_downloads() 
1542                 video_id 
= url
.split('/')[-1] 
1543                 request 
= urllib2
.Request(url
) 
1545                         self
.report_download_webpage(video_id
) 
1546                         webpage 
= urllib2
.urlopen(request
).read() 
1547                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
1548                         self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
)) 
1550                 except ValueError, err
: 
1551                         # since this is the last-resort InfoExtractor, if 
1552                         # this error is thrown, it'll be thrown here 
1553                         self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
) 
1556                 # Start with something easy: JW Player in SWFObject 
1557                 mobj 
= re
.search(r
'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) 
1559                         # Broaden the search a little bit 
1560                         mobj = re.search(r'[^A
-Za
-z0
-9]?
(?
:file|source
)=(http
[^
\'"&]*)', webpage) 
1562                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) 
1565                 # It's possible that one of the regexes 
1566                 # matched, but returned an empty group: 
1567                 if mobj.group(1) is None: 
1568                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) 
1571                 video_url = urllib.unquote(mobj.group(1)) 
1572                 video_id  = os.path.basename(video_url) 
1574                 # here's a fun little line of code for you: 
1575                 video_extension = os.path.splitext(video_id)[1][1:] 
1576                 video_id        = os.path.splitext(video_id)[0] 
1578                 # it's tempting to parse this further, but you would 
1579                 # have to take into account all the variations like 
1580                 #   Video Title - Site Name 
1581                 #   Site Name | Video Title 
1582                 #   Video Title - Tagline | Site Name 
1583                 # and so on and so forth; it's just not practical 
1584                 mobj = re.search(r'<title>(.*)</title>', webpage) 
1586                         self._downloader.trouble(u'ERROR: unable to extract title') 
1588                 video_title = mobj.group(1).decode('utf-8') 
1589                 video_title = sanitize_title(video_title) 
1590                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title) 
1592                 # video uploader is domain name 
1593                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url) 
1595                         self._downloader.trouble(u'ERROR: unable to extract title') 
1597                 video_uploader = mobj.group(1).decode('utf-8') 
1600                         # Process video information 
1601                         self._downloader.process_info({ 
1602                                 'id':           video_id.decode('utf-8'), 
1603                                 'url':          video_url.decode('utf-8'), 
1604                                 'uploader':     video_uploader, 
1605                                 'title':        video_title, 
1606                                 'stitle':       simple_title, 
1607                                 'ext':          video_extension.decode('utf-8'), 
1611                 except UnavailableVideoError, err: 
1612                         self._downloader.trouble(u'ERROR: unable to download video') 
1615 class YoutubeSearchIE(InfoExtractor): 
1616         """Information Extractor for YouTube search queries.""" 
1617         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+' 
1618         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en' 
1619         _VIDEO_INDICATOR = r'href="/watch
\?v
=.+?
"' 
1620         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>' 
1622         _max_youtube_results = 1000 
1624         def __init__(self, youtube_ie, downloader=None): 
1625                 InfoExtractor.__init__(self, downloader) 
1626                 self._youtube_ie = youtube_ie 
1630                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None) 
1632         def report_download_page(self, query, pagenum): 
1633                 """Report attempt to download playlist page with given number.""" 
1634                 query = query.decode(preferredencoding()) 
1635                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum)) 
1637         def _real_initialize(self): 
1638                 self._youtube_ie.initialize() 
1640         def _real_extract(self, query): 
1641                 mobj = re.match(self._VALID_QUERY, query) 
1643                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query) 
1646                 prefix, query = query.split(':') 
1648                 query  = query.encode('utf-8') 
1650                         self._download_n_results(query, 1) 
1652                 elif prefix == 'all': 
1653                         self._download_n_results(query, self._max_youtube_results) 
1659                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query)) 
1661                                 elif n > self._max_youtube_results: 
1662                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n)) 
1663                                         n = self._max_youtube_results 
1664                                 self._download_n_results(query, n) 
1666                         except ValueError: # parsing prefix as integer fails 
1667                                 self._download_n_results(query, 1) 
1670         def _download_n_results(self, query, n): 
1671                 """Downloads a specified number of results for a query""" 
1674                 already_seen = set() 
1678                         self.report_download_page(query, pagenum) 
1679                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum) 
1680                         request = urllib2.Request(result_url, None, std_headers) 
1682                                 page = urllib2.urlopen(request).read() 
1683                         except (urllib2.URLError, httplib.HTTPException, socket.error), err: 
1684                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err)) 
1687                         # Extract video identifiers 
1688                         for mobj in re.finditer(self._VIDEO_INDICATOR, page): 
1689                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1] 
1690                                 if video_id not in already_seen: 
1691                                         video_ids.append(video_id) 
1692                                         already_seen.add(video_id) 
1693                                         if len(video_ids) == n: 
1694                                                 # Specified n videos reached 
1695                                                 for id in video_ids: 
1696                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id) 
1699                         if re.search(self._MORE_PAGES_INDICATOR, page) is None: 
1700                                 for id in video_ids: 
1701                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id) 
1704                         pagenum = pagenum + 1 
1706 class GoogleSearchIE(InfoExtractor): 
1707         """Information Extractor for Google Video search queries.""" 
1708         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+' 
1709         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en' 
1710         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&' 
1711         _MORE_PAGES_INDICATOR = r'<span>Next</span>' 
1713         _max_google_results = 1000 
1715         def __init__(self, google_ie, downloader=None): 
1716                 InfoExtractor.__init__(self, downloader) 
1717                 self._google_ie = google_ie 
1721                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None) 
1723         def report_download_page(self, query, pagenum): 
1724                 """Report attempt to download playlist page with given number.""" 
1725                 query = query.decode(preferredencoding()) 
1726                 self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum)) 
1728         def _real_initialize(self): 
1729                 self._google_ie.initialize() 
1731         def _real_extract(self, query): 
1732                 mobj = re.match(self._VALID_QUERY, query) 
1734                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query) 
1737                 prefix, query = query.split(':') 
1739                 query  = query.encode('utf-8') 
1741                         self._download_n_results(query, 1) 
1743                 elif prefix == 'all': 
1744                         self._download_n_results(query, self._max_google_results) 
1750                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query)) 
1752                                 elif n > self._max_google_results: 
1753                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n)) 
1754                                         n = self._max_google_results 
1755                                 self._download_n_results(query, n) 
1757                         except ValueError: # parsing prefix as integer fails 
1758                                 self._download_n_results(query, 1) 
1761         def _download_n_results(self, query, n): 
1762                 """Downloads a specified number of results for a query""" 
1765                 already_seen = set() 
1769                         self.report_download_page(query, pagenum) 
1770                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum) 
1771                         request = urllib2.Request(result_url, None, std_headers) 
1773                                 page = urllib2.urlopen(request).read() 
1774                         except (urllib2.URLError, httplib.HTTPException, socket.error), err: 
1775                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err)) 
1778                         # Extract video identifiers 
1779                         for mobj in re.finditer(self._VIDEO_INDICATOR, page): 
1780                                 video_id = mobj.group(1) 
1781                                 if video_id not in already_seen: 
1782                                         video_ids.append(video_id) 
1783                                         already_seen.add(video_id) 
1784                                         if len(video_ids) == n: 
1785                                                 # Specified n videos reached 
1786                                                 for id in video_ids: 
1787                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id) 
1790                         if re.search(self._MORE_PAGES_INDICATOR, page) is None: 
1791                                 for id in video_ids: 
1792                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id) 
1795                         pagenum = pagenum + 1 
1797 class YahooSearchIE(InfoExtractor): 
1798         """Information Extractor for Yahoo! Video search queries.""" 
1799         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+' 
1800         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s' 
1801         _VIDEO_INDICATOR = r'href="http
://video\
.yahoo\
.com
/watch
/([0-9]+/[0-9]+)"' 
1802         _MORE_PAGES_INDICATOR = r'\s*Next' 
1804         _max_yahoo_results = 1000 
1806         def __init__(self, yahoo_ie, downloader=None): 
1807                 InfoExtractor.__init__(self, downloader) 
1808                 self._yahoo_ie = yahoo_ie 
1812                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None) 
1814         def report_download_page(self, query, pagenum): 
1815                 """Report attempt to download playlist page with given number.""" 
1816                 query = query.decode(preferredencoding()) 
1817                 self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum)) 
1819         def _real_initialize(self): 
1820                 self._yahoo_ie.initialize() 
1822         def _real_extract(self, query): 
1823                 mobj = re.match(self._VALID_QUERY, query) 
1825                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query) 
1828                 prefix, query = query.split(':') 
1830                 query  = query.encode('utf-8') 
1832                         self._download_n_results(query, 1) 
1834                 elif prefix == 'all': 
1835                         self._download_n_results(query, self._max_yahoo_results) 
1841                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query)) 
1843                                 elif n > self._max_yahoo_results: 
1844                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n)) 
1845                                         n = self._max_yahoo_results 
1846                                 self._download_n_results(query, n) 
1848                         except ValueError: # parsing prefix as integer fails 
1849                                 self._download_n_results(query, 1) 
1852         def _download_n_results(self, query, n): 
1853                 """Downloads a specified number of results for a query""" 
1856                 already_seen = set() 
1860                         self.report_download_page(query, pagenum) 
1861                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum) 
1862                         request = urllib2.Request(result_url, None, std_headers) 
1864                                 page = urllib2.urlopen(request).read() 
1865                         except (urllib2.URLError, httplib.HTTPException, socket.error), err: 
1866                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err)) 
1869                         # Extract video identifiers 
1870                         for mobj in re.finditer(self._VIDEO_INDICATOR, page): 
1871                                 video_id = mobj.group(1) 
1872                                 if video_id not in already_seen: 
1873                                         video_ids.append(video_id) 
1874                                         already_seen.add(video_id) 
1875                                         if len(video_ids) == n: 
1876                                                 # Specified n videos reached 
1877                                                 for id in video_ids: 
1878                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id) 
1881                         if re.search(self._MORE_PAGES_INDICATOR, page) is None: 
1882                                 for id in video_ids: 
1883                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id) 
1886                         pagenum = pagenum + 1 
1888 class YoutubePlaylistIE(InfoExtractor): 
1889         """Information Extractor for YouTube playlists.""" 
1891         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*' 
1892         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en' 
1893         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&' 
1894         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>' 
1897         def __init__(self, youtube_ie, downloader=None): 
1898                 InfoExtractor.__init__(self, downloader) 
1899                 self._youtube_ie = youtube_ie 
1903                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None) 
1905         def report_download_page(self, playlist_id, pagenum): 
1906                 """Report attempt to download playlist page with given number.""" 
1907                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum)) 
1909         def _real_initialize(self): 
1910                 self._youtube_ie.initialize() 
1912         def _real_extract(self, url): 
1913                 # Extract playlist id 
1914                 mobj = re.match(self._VALID_URL, url) 
1916                         self._downloader.trouble(u'ERROR: invalid url: %s' % url) 
1919                 # Download playlist pages 
1920                 playlist_id = mobj.group(1) 
1925                         self.report_download_page(playlist_id, pagenum) 
1926                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers) 
1928                                 page = urllib2.urlopen(request).read() 
1929                         except (urllib2.URLError, httplib.HTTPException, socket.error), err: 
1930                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err)) 
1933                         # Extract video identifiers 
1935                         for mobj in re.finditer(self._VIDEO_INDICATOR, page): 
1936                                 if mobj.group(1) not in ids_in_page: 
1937                                         ids_in_page.append(mobj.group(1)) 
1938                         video_ids.extend(ids_in_page) 
1940                         if re.search(self._MORE_PAGES_INDICATOR, page) is None: 
1942                         pagenum = pagenum + 1 
1944                 playliststart = self._downloader.params.get('playliststart', 1) 
1945                 playliststart -= 1 #our arrays are zero-based but the playlist is 1-based 
1946                 if playliststart > 0: 
1947                         video_ids = video_ids[playliststart:] 
1949                 for id in video_ids: 
1950                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id) 
1953 class YoutubeUserIE(InfoExtractor): 
1954         """Information Extractor for YouTube users.""" 
1956         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)' 
1957         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s' 
1958         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this. 
1961         def __init__(self, youtube_ie, downloader=None): 
1962                 InfoExtractor.__init__(self, downloader) 
1963                 self._youtube_ie = youtube_ie 
1967                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None) 
1969         def report_download_page(self, username): 
1970                 """Report attempt to download user page.""" 
1971                 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username)) 
1973         def _real_initialize(self): 
1974                 self._youtube_ie.initialize() 
1976         def _real_extract(self, url): 
1978                 mobj = re.match(self._VALID_URL, url) 
1980                         self._downloader.trouble(u'ERROR: invalid url: %s' % url) 
1983                 # Download user page 
1984                 username = mobj.group(1) 
1988                 self.report_download_page(username) 
1989                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers) 
1991                         page = urllib2.urlopen(request).read() 
1992                 except (urllib2.URLError, httplib.HTTPException, socket.error), err: 
1993                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err)) 
1996                 # Extract video identifiers 
1999                 for mobj in re.finditer(self._VIDEO_INDICATOR, page): 
2000                         if mobj.group(1) not in ids_in_page: 
2001                                 ids_in_page.append(mobj.group(1)) 
2002                 video_ids.extend(ids_in_page) 
2004                 playliststart = self._downloader.params.get('playliststart', 1) 
2005                 playliststart = playliststart-1 #our arrays are zero-based but the playlist is 1-based 
2006                 if playliststart > 0: 
2007                         video_ids = video_ids[playliststart:]    
2009                 for id in video_ids: 
2010                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id) 
class PostProcessor(object):
	"""Base class for post processors.

	A PostProcessor is attached to a downloader through the downloader's
	add_post_processor() method, following the same "mutual registration"
	scheme as InfoExtractor objects. After every successful download the
	downloader walks its chain of PostProcessors, calling run() on each
	one — first with the initial information dictionary, then with
	whatever the previous processor returned. The chain stops as soon as
	a processor returns None or the end of the chain is reached.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		"information" is a dictionary like the ones composed by the
		InfoExtractors, with one extra field: "filepath", pointing to
		the downloaded file.

		Returning None stops the postprocessing chain; returning an
		information dictionary (possibly with some fields changed)
		passes it on to the next processor in the chain. May raise
		PostProcessingError, which the downloader takes into account.
		"""
		return information # by default, do nothing
2059 ### MAIN PROGRAM ### 
2060 if __name__ == '__main__': 
2062                 # Modules needed only when running the main program 
2066                 # Function to update the program file with the latest version from bitbucket.org 
2067                 def update_self(downloader, filename): 
2068                         # Note: downloader only used for options 
2069                         if not os.access (filename, os.W_OK): 
2070                                 sys.exit('ERROR: no write permissions on %s' % filename) 
2072                         downloader.to_stdout('Updating to latest stable version...') 
2073                         latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION' 
2074                         latest_version = urllib.urlopen(latest_url).read().strip() 
2075                         prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version 
2076                         newcontent = urllib.urlopen(prog_url).read() 
2077                         stream = open(filename, 'w') 
2078                         stream.write(newcontent) 
2080                         downloader.to_stdout('Updated to version %s' % latest_version) 
2082                 # General configuration 
# NOTE(review): This chunk is the tail of youtube-dl's top-level entry
# try-block (Python 2, version string '2010.08.04' below). It was scraped
# from a gitweb HTML page: each line still carries the original file's line
# number (2083, 2084, ...) as a literal text prefix, and the scrape elided
# every line whose number is missing from the sequence (2086, 2092-2093,
# 2108, 2117, 2130, 2147, 2162, 2164, 2166, 2168, 2170-2171, 2176, 2179,
# 2199, 2204, 2208, 2222-2223, 2250, 2262, 2266-2267, 2270-2271, 2275-2276,
# 2278-2279, 2281). The code below therefore does NOT parse as-is and must
# be reconstructed against the original source before any behavioral edit.
#
# -- Global network setup: install a proxy-aware opener, then a
#    cookie-processing opener (the second install replaces the first's
#    handler chain), and set a 5-minute default socket timeout.
2083                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
2084                 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
2085                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
# -- Command-line parser. NOTE(review): the closing `)` of this
#    OptionParser(...) call (original line 2092/2093) was elided by the
#    scrape. conflict_handler='resolve' lets -h/-v below override the
#    optparse built-ins.
2087                 # Parse command line
2088                 parser = optparse.OptionParser(
2089                         usage='Usage: %prog [options] url...',
2090                         version='2010.08.04',
2091                         conflict_handler='resolve',
2094                 parser.add_option('-h', '--help',
2095                                 action='help', help='print this help text and exit')
2096                 parser.add_option('-v', '--version',
2097                                 action='version', help='print program version and exit')
2098                 parser.add_option('-U', '--update',
2099                                 action='store_true', dest='update_self', help='update this program to latest stable version')
2100                 parser.add_option('-i', '--ignore-errors',
2101                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2102                 parser.add_option('-r', '--rate-limit',
2103                                 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2104                 parser.add_option('-R', '--retries',
2105                                 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2106                 parser.add_option('--playlist-start',
2107                                 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
# -- Authentication option group (-u/-p/-n); mutual-exclusion with .netrc
#    is enforced after parse_args() below, not here.
2109                 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2110                 authentication.add_option('-u', '--username',
2111                                 dest='username', metavar='USERNAME', help='account username')
2112                 authentication.add_option('-p', '--password',
2113                                 dest='password', metavar='PASSWORD', help='account password')
2114                 authentication.add_option('-n', '--netrc',
2115                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2116                 parser.add_option_group(authentication)
# -- Video format option group. -m and --all-formats are store_const
#    aliases writing into the same dest='format' as -f; '-1' is the
#    all-formats sentinel checked in the outtmpl selection below.
2118                 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2119                 video_format.add_option('-f', '--format',
2120                                 action='store', dest='format', metavar='FORMAT', help='video format code')
2121                 video_format.add_option('-m', '--mobile-version',
2122                                 action='store_const', dest='format', help='alias for -f 17', const='17')
2123                 video_format.add_option('--all-formats',
2124                                 action='store_const', dest='format', help='download all available video formats', const='-1')
2125                 video_format.add_option('--max-quality',
2126                                 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2127                 video_format.add_option('-b', '--best-quality',
2128                                 action='store_true', dest='bestquality', help='download the best video quality (DEPRECATED)')
2129                 parser.add_option_group(video_format)
# -- Verbosity / simulation option group. Each --get-* flag implies both
#    quiet and simulate (combined into the FileDownloader params below).
2131                 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2132                 verbosity.add_option('-q', '--quiet',
2133                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2134                 verbosity.add_option('-s', '--simulate',
2135                                 action='store_true', dest='simulate', help='do not download video', default=False)
2136                 verbosity.add_option('-g', '--get-url',
2137                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2138                 verbosity.add_option('-e', '--get-title',
2139                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2140                 verbosity.add_option('--get-thumbnail',
2141                                 action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
2142                 verbosity.add_option('--get-description',
2143                                 action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
2144                 verbosity.add_option('--no-progress',
2145                                 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2146                 parser.add_option_group(verbosity)
# -- Filesystem option group (-t/-l/-o/-a/-w/-c).
2148                 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2149                 filesystem.add_option('-t', '--title',
2150                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
2151                 filesystem.add_option('-l', '--literal',
2152                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2153                 filesystem.add_option('-o', '--output',
2154                                 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2155                 filesystem.add_option('-a', '--batch-file',
2156                                 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2157                 filesystem.add_option('-w', '--no-overwrites',
2158                                 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2159                 filesystem.add_option('-c', '--continue',
2160                                 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2161                 parser.add_option_group(filesystem)
2163                 (opts, args) = parser.parse_args()
# -- Batch-file reading: URLs, one per line, stripped and blanks dropped.
#    NOTE(review): the scrape elided the `batchurls = []` default (2166),
#    the `try:` (2168), the stdin branch body for '-' (2170-2171), and the
#    `except IOError:` header (2176) around the sys.exit below.
2165                 # Batch file verification
2167                 if opts.batchfile is not None:
2169                                 if opts.batchfile == '-':
2172                                         batchfd = open(opts.batchfile, 'r')
2173                                 batchurls = batchfd.readlines()
2174                                 batchurls = [x.strip() for x in batchurls]
2175                                 batchurls = [x for x in batchurls if len(x) > 0]
2177                                 sys.exit(u'ERROR: batch file could not be read')
2178                 all_urls = batchurls + args
# -- Post-parse validation: option conflicts, interactive password prompt,
#    and numeric coercion of ratelimit/retries/playliststart.
#    NOTE(review): the `try:` headers before the long() conversions
#    (original lines 2199 and 2204) were elided by the scrape.
2180                 # Conflicting, missing and erroneous options
2181                 if opts.bestquality:
2182                         print >>sys.stderr, u'\nWARNING: -b/--best-quality IS DEPRECATED AS IT IS THE DEFAULT BEHAVIOR NOW\n'
2183                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2184                         parser.error(u'using .netrc conflicts with giving username/password')
2185                 if opts.password is not None and opts.username is None:
2186                         parser.error(u'account username missing')
2187                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
2188                         parser.error(u'using output template conflicts with using title or literal title')
2189                 if opts.usetitle and opts.useliteral:
2190                         parser.error(u'using title conflicts with using literal title')
2191                 if opts.username is not None and opts.password is None:
2192                         opts.password = getpass.getpass(u'Type account password and press return:')
2193                 if opts.ratelimit is not None:
2194                         numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2195                         if numeric_limit is None:
2196                                 parser.error(u'invalid rate limit specified')
2197                         opts.ratelimit = numeric_limit
2198                 if opts.retries is not None:
2200                                 opts.retries = long(opts.retries)
2201                         except (TypeError, ValueError), err:
2202                                 parser.error(u'invalid retry count specified')
2203                 if opts.playliststart is not None:
2205                                 opts.playliststart = long(opts.playliststart)
2206                         except (TypeError, ValueError), err:
2207                                 parser.error(u'invalid playlist page specified')
# -- Information-extractor instantiation. Several IEs wrap another IE
#    (e.g. MetacafeIE/playlist/user/search wrap youtube_ie) so the inner
#    one handles the resolved video URLs.
2209                 # Information extractors
2210                 youtube_ie = YoutubeIE()
2211                 metacafe_ie = MetacafeIE(youtube_ie)
2212                 dailymotion_ie = DailymotionIE()
2213                 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
2214                 youtube_user_ie = YoutubeUserIE(youtube_ie)
2215                 youtube_search_ie = YoutubeSearchIE(youtube_ie)
2216                 google_ie = GoogleIE()
2217                 google_search_ie = GoogleSearchIE(google_ie)
2218                 photobucket_ie = PhotobucketIE()
2219                 yahoo_ie = YahooIE()
2220                 yahoo_search_ie = YahooSearchIE(yahoo_ie)
2221                 generic_ie = GenericIE()
# -- FileDownloader construction. Any --get-* flag forces both 'quiet' and
#    'simulate'. 'outtmpl' is picked by the first truthy arm of the
#    or-chain: explicit -o template, then format==-1 variants (which embed
#    %(format)s), then title/literal variants, then the plain id default.
#    NOTE(review): the closing `})` of this call (original line 2250) was
#    elided by the scrape.
2224                 fd = FileDownloader({
2225                         'usenetrc': opts.usenetrc,
2226                         'username': opts.username,
2227                         'password': opts.password,
2228                         'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2229                         'forceurl': opts.geturl,
2230                         'forcetitle': opts.gettitle,
2231                         'forcethumbnail': opts.getthumbnail,
2232                         'forcedescription': opts.getdescription,
2233                         'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2234                         'format': opts.format,
2235                         'format_limit': opts.format_limit,
2236                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
2237                                 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2238                                 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2239                                 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
2240                                 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2241                                 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
2242                                 or u'%(id)s.%(ext)s'),
2243                         'ignoreerrors': opts.ignoreerrors,
2244                         'ratelimit': opts.ratelimit,
2245                         'nooverwrites': opts.nooverwrites,
2246                         'retries': opts.retries,
2247                         'continuedl': opts.continue_dl,
2248                         'noprogress': opts.noprogress,
2249                         'playliststart': opts.playliststart,
# -- Extractor registration order matters: search/playlist/user IEs are
#    tried before the plain site IEs they wrap, and GenericIE goes last as
#    the catch-all fallback.
2251                 fd.add_info_extractor(youtube_search_ie)
2252                 fd.add_info_extractor(youtube_pl_ie)
2253                 fd.add_info_extractor(youtube_user_ie)
2254                 fd.add_info_extractor(metacafe_ie)
2255                 fd.add_info_extractor(dailymotion_ie)
2256                 fd.add_info_extractor(youtube_ie)
2257                 fd.add_info_extractor(google_ie)
2258                 fd.add_info_extractor(google_search_ie)
2259                 fd.add_info_extractor(photobucket_ie)
2260                 fd.add_info_extractor(yahoo_ie)
2261                 fd.add_info_extractor(yahoo_search_ie)
2263                 # This must come last since it's the
2264                 # fallback if none of the others work
2265                 fd.add_info_extractor(generic_ie)
# -- Self-update, URL-presence check (-U alone is allowed to run with no
#    URLs), then the download itself. NOTE(review): the scrape elided the
#    lines between 2277 and 2280 (presumably `sys.exit(retcode)` and the
#    enclosing try's `except` transition) and the DownloadError handler
#    body (2281) — confirm against the original source.
2268                 if opts.update_self:
2269                         update_self(fd, sys.argv[0])
2272                 if len(all_urls) < 1:
2273                         if not opts.update_self:
2274                                 parser.error(u'you must provide at least one URL')
2277                 retcode = fd.download(all_urls)
2280         except DownloadError:
2282         except SameFileError:
2283                 sys.exit(u'ERROR: fixed output name but more than one file to download')
2284         except KeyboardInterrupt:
2285                 sys.exit(u'\nERROR: Interrupted by user')