Raphaël G. Git Repositories - youtubedl/blob - youtube-dl
9e2862123c124ba82e0e235181048fbf983b3dfc
[youtubedl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
7 import htmlentitydefs
8 import httplib
9 import locale
10 import math
11 import netrc
12 import os
13 import os.path
14 import re
15 import socket
16 import string
17 import subprocess
18 import sys
19 import time
20 import urllib
21 import urllib2
22
23 # parse_qs was moved from the cgi module to the urlparse module recently.
24 try:
25 from urlparse import parse_qs
26 except ImportError:
27 from cgi import parse_qs
28
# Default HTTP request headers; they are passed as the header dictionary of
# the urllib2.Request objects built throughout this file.
std_headers = {
    'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept-Language': 'en-us,en;q=0.5',
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2',
}

# Characters kept verbatim in simplified titles (ASCII letters and digits),
# stored as a unicode string so it can be embedded in unicode patterns.
simple_title_chars = u'%s%s' % (string.ascii_letters, string.digits)
37
def preferredencoding():
    """Get preferred encoding.

    Returns the encoding reported by locale.getpreferredencoding() when it
    names a codec that can actually encode text, and 'UTF-8' otherwise.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec: an unknown, empty or None encoding name raises here.
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (locale/codec failures come in several types),
        # but no longer a bare "except:", so KeyboardInterrupt and
        # SystemExit are not swallowed.
        pref = 'UTF-8'
    return pref
53
class DownloadError(Exception):
    """Error raised when a download problem is not being ignored.

    FileDownloader objects raise it, carrying the error message, when
    they have not been configured to continue on errors.
    """
62
class SameFileError(Exception):
    """Error raised when several downloads would share one output file.

    FileDownloader objects raise it when they detect that multiple files
    would have to be written to the same file on disk.
    """
70
class PostProcessingError(Exception):
    """Error signalling a failed postprocessing task.

    A PostProcessor's .run() method may raise it to indicate that the
    postprocessing step went wrong.
    """
78
class UnavailableFormatError(Exception):
    """Error raised when a requested video format cannot be obtained.

    It is thrown when a video is requested in a format that is not
    available for that video.
    """
86
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.

    Attributes:
        downloaded: number of bytes actually received.
        expected: number of bytes announced by the server.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Initialise the base class so that args, str() and repr() carry
        # the two byte counts instead of being empty.
        Exception.__init__(self, downloaded, expected)
        self.downloaded = downloaded
        self.expected = expected
101
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:     Username for authentication purposes.
    password:     Password for authentication purposes.
    usenetrc:     Use netrc for authentication instead.
    quiet:        Do not print messages to stdout.
    forceurl:     Force printing final URL.
    forcetitle:   Force printing title.
    simulate:     Do not download the video files.
    format:       Video format code.
    outtmpl:      Template for output names.
    ignoreerrors: Do not stop on download errors.
    ratelimit:    Download speed limit, in bytes/sec.
    nooverwrites: Prevent overwriting files.
    continuedl:   Try to continue downloads if possible.
    """

    params = None
    _ies = []
    _pps = []
    _download_retcode = None

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        self._ies = []
        self._pps = []
        self._download_retcode = 0
        self.params = params

    @staticmethod
    def pmkdir(filename):
        """Create directory components in filename. Similar to Unix "mkdir -p".

        The last path component is taken to be the file name and is not
        created. os.mkdir() errors (OSError) propagate to the caller.
        """
        components = filename.split(os.sep)
        for count in range(1, len(components)):
            # Finish names with the separator, e.g. "a/", "a/b/".
            directory = os.sep.join(components[:count]) + os.sep
            if not os.path.exists(directory):
                os.mkdir(directory)

    @staticmethod
    def format_bytes(bytes):
        """Format a byte count as a short string such as '1.50k'.

        Accepts a number, a numeric string or None (rendered as 'N/A').
        The suffix is chosen by repeated division by 1024, so values below
        one byte stay in 'b' and huge values stay in 'Y' instead of
        indexing outside the suffix table.
        """
        if bytes is None:
            return 'N/A'
        suffixes = 'bkMGTPEZY'
        value = float(bytes)
        exponent = 0
        while value >= 1024.0 and exponent < len(suffixes) - 1:
            value /= 1024.0
            exponent += 1
        return '%.2f%s' % (value, suffixes[exponent])

    @staticmethod
    def calc_percent(byte_counter, data_len):
        """Return the completed percentage as a 6-character string.

        data_len may be None (length unknown) or a number/numeric string;
        an unknown or zero length yields the placeholder '---.-%'.
        """
        if data_len is None:
            return '---.-%'
        total = float(data_len)
        if total == 0.0:
            # A zero Content-Length would otherwise divide by zero.
            return '---.-%'
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / total * 100.0))

    @staticmethod
    def calc_eta(start, now, total, current):
        """Return the estimated remaining time as 'MM:SS'.

        '--:--' is returned when the total is unknown, nothing has been
        measured yet, or the estimate exceeds 99 minutes.
        """
        if total is None:
            return '--:--'
        dif = now - start
        if current == 0 or dif < 0.001:  # One millisecond
            return '--:--'
        rate = float(current) / dif
        eta = int((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        if eta_mins > 99:
            return '--:--'
        return '%02d:%02d' % (eta_mins, eta_secs)

    @staticmethod
    def calc_speed(start, now, bytes):
        """Return the average speed since start as a 10-character string."""
        dif = now - start
        if bytes == 0 or dif < 0.001:  # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    @staticmethod
    def best_block_size(elapsed_time, bytes):
        """Choose the next read size from the last block's size and duration.

        The result stays between half and double the last block size and
        never exceeds 4 MB.
        """
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304)  # Do not surpass 4 MB
        if elapsed_time < 0.001:
            return int(new_max)
        rate = bytes / elapsed_time
        if rate > new_max:
            return int(new_max)
        if rate < new_min:
            return int(new_min)
        return int(rate)

    @staticmethod
    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into an integer.

        Returns None when the string is not a number optionally followed
        by one of the suffixes k, M, G, T, P, E, Z, Y (case-insensitive).
        """
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        if matchobj is None:
            return None
        number = float(matchobj.group(1))
        # An empty suffix is found at index 0, i.e. a multiplier of 1.
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return int(round(number * multiplier))

    @staticmethod
    def verify_url(url):
        """Verify a URL is valid and data could be downloaded. Return real data URL."""
        request = urllib2.Request(url, None, std_headers)
        data = urllib2.urlopen(request)
        try:
            data.read(1)
            return data.geturl()
        finally:
            # Close the connection even when the probe read fails.
            data.close()

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        ie.set_downloader(self)

    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        self._pps.append(pp)
        pp.set_downloader(self)

    def to_stdout(self, message, skip_eol=False):
        """Print message to stdout if not in quiet mode."""
        if not self.params.get('quiet', False):
            terminator = [u'\n', u''][skip_eol]
            # write() instead of "print x,": the print statement's softspace
            # handling inserted a stray space between progress updates.
            sys.stdout.write((u'%s%s' % (message, terminator)).encode(preferredencoding()))
            sys.stdout.flush()

    def to_stderr(self, message):
        """Print message to stderr."""
        sys.stderr.write(message.encode(preferredencoding()) + '\n')

    def fixed_template(self):
        """Checks if the output template is fixed."""
        return (re.search(r'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        self._download_retcode = 1

    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
            return
        elapsed = time.time() - start_time
        if elapsed <= 0.0:
            return
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            # Sleep just long enough for the average to drop to the limit.
            time.sleep((byte_counter - rate_limit * elapsed) / rate_limit)

    def report_destination(self, filename):
        """Report destination filename."""
        self.to_stdout(u'[download] Destination: %s' % filename)

    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

    def report_resuming_byte(self, resume_len):
        """Report attempt to resume at given byte."""
        self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        self.to_stdout(u'[download] %s has already been downloaded' % file_name)

    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_stdout(u'[download] Unable to resume')

    def report_finish(self):
        """Report download finished."""
        self.to_stdout(u'')

    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor.

        In simulate mode the URL is only verified and the forced printings
        are done. Otherwise the output name is built from the template,
        the video is downloaded and the postprocessing chain is run.

        Raises UnavailableFormatError when the URL cannot be verified or
        the download fails with an OSError/IOError; other problems go
        through trouble().
        """
        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            # Verify URL if it's an HTTP one
            if info_dict['url'].startswith('http'):
                try:
                    info_dict['url'] = self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
                except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error):
                    raise UnavailableFormatError

            # Forced printings
            if self.params.get('forcetitle', False):
                sys.stdout.write(info_dict['title'].encode(preferredencoding()) + '\n')
            if self.params.get('forceurl', False):
                sys.stdout.write(info_dict['url'].encode(preferredencoding()) + '\n')

            return

        try:
            template_dict = dict(info_dict)
            template_dict['epoch'] = u'%d' % int(time.time())
            filename = self.params['outtmpl'] % template_dict
        except (ValueError, KeyError):
            # sys.exc_info() keeps the handler valid on every Python 2.x.
            err = sys.exc_info()[1]
            self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
            # Without this return, "filename" would be unbound below
            # whenever errors are being ignored.
            return
        if self.params.get('nooverwrites', False) and os.path.exists(filename):
            self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
            return

        try:
            self.pmkdir(filename)
        except (OSError, IOError):
            err = sys.exc_info()[1]
            self.trouble('ERROR: unable to create directories: %s' % str(err))
            return

        try:
            success = self._do_download(filename, info_dict['url'].encode('utf-8'))
        except (OSError, IOError):
            # NOTE: in Python 2 urllib2.URLError derives from IOError, so
            # URL errors are reported through this branch as well.
            raise UnavailableFormatError
        except (urllib2.URLError, httplib.HTTPException, socket.error):
            err = sys.exc_info()[1]
            self.trouble('ERROR: unable to download video data: %s' % str(err))
            return
        except ContentTooShortError:
            err = sys.exc_info()[1]
            self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
            return

        if success:
            try:
                self.post_process(filename, info_dict)
            except PostProcessingError:
                err = sys.exc_info()[1]
                self.trouble('ERROR: postprocessing: %s' % str(err))
                return

    def download(self, url_list):
        """Download a given list of URLs.

        Returns the accumulated return code (0, or 1 if any ignored error
        occurred). Raises SameFileError when several URLs would be written
        to one fixed output name.
        """
        if len(url_list) > 1 and self.fixed_template():
            raise SameFileError(self.params['outtmpl'])

        for url in url_list:
            suitable_found = False
            for ie in self._ies:
                # Go to next InfoExtractor if not suitable
                if not ie.suitable(url):
                    continue

                # Suitable InfoExtractor found
                suitable_found = True

                # Extract information from URL and process it
                ie.extract(url)

                # Suitable InfoExtractor had been found; go to next URL
                break

            if not suitable_found:
                self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

        return self._download_retcode

    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file.

        Each PostProcessor receives the information returned by the
        previous one; the chain stops as soon as one returns None.
        """
        info = dict(ie_info)
        info['filepath'] = filename
        for pp in self._pps:
            info = pp.run(info)
            if info is None:
                break

    def _download_with_rtmpdump(self, filename, url):
        """Download an RTMP URL by running the external "rtmpdump" program.

        Returns True on success and False when rtmpdump cannot be run or
        finally exits with an error (after going through trouble()).
        """
        self.report_destination(filename)

        # Check for rtmpdump first
        try:
            devnull = open(os.path.devnull, 'w')
            try:
                subprocess.call(['rtmpdump', '-h'], stdout=devnull, stderr=subprocess.STDOUT)
            finally:
                devnull.close()
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
            return False

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrupted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q', '-r', url, '-o', filename]
        retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
        while retval == 2 or retval == 1:
            self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename), skip_eol=True)
            time.sleep(2.0)  # This seems to be needed
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
        if retval == 0:
            self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
            return True
        else:
            self.trouble('ERROR: rtmpdump exited with code %d' % retval)
            return False

    def _do_download(self, filename, url):
        """Download url into filename, resuming when requested and possible.

        Returns True when the file is complete (or was already complete)
        and False when the output file cannot be opened. Raises
        ContentTooShortError when fewer bytes arrive than the server
        announced; urllib2/socket errors propagate to the caller.
        """
        # Attempt to download using rtmpdump
        if url.startswith('rtmp'):
            return self._download_with_rtmpdump(filename, url)

        stream = None
        open_mode = 'wb'
        basic_request = urllib2.Request(url, None, std_headers)
        request = urllib2.Request(url, None, std_headers)

        # Establish possible resume length
        if os.path.isfile(filename):
            resume_len = os.path.getsize(filename)
        else:
            resume_len = 0

        # Request parameters in case of being able to resume
        if self.params.get('continuedl', False) and resume_len != 0:
            self.report_resuming_byte(resume_len)
            request.add_header('Range', 'bytes=%d-' % resume_len)
            open_mode = 'ab'

        # Establish connection
        try:
            data = urllib2.urlopen(request)
        except urllib2.HTTPError:
            err = sys.exc_info()[1]
            if err.code != 416:  # 416 is 'Requested range not satisfiable'
                raise
            # Unable to resume
            data = urllib2.urlopen(basic_request)
            # .get(): a missing header must yield None, not a KeyError.
            content_length = data.info().get('Content-Length', None)

            if content_length is not None and int(content_length) == resume_len:
                # Because the file had already been fully downloaded
                data.close()
                self.report_file_already_downloaded(filename)
                return True
            else:
                # Because the server didn't let us
                self.report_unable_to_resume()
                open_mode = 'wb'

        if open_mode == 'ab' and data.info().get('Content-Range', None) is None:
            # A partial (206) response carries Content-Range; without it the
            # server ignored the Range header and is sending the whole file,
            # which must not be appended to the partial one.
            self.report_unable_to_resume()
            open_mode = 'wb'

        data_len = data.info().get('Content-length', None)
        data_len_str = self.format_bytes(data_len)
        byte_counter = 0
        block_size = 1024
        start = time.time()
        try:
            while True:
                # Download and write
                before = time.time()
                data_block = data.read(block_size)
                after = time.time()
                data_block_len = len(data_block)
                if data_block_len == 0:
                    break
                byte_counter += data_block_len

                # Open file just in time
                if stream is None:
                    try:
                        stream = open(filename, open_mode)
                        self.report_destination(filename)
                    except (OSError, IOError):
                        err = sys.exc_info()[1]
                        self.trouble('ERROR: unable to open for writing: %s' % str(err))
                        return False
                stream.write(data_block)
                block_size = self.best_block_size(after - before, data_block_len)

                # Progress message
                percent_str = self.calc_percent(byte_counter, data_len)
                eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
                speed_str = self.calc_speed(start, time.time(), byte_counter)
                self.report_progress(percent_str, data_len_str, speed_str, eta_str)

                # Apply rate limit
                self.slow_down(start, byte_counter)
        finally:
            # Flush and release the output file and the connection even if
            # the transfer is interrupted.
            if stream is not None:
                stream.close()
            data.close()

        self.report_finish()
        if data_len is not None and byte_counter != int(data_len):
            raise ContentTooShortError(byte_counter, int(data_len))
        return True
509
class InfoExtractor(object):
    """Base class for information extractors.

    An information extractor is given a URL and extracts information
    about the video (or videos) that URL refers to: the real video URL,
    the title and simplified title, the author and others. That
    information is collected in a dictionary which is handed over to the
    FileDownloader, which processes it and possibly downloads the video
    to the file system. Each dictionary must include these fields:

    id:         Video identifier.
    url:        Final video URL.
    uploader:   Nickname of the video uploader.
    title:      Literal title.
    stitle:     Simplified title.
    ext:        Video filename extension.

    Subclasses are expected to redefine _real_initialize() and
    _real_extract(), plus the suitable() static method, and will usually
    be instantiated and added to the main downloader.
    """

    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Create the extractor, optionally bound to a downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @staticmethod
    def suitable(url):
        """Tell whether this IE can handle url; the base class never can."""
        return False

    def initialize(self):
        """Run the real initialization (authentication, etc) only once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed and return the result of _real_extract(url)."""
        self.initialize()
        result = self._real_extract(url)
        return result

    def set_downloader(self, downloader):
        """Remember the downloader in charge of this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Initialization hook; does nothing here. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Extraction hook; does nothing here. Redefine in subclasses."""
        pass
570
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    _available_formats = ['37', '22', '35', '18', '5', '17', '13', None]  # listed in order of priority for -b flag
    # Formats not listed here fall back to the 'flv' extension.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
    }

    @staticmethod
    def suitable(url):
        """Return True if url matches the YouTube URL pattern."""
        return (re.match(YoutubeIE._VALID_URL, url) is not None)

    @staticmethod
    def htmlentity_transform(matchobj):
        """Transforms an HTML entity to a Unicode character.

        matchobj is a match whose group 1 is the entity body (the text
        between '&' and ';'). Named entities, decimal references (#65)
        and hexadecimal references (#x41) are decoded; anything else,
        including references outside the supported code point range, is
        returned in its literal '&...;' form.
        """
        entity = matchobj.group(1)

        # Known non-numeric HTML entity
        if entity in htmlentitydefs.name2codepoint:
            return unichr(htmlentitydefs.name2codepoint[entity])

        # Numeric character reference. The hexadecimal form accepts the
        # digits a-f, and the whole entity body must match.
        mobj = re.match(r'#(?:[xX]([0-9A-Fa-f]+)|([0-9]+))$', entity)
        if mobj is not None:
            (hex_digits, dec_digits) = mobj.groups()
            if hex_digits is not None:
                codepoint = int(hex_digits, 16)
            else:
                codepoint = int(dec_digits, 10)
            try:
                return unichr(codepoint)
            except (ValueError, OverflowError):
                # Not a representable character: keep the literal entity.
                pass

        # Unknown entity in name, return its literal representation
        return (u'&%s;' % entity)

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_stdout(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_stdout(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_stdout(u'[youtube] Confirming age')

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the given format is not available for the video."""
        self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_stdout(u'[youtube] RTMP download detected')

    def _real_initialize(self):
        """Set the site language and, when credentials exist, log in and confirm age.

        Credentials come from the downloader's 'username'/'password'
        options or, with 'usenetrc', from the .netrc entry for the
        'youtube' machine. Failures before age confirmation only print a
        warning; an age confirmation failure goes through trouble().
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError):
                # sys.exc_info() keeps the handler valid on every Python 2.x.
                err = sys.exc_info()[1]
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        # Set language
        request = urllib2.Request(self._LANG_URL, None, std_headers)
        try:
            self.report_lang()
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error):
            err = sys.exc_info()[1]
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
            'current_form': 'loginForm',
            'next': '/',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # Getting the login form back means the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error):
            err = sys.exc_info()[1]
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
        try:
            self.report_age_confirmation()
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error):
            err = sys.exc_info()[1]
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Extract the video information for url and pass it to the downloader.

        When the 'format' option is '0', the known formats are tried in
        priority order, moving to the next one each time the downloader
        raises UnavailableFormatError.
        """
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Downloader parameters
        best_quality = False
        format_param = None
        quality_index = 0
        if self._downloader is not None:
            params = self._downloader.params
            format_param = params.get('format', None)
            if format_param == '0':
                format_param = self._available_formats[quality_index]
                best_quality = True

        while True:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            # Get video info
            video_info_url = 'http://www.youtube.com/get_video_info?&video_id=%s&el=detailpage&ps=default&eurl=&gl=US&hl=en' % video_id
            request = urllib2.Request(video_info_url, None, std_headers)
            try:
                self.report_video_info_webpage_download(video_id)
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
            except (urllib2.URLError, httplib.HTTPException, socket.error):
                err = sys.exc_info()[1]
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                return
            self.report_information_extraction(video_id)

            # "t" param
            if 'token' not in video_info:
                # Attempt to see if YouTube has issued an error message
                if 'reason' not in video_info:
                    self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
                    # Keep the raw response on disk so it can be reported.
                    stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
                    try:
                        stream.write(video_info_webpage)
                    finally:
                        stream.close()
                else:
                    reason = urllib.unquote_plus(video_info['reason'][0])
                    self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
                return
            token = urllib.unquote_plus(video_info['token'][0])
            video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
            if format_param is not None:
                video_real_url = '%s&fmt=%s' % (video_real_url, format_param)

            # Check possible RTMP download
            if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
                self.report_rtmp_download()
                video_real_url = video_info['conn'][0]

            # uploader
            if 'author' not in video_info:
                self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
                return
            video_uploader = urllib.unquote_plus(video_info['author'][0])

            # title
            if 'title' not in video_info:
                self._downloader.trouble(u'ERROR: unable to extract video title')
                return
            video_title = urllib.unquote_plus(video_info['title'][0])
            video_title = video_title.decode('utf-8')
            video_title = re.sub(r'(?u)&(.+?);', self.htmlentity_transform, video_title)
            # The path separator must not appear in file names.
            video_title = video_title.replace(os.sep, u'%')

            # simplified title
            simple_title = re.sub(u'(?u)([^%s]+)' % simple_title_chars, u'_', video_title)
            simple_title = simple_title.strip(u'_')

            try:
                # Process video information
                self._downloader.process_info({
                    'id': video_id.decode('utf-8'),
                    'url': video_real_url.decode('utf-8'),
                    'uploader': video_uploader.decode('utf-8'),
                    'title': video_title,
                    'stitle': simple_title,
                    'ext': video_extension.decode('utf-8'),
                })

                return

            except UnavailableFormatError:
                if best_quality:
                    if quality_index == len(self._available_formats) - 1:
                        # I don't ever expect this to happen
                        self._downloader.trouble(u'ERROR: no known formats available for video')
                        return
                    else:
                        self.report_unavailable_format(video_id, format_param)
                        quality_index += 1
                        format_param = self._available_formats[quality_index]
                        continue
                else:
                    self._downloader.trouble('ERROR: format not available for video')
                    return
815
816
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    _youtube_ie = None

    def __init__(self, youtube_ie, downloader=None):
        """Create the extractor; youtube_ie handles videos hosted on YouTube."""
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    @staticmethod
    def suitable(url):
        """Return True if url matches the Metacafe watch URL pattern."""
        matched = re.match(MetacafeIE._VALID_URL, url)
        return (matched is not None)

    def report_disclaimer(self):
        """Announce that the disclaimer page is being retrieved."""
        self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Announce the age confirmation attempt."""
        self._downloader.to_stdout(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Announce that the video webpage is being downloaded."""
        self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Announce that information is being extracted."""
        self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page, then post the family filter form."""
        # Retrieve disclaimer
        disclaimer_request = urllib2.Request(self._DISCLAIMER, None, std_headers)
        try:
            self.report_disclaimer()
            urllib2.urlopen(disclaimer_request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error):
            error = sys.exc_info()[1]
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(error))
            return

        # Confirm age
        form_fields = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        filter_request = urllib2.Request(self._FILTER_POST, urllib.urlencode(form_fields), std_headers)
        try:
            self.report_age_confirmation()
            urllib2.urlopen(filter_request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error):
            error = sys.exc_info()[1]
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(error))
            return

    def _real_extract(self, url):
        """Extract the video information for url and pass it to the downloader.

        Identifiers starting with "yt-" are delegated to the YouTube
        extractor given at construction time.
        """
        # The id and the simplified title both come from the URL itself
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = url_match.group(1)

        # Videos hosted on YouTube are handled by the YouTube extractor
        youtube_match = re.match(r'^yt-(.*)$', video_id)
        if youtube_match is not None:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % youtube_match.group(1))
            return

        simple_title = url_match.group(2).decode('utf-8')
        video_extension = 'flv'

        # The video webpage holds the rest of the information
        page_request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(page_request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error):
            error = sys.exc_info()[1]
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(error))
            return

        # Media URL, title and uploader, in that order
        self.report_extraction(video_id)
        media_match = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if media_match is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        # NOTE: a disabled variant of this code appended '?__gda__=<gdaKey>'
        # (gdaKey taken from the webpage); the media URL is used unchanged.
        video_url = urllib.unquote(media_match.group(1))

        title_match = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = title_match.group(1).decode('utf-8')

        uploader_match = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        if uploader_match is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = uploader_match.group(1)

        video_info = {
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'stitle': simple_title,
            'ext': video_extension.decode('utf-8'),
        }
        try:
            # Hand the collected information over to the downloader
            self._downloader.process_info(video_info)
        except UnavailableFormatError:
            self._downloader.trouble(u'ERROR: format not available for video')
941
942
943 class GoogleIE(InfoExtractor):
944 """Information extractor for video.google.com."""
945
946 _VALID_URL = r'(?:http://)?video\.google\.com/videoplay\?docid=([^\&]+).*'
947
948 def __init__(self, downloader=None):
949 InfoExtractor.__init__(self, downloader)
950
951 @staticmethod
952 def suitable(url):
953 return (re.match(GoogleIE._VALID_URL, url) is not None)
954
955 def report_download_webpage(self, video_id):
956 """Report webpage download."""
957 self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
958
959 def report_extraction(self, video_id):
960 """Report information extraction."""
961 self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
962
963 def _real_initialize(self):
964 return
965
966 def _real_extract(self, url):
967 # Extract id from URL
968 mobj = re.match(self._VALID_URL, url)
969 if mobj is None:
970 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
971 return
972
973 video_id = mobj.group(1)
974
975 video_extension = 'mp4'
976
977 # Retrieve video webpage to extract further information
978 request = urllib2.Request('http://video.google.com/videoplay?docid=%s' % video_id)
979 try:
980 self.report_download_webpage(video_id)
981 webpage = urllib2.urlopen(request).read()
982 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
983 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
984 return
985
986 # Extract URL, uploader, and title from webpage
987 self.report_extraction(video_id)
988 mobj = re.search(r"download_url:'(.*)'", webpage)
989 if mobj is None:
990 self._downloader.trouble(u'ERROR: unable to extract media URL')
991 return
992 mediaURL = urllib.unquote(mobj.group(1))
993 mediaURL = mediaURL.replace('\\x3d', '\x3d')
994 mediaURL = mediaURL.replace('\\x26', '\x26')
995
996 video_url = mediaURL
997
998 mobj = re.search(r'<title>(.*)</title>', webpage)
999 if mobj is None:
1000 self._downloader.trouble(u'ERROR: unable to extract title')
1001 return
1002 video_title = mobj.group(1).decode('utf-8')
1003
1004 # Google Video doesn't show uploader nicknames?
1005 video_uploader = 'uploader'
1006
1007 try:
1008 # Process video information
1009 self._downloader.process_info({
1010 'id': video_id.decode('utf-8'),
1011 'url': video_url.decode('utf-8'),
1012 'uploader': video_uploader.decode('utf-8'),
1013 'title': video_title.decode('utf-8'),
1014 'stitle': video_title.decode('utf-8'),
1015 'ext': video_extension.decode('utf-8'),
1016 })
1017 except UnavailableFormatError:
1018 self._downloader.trouble(u'ERROR: format not available for video')
1019
1020
1021 class PhotobucketIE(InfoExtractor):
1022 """Information extractor for photobucket.com."""
1023
1024 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1025
1026 def __init__(self, downloader=None):
1027 InfoExtractor.__init__(self, downloader)
1028
1029 @staticmethod
1030 def suitable(url):
1031 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1032
1033 def report_download_webpage(self, video_id):
1034 """Report webpage download."""
1035 self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1036
1037 def report_extraction(self, video_id):
1038 """Report information extraction."""
1039 self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
1040
1041 def _real_initialize(self):
1042 return
1043
1044 def _real_extract(self, url):
1045 # Extract id from URL
1046 mobj = re.match(self._VALID_URL, url)
1047 if mobj is None:
1048 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1049 return
1050
1051 video_id = mobj.group(1)
1052
1053 video_extension = 'flv'
1054
1055 # Retrieve video webpage to extract further information
1056 request = urllib2.Request(url)
1057 try:
1058 self.report_download_webpage(video_id)
1059 webpage = urllib2.urlopen(request).read()
1060 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1061 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1062 return
1063
1064 # Extract URL, uploader, and title from webpage
1065 self.report_extraction(video_id)
1066 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1067 if mobj is None:
1068 self._downloader.trouble(u'ERROR: unable to extract media URL')
1069 return
1070 mediaURL = urllib.unquote(mobj.group(1))
1071
1072 video_url = mediaURL
1073
1074 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1075 if mobj is None:
1076 self._downloader.trouble(u'ERROR: unable to extract title')
1077 return
1078 video_title = mobj.group(1).decode('utf-8')
1079
1080 video_uploader = mobj.group(2).decode('utf-8')
1081
1082 try:
1083 # Process video information
1084 self._downloader.process_info({
1085 'id': video_id.decode('utf-8'),
1086 'url': video_url.decode('utf-8'),
1087 'uploader': video_uploader.decode('utf-8'),
1088 'title': video_title.decode('utf-8'),
1089 'stitle': video_title.decode('utf-8'),
1090 'ext': video_extension.decode('utf-8'),
1091 })
1092 except UnavailableFormatError:
1093 self._downloader.trouble(u'ERROR: format not available for video')
1094
1095
1096 class YoutubeSearchIE(InfoExtractor):
1097 """Information Extractor for YouTube search queries."""
1098 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1099 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1100 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1101 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1102 _youtube_ie = None
1103 _max_youtube_results = 1000
1104
1105 def __init__(self, youtube_ie, downloader=None):
1106 InfoExtractor.__init__(self, downloader)
1107 self._youtube_ie = youtube_ie
1108
1109 @staticmethod
1110 def suitable(url):
1111 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1112
1113 def report_download_page(self, query, pagenum):
1114 """Report attempt to download playlist page with given number."""
1115 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1116
1117 def _real_initialize(self):
1118 self._youtube_ie.initialize()
1119
1120 def _real_extract(self, query):
1121 mobj = re.match(self._VALID_QUERY, query)
1122 if mobj is None:
1123 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1124 return
1125
1126 prefix, query = query.split(':')
1127 prefix = prefix[8:]
1128 if prefix == '':
1129 self._download_n_results(query, 1)
1130 return
1131 elif prefix == 'all':
1132 self._download_n_results(query, self._max_youtube_results)
1133 return
1134 else:
1135 try:
1136 n = long(prefix)
1137 if n <= 0:
1138 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1139 return
1140 elif n > self._max_youtube_results:
1141 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1142 n = self._max_youtube_results
1143 self._download_n_results(query, n)
1144 return
1145 except ValueError: # parsing prefix as integer fails
1146 self._download_n_results(query, 1)
1147 return
1148
1149 def _download_n_results(self, query, n):
1150 """Downloads a specified number of results for a query"""
1151
1152 video_ids = []
1153 already_seen = set()
1154 pagenum = 1
1155
1156 while True:
1157 self.report_download_page(query, pagenum)
1158 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1159 request = urllib2.Request(result_url, None, std_headers)
1160 try:
1161 page = urllib2.urlopen(request).read()
1162 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1163 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1164 return
1165
1166 # Extract video identifiers
1167 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1168 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1169 if video_id not in already_seen:
1170 video_ids.append(video_id)
1171 already_seen.add(video_id)
1172 if len(video_ids) == n:
1173 # Specified n videos reached
1174 for id in video_ids:
1175 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1176 return
1177
1178 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1179 for id in video_ids:
1180 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1181 return
1182
1183 pagenum = pagenum + 1
1184
1185 class YoutubePlaylistIE(InfoExtractor):
1186 """Information Extractor for YouTube playlists."""
1187
1188 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:view_play_list|my_playlists)\?.*?p=([^&]+).*'
1189 _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1190 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1191 _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&page=%s'
1192 _youtube_ie = None
1193
1194 def __init__(self, youtube_ie, downloader=None):
1195 InfoExtractor.__init__(self, downloader)
1196 self._youtube_ie = youtube_ie
1197
1198 @staticmethod
1199 def suitable(url):
1200 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1201
1202 def report_download_page(self, playlist_id, pagenum):
1203 """Report attempt to download playlist page with given number."""
1204 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1205
1206 def _real_initialize(self):
1207 self._youtube_ie.initialize()
1208
1209 def _real_extract(self, url):
1210 # Extract playlist id
1211 mobj = re.match(self._VALID_URL, url)
1212 if mobj is None:
1213 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1214 return
1215
1216 # Download playlist pages
1217 playlist_id = mobj.group(1)
1218 video_ids = []
1219 pagenum = 1
1220
1221 while True:
1222 self.report_download_page(playlist_id, pagenum)
1223 request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1224 try:
1225 page = urllib2.urlopen(request).read()
1226 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1227 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1228 return
1229
1230 # Extract video identifiers
1231 ids_in_page = []
1232 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1233 if mobj.group(1) not in ids_in_page:
1234 ids_in_page.append(mobj.group(1))
1235 video_ids.extend(ids_in_page)
1236
1237 if (self._MORE_PAGES_INDICATOR % (playlist_id.upper(), pagenum + 1)) not in page:
1238 break
1239 pagenum = pagenum + 1
1240
1241 for id in video_ids:
1242 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1243 return
1244
1245 class YoutubeUserIE(InfoExtractor):
1246 """Information Extractor for YouTube users."""
1247
1248 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1249 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1250 _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1251 _youtube_ie = None
1252
1253 def __init__(self, youtube_ie, downloader=None):
1254 InfoExtractor.__init__(self, downloader)
1255 self._youtube_ie = youtube_ie
1256
1257 @staticmethod
1258 def suitable(url):
1259 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1260
1261 def report_download_page(self, username):
1262 """Report attempt to download user page."""
1263 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1264
1265 def _real_initialize(self):
1266 self._youtube_ie.initialize()
1267
1268 def _real_extract(self, url):
1269 # Extract username
1270 mobj = re.match(self._VALID_URL, url)
1271 if mobj is None:
1272 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1273 return
1274
1275 # Download user page
1276 username = mobj.group(1)
1277 video_ids = []
1278 pagenum = 1
1279
1280 self.report_download_page(username)
1281 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1282 try:
1283 page = urllib2.urlopen(request).read()
1284 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1285 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1286 return
1287
1288 # Extract video identifiers
1289 ids_in_page = []
1290
1291 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1292 if mobj.group(1) not in ids_in_page:
1293 ids_in_page.append(mobj.group(1))
1294 video_ids.extend(ids_in_page)
1295
1296 for id in video_ids:
1297 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1298 return
1299
class PostProcessor(object):
    """Base class for post processors.

    A downloader keeps a chain of PostProcessor objects, registered
    through its add_post_processor() method. After a successful download
    it calls run() on each element of the chain in order: the first one
    receives an initial argument and every following one receives the
    value returned by its predecessor.

    The chain ends when a run() call returns None or when there are no
    more post processors left.

    As with InfoExtractor objects, registration is mutual: the post
    processor also holds a reference to its downloader.
    """

    _downloader = None

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Attach this post processor to the given downloader."""
        self._downloader = downloader

    def run(self, information):
        """Process a downloaded file and return the information to pass on.

        "information" is a dictionary like the ones built by
        InfoExtractors, with an additional "filepath" field pointing to
        the downloaded file.

        Returning None stops the postprocessing chain. Returning a
        dictionary (the received one, possibly with some fields changed)
        hands it to the next post processor in the chain.

        Implementations may also raise PostProcessingError, which the
        calling downloader takes into account.
        """
        # The base implementation passes the information through untouched.
        return information
1345
### MAIN PROGRAM ###
if __name__ == '__main__':
    try:
        # Modules needed only when running the main program
        import getpass
        import optparse

        # Function to update the program file with the latest version from bitbucket.org
        def update_self(downloader, filename):
            """Overwrite filename with the latest stable version of the program.

            The downloader is only used to print progress messages. The
            program exits with an error if filename is not writable.
            """
            if not os.access(filename, os.W_OK):
                sys.exit('ERROR: no write permissions on %s' % filename)

            downloader.to_stdout('Updating to latest stable version...')
            latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
            latest_version = urllib.urlopen(latest_url).read().strip()
            prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
            # Fetch the whole new program before touching the existing file
            newcontent = urllib.urlopen(prog_url).read()
            stream = open(filename, 'w')
            try:
                stream.write(newcontent)
            finally:
                # Close the file even if the write fails
                stream.close()
            downloader.to_stdout('Updated to version %s' % latest_version)

        # General configuration
        # A single opener carries both handlers; installing two openers in a
        # row would simply replace the first one with the second.
        urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))
        socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

        # Parse command line
        parser = optparse.OptionParser(
            usage='Usage: %prog [options] url...',
            version='2010.01.19',
            conflict_handler='resolve',
        )

        parser.add_option('-h', '--help',
                action='help', help='print this help text and exit')
        parser.add_option('-v', '--version',
                action='version', help='print program version and exit')
        parser.add_option('-U', '--update',
                action='store_true', dest='update_self', help='update this program to latest stable version')
        parser.add_option('-i', '--ignore-errors',
                action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
        parser.add_option('-r', '--rate-limit',
                dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')

        authentication = optparse.OptionGroup(parser, 'Authentication Options')
        authentication.add_option('-u', '--username',
                dest='username', metavar='UN', help='account username')
        authentication.add_option('-p', '--password',
                dest='password', metavar='PW', help='account password')
        authentication.add_option('-n', '--netrc',
                action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
        parser.add_option_group(authentication)

        video_format = optparse.OptionGroup(parser, 'Video Format Options')
        video_format.add_option('-f', '--format',
                action='store', dest='format', metavar='FMT', help='video format code')
        video_format.add_option('-b', '--best-quality',
                action='store_const', dest='format', help='download the best quality video possible', const='0')
        video_format.add_option('-m', '--mobile-version',
                action='store_const', dest='format', help='alias for -f 17', const='17')
        video_format.add_option('-d', '--high-def',
                action='store_const', dest='format', help='alias for -f 22', const='22')
        parser.add_option_group(video_format)

        verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
        verbosity.add_option('-q', '--quiet',
                action='store_true', dest='quiet', help='activates quiet mode', default=False)
        verbosity.add_option('-s', '--simulate',
                action='store_true', dest='simulate', help='do not download video', default=False)
        verbosity.add_option('-g', '--get-url',
                action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
        verbosity.add_option('-e', '--get-title',
                action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
        parser.add_option_group(verbosity)

        filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
        filesystem.add_option('-t', '--title',
                action='store_true', dest='usetitle', help='use title in file name', default=False)
        filesystem.add_option('-l', '--literal',
                action='store_true', dest='useliteral', help='use literal title in file name', default=False)
        filesystem.add_option('-o', '--output',
                dest='outtmpl', metavar='TPL', help='output filename template')
        filesystem.add_option('-a', '--batch-file',
                dest='batchfile', metavar='F', help='file containing URLs to download')
        filesystem.add_option('-w', '--no-overwrites',
                action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
        filesystem.add_option('-c', '--continue',
                action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
        parser.add_option_group(filesystem)

        (opts, args) = parser.parse_args()

        # Batch file verification
        batchurls = []
        if opts.batchfile is not None:
            try:
                batchfd = open(opts.batchfile, 'r')
                try:
                    batchurls = batchfd.readlines()
                finally:
                    # Do not leak the file handle, whatever happens while reading
                    batchfd.close()
                # Strip whitespace and drop empty lines
                batchurls = [x.strip() for x in batchurls]
                batchurls = [x for x in batchurls if len(x) > 0]
            except IOError:
                sys.exit(u'ERROR: batch file could not be read')
        all_urls = batchurls + args

        # Conflicting, missing and erroneous options
        if opts.usenetrc and (opts.username is not None or opts.password is not None):
            parser.error(u'using .netrc conflicts with giving username/password')
        if opts.password is not None and opts.username is None:
            parser.error(u'account username missing')
        if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
            parser.error(u'using output template conflicts with using title or literal title')
        if opts.usetitle and opts.useliteral:
            parser.error(u'using title conflicts with using literal title')
        if opts.username is not None and opts.password is None:
            opts.password = getpass.getpass(u'Type account password and press return:')
        if opts.ratelimit is not None:
            numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
            if numeric_limit is None:
                parser.error(u'invalid rate limit specified')
            opts.ratelimit = numeric_limit

        # Information extractors
        youtube_ie = YoutubeIE()
        metacafe_ie = MetacafeIE(youtube_ie)
        youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
        youtube_user_ie = YoutubeUserIE(youtube_ie)
        youtube_search_ie = YoutubeSearchIE(youtube_ie)
        google_ie = GoogleIE()
        photobucket_ie = PhotobucketIE()

        # File downloader
        fd = FileDownloader({
            'usenetrc': opts.usenetrc,
            'username': opts.username,
            'password': opts.password,
            'quiet': (opts.quiet or opts.geturl or opts.gettitle),
            'forceurl': opts.geturl,
            'forcetitle': opts.gettitle,
            'simulate': (opts.simulate or opts.geturl or opts.gettitle),
            'format': opts.format,
            # First applicable choice wins: explicit template, simplified
            # title, literal title, then the plain video id.
            'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
                or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
                or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
                or u'%(id)s.%(ext)s'),
            'ignoreerrors': opts.ignoreerrors,
            'ratelimit': opts.ratelimit,
            'nooverwrites': opts.nooverwrites,
            'continuedl': opts.continue_dl,
        })
        fd.add_info_extractor(youtube_search_ie)
        fd.add_info_extractor(youtube_pl_ie)
        fd.add_info_extractor(youtube_user_ie)
        fd.add_info_extractor(metacafe_ie)
        fd.add_info_extractor(youtube_ie)
        fd.add_info_extractor(google_ie)
        fd.add_info_extractor(photobucket_ie)

        # Update version
        if opts.update_self:
            update_self(fd, sys.argv[0])

        # Maybe do nothing
        if len(all_urls) < 1:
            if not opts.update_self:
                parser.error(u'you must provide at least one URL')
            else:
                sys.exit()
        retcode = fd.download(all_urls)
        sys.exit(retcode)

    except DownloadError:
        sys.exit(1)
    except SameFileError:
        sys.exit(u'ERROR: fixed output name but more than one file to download')
    except KeyboardInterrupt:
        sys.exit(u'\nERROR: Interrupted by user')