#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Ricardo Garcia Gonzalez
# Author: Danny Colligan
# License: Public domain code
import htmlentitydefs
import httplib
import locale
import math
import netrc
import os
import os.path
import re
import socket
import string
import sys
import time
import urllib
import urllib2

# Default HTTP headers sent with every request, imitating a desktop
# Firefox browser so sites serve their ordinary pages.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.2) Gecko/20090729 Firefox/3.5.2',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
    'Accept-Language': 'en-us,en;q=0.5',
}
27
# Characters kept verbatim in "simplified" titles: ASCII letters and digits,
# decoded to unicode so they combine cleanly with the unicode video titles.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
29
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    Falls back to 'UTF-8' when the locale machinery fails or
    reports an empty encoding.
    """
    try:
        pref = locale.getpreferredencoding()
        # Mac OSX systems have this problem sometimes
        if pref == '':
            return 'UTF-8'
        return pref
    except Exception:
        # Narrowed from a bare 'except:' so SystemExit and
        # KeyboardInterrupt are no longer swallowed here.
        sys.stderr.write('WARNING: problem obtaining preferred encoding. Falling back to UTF-8.\n')
        return 'UTF-8'
45
class DownloadError(Exception):
    """Signals a fatal problem while downloading.

    FileDownloader objects throw this when they are not configured to
    continue on errors; the exception carries the appropriate error
    message.
    """
    pass
54
class SameFileError(Exception):
    """Signals that several downloads would collide on one output file.

    Thrown by FileDownloader objects when they detect that multiple
    files would have to be written to the same path on disk.
    """
    pass
62
class PostProcessingError(Exception):
    """Signals a failure inside a postprocessing step.

    A PostProcessor's .run() method raises this to indicate an error
    in the postprocessing task.
    """
    pass
70
class UnavailableFormatError(Exception):
    """Signals a request for a format the video does not offer.

    Thrown when a video is requested in a format that is not
    available for that video.
    """
    pass
78
class ContentTooShortError(Exception):
    """Signals that a download delivered less data than announced.

    FileDownloader objects may raise this when the file they download
    is smaller than the size the server announced first, indicating
    the connection was probably interrupted.
    """

    # Byte counts: what was actually received vs. what the server announced.
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.expected = expected
        self.downloaded = downloaded
93
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:     Username for authentication purposes.
    password:     Password for authentication purposes.
    usenetrc:     Use netrc for authentication instead.
    quiet:        Do not print messages to stdout.
    forceurl:     Force printing final URL.
    forcetitle:   Force printing title.
    simulate:     Do not download the video files.
    format:       Video format code.
    outtmpl:      Template for output names.
    ignoreerrors: Do not stop on download errors.
    ratelimit:    Download speed limit, in bytes/sec.
    nooverwrites: Prevent overwriting files.
    continuedl:   Try to continue downloads if possible.
    """

    # Class-level defaults; the real per-instance values are set in __init__.
    params = None
    _ies = []
    _pps = []
    _download_retcode = None

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        self._ies = []
        self._pps = []
        self._download_retcode = 0
        self.params = params

    @staticmethod
    def pmkdir(filename):
        """Create directory components in filename. Similar to Unix "mkdir -p"."""
        components = filename.split(os.sep)
        # Build the list of ancestor paths, longest last, each ending in os.sep
        aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
        aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
        for dir in aggregate:
            if not os.path.exists(dir):
                os.mkdir(dir)

    @staticmethod
    def format_bytes(bytes):
        """Format a byte count as a short human-readable string (e.g. '1.50M')."""
        if bytes is None:
            return 'N/A'
        if type(bytes) is str:
            bytes = float(bytes)
        if bytes == 0.0:
            exponent = 0
        else:
            exponent = long(math.log(bytes, 1024.0))
        suffix = 'bkMGTPEZY'[exponent]
        converted = float(bytes) / float(1024**exponent)
        return '%.2f%s' % (converted, suffix)

    @staticmethod
    def calc_percent(byte_counter, data_len):
        """Return a fixed-width percentage string; '---.-%' when total size is unknown."""
        if data_len is None:
            return '---.-%'
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

    @staticmethod
    def calc_eta(start, now, total, current):
        """Estimate remaining time as 'MM:SS'; '--:--' when unknown or over 99 minutes."""
        if total is None:
            return '--:--'
        dif = now - start
        if current == 0 or dif < 0.001: # One millisecond
            return '--:--'
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        if eta_mins > 99:
            return '--:--'
        return '%02d:%02d' % (eta_mins, eta_secs)

    @staticmethod
    def calc_speed(start, now, bytes):
        """Return the average transfer speed since start as a right-aligned string."""
        dif = now - start
        if bytes == 0 or dif < 0.001: # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    @staticmethod
    def best_block_size(elapsed_time, bytes):
        """Choose the next read size from the last block's measured rate.

        Keeps the size between half and double the last block, capped at 4 MB.
        """
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
            return long(new_max)
        rate = bytes / elapsed_time
        if rate > new_max:
            return long(new_max)
        if rate < new_min:
            return long(new_min)
        return long(rate)

    @staticmethod
    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer."""
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        if matchobj is None:
            return None
        number = float(matchobj.group(1))
        # No suffix matches 'b' at index 0, i.e. a multiplier of 1
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return long(round(number * multiplier))

    @staticmethod
    def verify_url(url):
        """Verify a URL is valid and data could be downloaded. Return real data URL."""
        request = urllib2.Request(url, None, std_headers)
        data = urllib2.urlopen(request)
        data.read(1)
        url = data.geturl()
        data.close()
        return url

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        ie.set_downloader(self)

    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        self._pps.append(pp)
        pp.set_downloader(self)

    def to_stdout(self, message, skip_eol=False):
        """Print message to stdout if not in quiet mode."""
        if not self.params.get('quiet', False):
            print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
            sys.stdout.flush()

    def to_stderr(self, message):
        """Print message to stderr."""
        print >>sys.stderr, message.encode(preferredencoding())

    def fixed_template(self):
        """Checks if the output template is fixed (contains no %(...)s fields)."""
        return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        self._download_retcode = 1

    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
            return
        now = time.time()
        elapsed = now - start_time
        if elapsed <= 0.0:
            return
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            # Sleep just long enough to bring the average speed back to the limit
            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

    def report_destination(self, filename):
        """Report destination filename."""
        self.to_stdout(u'[download] Destination: %s' % filename)

    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress (carriage return keeps it on one line)."""
        self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

    def report_resuming_byte(self, resume_len):
        """Report attempt to resume at given byte."""
        self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        self.to_stdout(u'[download] %s has already been downloaded' % file_name)

    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_stdout(u'[download] Unable to resume')

    def report_finish(self):
        """Report download finished (terminates the progress line)."""
        self.to_stdout(u'')

    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""
        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            # Verify the URL is actually reachable before reporting it
            try:
                info_dict['url'] = self.verify_url(info_dict['url'])
            except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
                raise UnavailableFormatError

            # Forced printings
            if self.params.get('forcetitle', False):
                print info_dict['title'].encode(preferredencoding())
            if self.params.get('forceurl', False):
                print info_dict['url'].encode(preferredencoding())

            return

        try:
            template_dict = dict(info_dict)
            template_dict['epoch'] = unicode(long(time.time()))
            filename = self.params['outtmpl'] % template_dict
        except (ValueError, KeyError), err:
            self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
            # NOTE(review): with 'ignoreerrors' set, trouble() returns and
            # execution continues with 'filename' unbound, raising NameError
            # on the next line — confirm whether this path can be reached.
        if self.params['nooverwrites'] and os.path.exists(filename):
            self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
            return

        try:
            self.pmkdir(filename)
        except (OSError, IOError), err:
            self.trouble('ERROR: unable to create directories: %s' % str(err))
            return

        try:
            success = self._do_download(filename, info_dict['url'])
        except (OSError, IOError), err:
            # Treated as a format problem so callers can fall back to another format
            raise UnavailableFormatError
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.trouble('ERROR: unable to download video data: %s' % str(err))
            return
        except (ContentTooShortError, ), err:
            self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
            return

        if success:
            try:
                self.post_process(filename, info_dict)
            except (PostProcessingError), err:
                self.trouble('ERROR: postprocessing: %s' % str(err))
                return

    def download(self, url_list):
        """Download a given list of URLs. Returns the process return code."""
        if len(url_list) > 1 and self.fixed_template():
            raise SameFileError(self.params['outtmpl'])

        for url in url_list:
            suitable_found = False
            for ie in self._ies:
                # Go to next InfoExtractor if not suitable
                if not ie.suitable(url):
                    continue

                # Suitable InfoExtractor found
                suitable_found = True

                # Extract information from URL and process it
                ie.extract(url)

                # Suitable InfoExtractor had been found; go to next URL
                break

            if not suitable_found:
                self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

        return self._download_retcode

    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file.

        Each PostProcessor receives the info dict returned by the previous
        one; a None return stops the chain.
        """
        info = dict(ie_info)
        info['filepath'] = filename
        for pp in self._pps:
            info = pp.run(info)
            if info is None:
                break

    def _do_download(self, filename, url):
        """Download url into filename, resuming if possible. Return True on success."""
        stream = None
        open_mode = 'ab'

        basic_request = urllib2.Request(url, None, std_headers)
        request = urllib2.Request(url, None, std_headers)

        # Attempt to resume download with "continuedl" option
        if os.path.isfile(filename):
            resume_len = os.path.getsize(filename)
        else:
            resume_len = 0
        if self.params['continuedl'] and resume_len != 0:
            self.report_resuming_byte(resume_len)
            request.add_header('Range','bytes=%d-' % resume_len)

        # Establish connection
        try:
            data = urllib2.urlopen(request)
        except (urllib2.HTTPError, ), err:
            if err.code != 416: # 416 is 'Requested range not satisfiable'
                raise
            # Range failed: retry without it to learn the file's real size
            data = urllib2.urlopen(basic_request)
            content_length = data.info()['Content-Length']
            if content_length is not None and long(content_length) == resume_len:
                # File was already complete; nothing to do
                self.report_file_already_downloaded(filename)
                return True
            else:
                self.report_unable_to_resume()
                open_mode = 'wb'

        data_len = data.info().get('Content-length', None)
        data_len_str = self.format_bytes(data_len)
        byte_counter = 0
        block_size = 1024
        start = time.time()
        while True:
            # Download and write
            before = time.time()
            data_block = data.read(block_size)
            after = time.time()
            data_block_len = len(data_block)
            if data_block_len == 0:
                break
            byte_counter += data_block_len

            # Open file just in time
            if stream is None:
                try:
                    stream = open(filename, open_mode)
                    self.report_destination(filename)
                except (OSError, IOError), err:
                    self.trouble('ERROR: unable to open for writing: %s' % str(err))
                    return False
            stream.write(data_block)
            block_size = self.best_block_size(after - before, data_block_len)

            # Progress message
            percent_str = self.calc_percent(byte_counter, data_len)
            eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
            speed_str = self.calc_speed(start, time.time(), byte_counter)
            self.report_progress(percent_str, data_len_str, speed_str, eta_str)

            # Apply rate limit
            self.slow_down(start, byte_counter)

        self.report_finish()
        # data_len is the header string; compare string-to-string
        if data_len is not None and str(byte_counter) != data_len:
            raise ContentTooShortError(byte_counter, long(data_len))
        return True
463
class InfoExtractor(object):
    """Information Extractor base class.

    An information extractor takes a URL and pulls out everything needed
    about the video (or videos) it points at: the real video URL, the
    title and simplified title, the uploader and so on. The result is a
    dictionary handed to the FileDownloader, which may then download the
    video to the file system, among other outcomes. Each dictionary must
    include the following fields:

    id:       Video identifier.
    url:      Final video URL.
    uploader: Nickname of the video uploader.
    title:    Literal title.
    stitle:   Simplified title.
    ext:      Video filename extension.

    Subclasses should override _real_initialize() and _real_extract(),
    as well as the suitable() static method, and will usually be
    instantiated and registered with the main downloader.
    """

    # Whether _real_initialize() has already run for this instance.
    _ready = False
    # The FileDownloader in charge of this extractor, if any.
    _downloader = None

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @staticmethod
    def suitable(url):
        """Receives a URL and returns True if suitable for this IE."""
        return False

    def initialize(self):
        """Initializes an instance (authentication, etc), at most once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
524
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Group 1 captures the site prefix, group 2 the video id; the (?(1)...)
    # conditional requires extra characters only when the prefix matched.
    _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    _available_formats = ['22', '35', '18', '5', '17', '13', None] # listed in order of priority for -b flag
    # Filename extension per format code; anything else falls back to 'flv'
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
    }

    @staticmethod
    def suitable(url):
        """Return True if this IE handles the given URL."""
        return (re.match(YoutubeIE._VALID_URL, url) is not None)

    @staticmethod
    def htmlentity_transform(matchobj):
        """Transforms an HTML entity to a Unicode character."""
        entity = matchobj.group(1)

        # Known non-numeric HTML entity
        if entity in htmlentitydefs.name2codepoint:
            return unichr(htmlentitydefs.name2codepoint[entity])

        # Unicode character reference (&#nnn; or &#xhh;)
        mobj = re.match(ur'(?u)#(x?\d+)', entity)
        if mobj is not None:
            numstr = mobj.group(1)
            if numstr.startswith(u'x'):
                base = 16
                numstr = u'0%s' % numstr
            else:
                base = 10
            return unichr(long(numstr, base))

        # Unknown entity in name, return its literal representation
        return (u'&%s;' % entity)

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_stdout(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_stdout(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_stdout(u'[youtube] Confirming age')

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is unavailable for this video."""
        self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))

    def _real_initialize(self):
        """Set language and, when credentials are available, log in and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        # Set language
        request = urllib2.Request(self._LANG_URL, None, std_headers)
        try:
            self.report_lang()
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
            'current_form': 'loginForm',
            'next': '/',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # If the login form is still present, the credentials were rejected
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
        try:
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Extract video info, falling through the format list in -b mode."""
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Downloader parameters
        best_quality = False
        format_param = None
        quality_index = 0
        if self._downloader is not None:
            params = self._downloader.params
            format_param = params.get('format', None)
            # Format '0' means "best quality": walk _available_formats in order
            if format_param == '0':
                format_param = self._available_formats[quality_index]
                best_quality = True

        while True:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            # Get video info
            video_info_url = 'http://www.youtube.com/get_video_info?&video_id=%s&el=detailpage&ps=default&eurl=&gl=US&hl=en' % video_id
            request = urllib2.Request(video_info_url, None, std_headers)
            try:
                self.report_video_info_webpage_download(video_id)
                video_info_webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                return
            self.report_information_extraction(video_id)

            # "t" param
            mobj = re.search(r'(?m)&token=([^&]+)(?:&|$)', video_info_webpage)
            if mobj is None:
                # Attempt to see if YouTube has issued an error message
                mobj = re.search(r'(?m)&reason=([^&]+)(?:&|$)', video_info_webpage)
                if mobj is None:
                    self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
                    # Dump the page so the user can attach it to a bug report
                    stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
                    stream.write(video_info_webpage)
                    stream.close()
                else:
                    reason = urllib.unquote_plus(mobj.group(1))
                    self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
                return
            token = urllib.unquote(mobj.group(1))
            video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
            if format_param is not None:
                video_real_url = '%s&fmt=%s' % (video_real_url, format_param)

            # uploader
            mobj = re.search(r'(?m)&author=([^&]+)(?:&|$)', video_info_webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
                return
            video_uploader = urllib.unquote(mobj.group(1))

            # title
            mobj = re.search(r'(?m)&title=([^&]+)(?:&|$)', video_info_webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract video title')
                return
            video_title = urllib.unquote(mobj.group(1))
            video_title = video_title.decode('utf-8')
            video_title = re.sub(ur'(?u)&(.+?);', self.htmlentity_transform, video_title)
            # os.sep would break the output filename; replace it
            video_title = video_title.replace(os.sep, u'%')

            # simplified title
            simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
            simple_title = simple_title.strip(ur'_')

            try:
                # Process video information
                self._downloader.process_info({
                    'id': video_id.decode('utf-8'),
                    'url': video_real_url.decode('utf-8'),
                    'uploader': video_uploader.decode('utf-8'),
                    'title': video_title,
                    'stitle': simple_title,
                    'ext': video_extension.decode('utf-8'),
                })

                return

            except UnavailableFormatError, err:
                if best_quality:
                    if quality_index == len(self._available_formats) - 1:
                        # I don't ever expect this to happen
                        self._downloader.trouble(u'ERROR: no known formats available for video')
                        return
                    else:
                        # Try the next format in the priority list
                        self.report_unavailable_format(video_id, format_param)
                        quality_index += 1
                        format_param = self._available_formats[quality_index]
                        continue
                else:
                    self._downloader.trouble('ERROR: format not available for video')
                    return
762
763
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # Group 1 is the video id, group 2 the simplified title
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    # YoutubeIE instance used for 'yt-' videos hosted on YouTube
    _youtube_ie = None

    def __init__(self, youtube_ie, downloader=None):
        """Constructor. Receives the YoutubeIE to delegate to and an optional downloader."""
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    @staticmethod
    def suitable(url):
        """Return True if this IE handles the given URL."""
        return (re.match(MetacafeIE._VALID_URL, url) is not None)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_stdout(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and submit the family-filter age confirmation."""
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER, None, std_headers)
        try:
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
        try:
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Extract video info from a metacafe.com watch page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
            return

        simple_title = mobj.group(2).decode('utf-8')
        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))

        #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        #if mobj is None:
        #    self._downloader.trouble(u'ERROR: unable to extract gdaKey')
        #    return
        #gdaKey = mobj.group(1)
        #
        #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

        video_url = mediaURL

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'(?ms)<li id="ChnlUsr">.*?Submitter:.*?<a .*?>(.*?)<', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
            })
        except UnavailableFormatError:
            self._downloader.trouble(u'ERROR: format not available for video')
888
889
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""

    # Matches 'ytsearch:', 'ytsearchN:' and 'ytsearchall:' prefixes
    _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
    # YoutubeIE instance used to extract each found video
    _youtube_ie = None
    _max_youtube_results = 1000

    def __init__(self, youtube_ie, downloader=None):
        """Constructor. Receives the YoutubeIE to delegate to and an optional downloader."""
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    @staticmethod
    def suitable(url):
        """Return True if this IE handles the given query string."""
        return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, query):
        """Parse the 'ytsearch' prefix and download the requested number of results."""
        mobj = re.match(self._VALID_QUERY, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'ytsearch'
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = long(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
            request = urllib2.Request(result_url, None, std_headers)
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers (deduplicated, in page order)
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                # Slice the id out of href="/watch?v=ID"
                video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
                        return

            # No "Next" link means the last page: extract what was found
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
                return

            pagenum = pagenum + 1
978
979 class YoutubePlaylistIE(InfoExtractor):
980 """Information Extractor for YouTube playlists."""
981
982 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:view_play_list|my_playlists)\?.*?p=([^&]+).*'
983 _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
984 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
985 _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&page=%s'
986 _youtube_ie = None
987
988 def __init__(self, youtube_ie, downloader=None):
989 InfoExtractor.__init__(self, downloader)
990 self._youtube_ie = youtube_ie
991
992 @staticmethod
993 def suitable(url):
994 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
995
996 def report_download_page(self, playlist_id, pagenum):
997 """Report attempt to download playlist page with given number."""
998 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
999
1000 def _real_initialize(self):
1001 self._youtube_ie.initialize()
1002
1003 def _real_extract(self, url):
1004 # Extract playlist id
1005 mobj = re.match(self._VALID_URL, url)
1006 if mobj is None:
1007 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1008 return
1009
1010 # Download playlist pages
1011 playlist_id = mobj.group(1)
1012 video_ids = []
1013 pagenum = 1
1014
1015 while True:
1016 self.report_download_page(playlist_id, pagenum)
1017 request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1018 try:
1019 page = urllib2.urlopen(request).read()
1020 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1021 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1022 return
1023
1024 # Extract video identifiers
1025 ids_in_page = []
1026 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1027 if mobj.group(1) not in ids_in_page:
1028 ids_in_page.append(mobj.group(1))
1029 video_ids.extend(ids_in_page)
1030
1031 if (self._MORE_PAGES_INDICATOR % (playlist_id.upper(), pagenum + 1)) not in page:
1032 break
1033 pagenum = pagenum + 1
1034
1035 for id in video_ids:
1036 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1037 return
1038
class PostProcessor(object):
    """Base class for postprocessing steps.

    Instances are registered on a downloader through its
    add_post_processor() method. After a successful download the
    downloader walks its chain of PostProcessors, calling run() on each
    one and feeding the returned value of one call into the next. The
    chain stops as soon as a run() call returns None, or when the last
    processor has run.

    Like InfoExtractor objects, PostProcessors follow a "mutual
    registration" scheme with the downloader that owns them.
    """

    _downloader = None  # downloader this PP is attached to

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Attach this postprocessor to the given downloader."""
        self._downloader = downloader

    def run(self, information):
        """Execute this postprocessing step.

        "information" is a dictionary in the format produced by the
        InfoExtractors, extended with an extra "filepath" key pointing
        at the file that was just downloaded.

        Returning None aborts the postprocessing chain; returning an
        information dictionary (possibly the received one with some
        fields changed) passes it on to the next processor in the
        chain. A PostProcessingError may be raised to report a failure
        to the downloader this object was called from.

        The default implementation passes the information through
        unchanged.
        """
        return information
1084
### MAIN PROGRAM ###
if __name__ == '__main__':
    try:
        # Modules needed only when running the main program
        import getpass
        import optparse

        # General configuration
        # BUGFIX: install one opener carrying both handlers. The original
        # called install_opener() twice, so the second (cookie) opener
        # replaced the first and the explicitly configured ProxyHandler
        # opener was discarded.
        urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))
        socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

        # Parse command line
        parser = optparse.OptionParser(
            usage='Usage: %prog [options] url...',
            version='2009.09.13',
            conflict_handler='resolve',
        )

        parser.add_option('-h', '--help',
                action='help', help='print this help text and exit')
        parser.add_option('-v', '--version',
                action='version', help='print program version and exit')
        parser.add_option('-i', '--ignore-errors',
                action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
        parser.add_option('-r', '--rate-limit',
                dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')

        authentication = optparse.OptionGroup(parser, 'Authentication Options')
        authentication.add_option('-u', '--username',
                dest='username', metavar='UN', help='account username')
        authentication.add_option('-p', '--password',
                dest='password', metavar='PW', help='account password')
        authentication.add_option('-n', '--netrc',
                action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
        parser.add_option_group(authentication)

        video_format = optparse.OptionGroup(parser, 'Video Format Options')
        video_format.add_option('-f', '--format',
                action='store', dest='format', metavar='FMT', help='video format code')
        video_format.add_option('-b', '--best-quality',
                action='store_const', dest='format', help='download the best quality video possible', const='0')
        video_format.add_option('-m', '--mobile-version',
                action='store_const', dest='format', help='alias for -f 17', const='17')
        video_format.add_option('-d', '--high-def',
                action='store_const', dest='format', help='alias for -f 22', const='22')
        parser.add_option_group(video_format)

        verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
        verbosity.add_option('-q', '--quiet',
                action='store_true', dest='quiet', help='activates quiet mode', default=False)
        verbosity.add_option('-s', '--simulate',
                action='store_true', dest='simulate', help='do not download video', default=False)
        verbosity.add_option('-g', '--get-url',
                action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
        verbosity.add_option('-e', '--get-title',
                action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
        parser.add_option_group(verbosity)

        filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
        filesystem.add_option('-t', '--title',
                action='store_true', dest='usetitle', help='use title in file name', default=False)
        filesystem.add_option('-l', '--literal',
                action='store_true', dest='useliteral', help='use literal title in file name', default=False)
        filesystem.add_option('-o', '--output',
                dest='outtmpl', metavar='TPL', help='output filename template')
        filesystem.add_option('-a', '--batch-file',
                dest='batchfile', metavar='F', help='file containing URLs to download')
        filesystem.add_option('-w', '--no-overwrites',
                action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
        filesystem.add_option('-c', '--continue',
                action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
        parser.add_option_group(filesystem)

        (opts, args) = parser.parse_args()

        # Batch file verification
        batchurls = []
        if opts.batchfile is not None:
            try:
                # BUGFIX: close the batch file instead of leaking the
                # open file object.
                batchfd = open(opts.batchfile, 'r')
                try:
                    batchurls = batchfd.readlines()
                finally:
                    batchfd.close()
                batchurls = [x.strip() for x in batchurls]
                batchurls = [x for x in batchurls if len(x) > 0]
            except IOError:
                sys.exit(u'ERROR: batch file could not be read')
        all_urls = batchurls + args

        # Conflicting, missing and erroneous options
        if len(all_urls) < 1:
            parser.error(u'you must provide at least one URL')
        if opts.usenetrc and (opts.username is not None or opts.password is not None):
            parser.error(u'using .netrc conflicts with giving username/password')
        if opts.password is not None and opts.username is None:
            parser.error(u'account username missing')
        if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
            parser.error(u'using output template conflicts with using title or literal title')
        if opts.usetitle and opts.useliteral:
            parser.error(u'using title conflicts with using literal title')
        if opts.username is not None and opts.password is None:
            opts.password = getpass.getpass(u'Type account password and press return:')
        if opts.ratelimit is not None:
            numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
            if numeric_limit is None:
                parser.error(u'invalid rate limit specified')
            opts.ratelimit = numeric_limit

        # Information extractors
        youtube_ie = YoutubeIE()
        metacafe_ie = MetacafeIE(youtube_ie)
        youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
        youtube_search_ie = YoutubeSearchIE(youtube_ie)

        # File downloader
        fd = FileDownloader({
            'usenetrc': opts.usenetrc,
            'username': opts.username,
            'password': opts.password,
            'quiet': (opts.quiet or opts.geturl or opts.gettitle),
            'forceurl': opts.geturl,
            'forcetitle': opts.gettitle,
            'simulate': (opts.simulate or opts.geturl or opts.gettitle),
            'format': opts.format,
            # Output template: explicit -o wins, then -t / -l shortcuts,
            # then the plain "<id>.<ext>" default.
            'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
                or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
                or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
                or u'%(id)s.%(ext)s'),
            'ignoreerrors': opts.ignoreerrors,
            'ratelimit': opts.ratelimit,
            'nooverwrites': opts.nooverwrites,
            'continuedl': opts.continue_dl,
        })
        # More specific extractors are registered first so search queries
        # and playlist URLs are not mistaken for plain video URLs.
        fd.add_info_extractor(youtube_search_ie)
        fd.add_info_extractor(youtube_pl_ie)
        fd.add_info_extractor(metacafe_ie)
        fd.add_info_extractor(youtube_ie)
        retcode = fd.download(all_urls)
        sys.exit(retcode)

    except DownloadError:
        sys.exit(1)
    except SameFileError:
        sys.exit(u'ERROR: fixed output name but more than one file to download')
    except KeyboardInterrupt:
        sys.exit(u'\nERROR: Interrupted by user')