#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Ricardo Garcia Gonzalez
# Author: Danny Colligan
# Author: Benjamin Johnson
# License: Public domain code
import htmlentitydefs
import httplib
import locale
import math
import netrc
import os
import os.path
import re
import socket
import string
import subprocess
import sys
import time
import urllib
import urllib2

# parse_qs was moved from the cgi module to the urlparse module recently.
try:
    from urlparse import parse_qs
except ImportError:
    from cgi import parse_qs

std_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
    'Accept-Language': 'en-us,en;q=0.5',
}

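# Note: in Python 2, .decode('ascii') turns these byte strings into unicode
# objects, matching the unicode titles they are compared against below.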
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')

def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    def yield_preferredencoding():
        try:
            pref = locale.getpreferredencoding()
            u'TEST'.encode(pref)
        except:
            pref = 'UTF-8'
        while True:
            yield pref
    return yield_preferredencoding().next()

def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Unicode character
    mobj = re.match(ur'(?u)#(x?\d+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        return unichr(long(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)

def sanitize_title(utitle):
    """Sanitizes a video title so it can be used as part of a filename."""
    utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
    return utitle.replace(unicode(os.sep), u'%')

def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        stream = open(filename, open_mode)
        return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(filename, open_mode)
        return (stream, filename)


class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    pass

class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass

class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    pass

class UnavailableFormatError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass

class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected

class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible for downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. Since, given a video URL, the downloader doesn't know how to
    extract all the needed information (a task InfoExtractors perform),
    it has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader hands it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:      Username for authentication purposes.
    password:      Password for authentication purposes.
    usenetrc:      Use netrc for authentication instead.
    quiet:         Do not print messages to stdout.
    forceurl:      Force printing final URL.
    forcetitle:    Force printing title.
    simulate:      Do not download the video files.
    format:        Video format code.
    outtmpl:       Template for output names.
    ignoreerrors:  Do not stop on download errors.
    ratelimit:     Download speed limit, in bytes/sec.
    nooverwrites:  Prevent overwriting files.
    continuedl:    Try to continue downloads if possible.
    noprogress:    Do not print the progress bar.
    """

    params = None
    _ies = []
    _pps = []
    _download_retcode = None

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        self._ies = []
        self._pps = []
        self._download_retcode = 0
        self.params = params

    @staticmethod
    def pmkdir(filename):
        """Create directory components in filename. Similar to Unix "mkdir -p"."""
        components = filename.split(os.sep)
        aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
        aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
        for dir in aggregate:
            if not os.path.exists(dir):
                os.mkdir(dir)

    @staticmethod
    def format_bytes(bytes):
        if bytes is None:
            return 'N/A'
        if type(bytes) is str:
            bytes = float(bytes)
        if bytes == 0.0:
            exponent = 0
        else:
            exponent = long(math.log(bytes, 1024.0))
        suffix = 'bkMGTPEZY'[exponent]
        converted = float(bytes) / float(1024**exponent)
        return '%.2f%s' % (converted, suffix)

    @staticmethod
    def calc_percent(byte_counter, data_len):
        if data_len is None:
            return '---.-%'
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

    @staticmethod
    def calc_eta(start, now, total, current):
        if total is None:
            return '--:--'
        dif = now - start
        if current == 0 or dif < 0.001: # One millisecond
            return '--:--'
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        if eta_mins > 99:
            return '--:--'
        return '%02d:%02d' % (eta_mins, eta_secs)

    @staticmethod
    def calc_speed(start, now, bytes):
        dif = now - start
        if bytes == 0 or dif < 0.001: # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    @staticmethod
    def best_block_size(elapsed_time, bytes):
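        # Adaptive block sizing: target roughly one read per second by using
        # the measured rate, but clamp the change to at most a factor of two
        # per iteration and never exceed 4 MB.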
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
            return long(new_max)
        rate = bytes / elapsed_time
        if rate > new_max:
            return long(new_max)
        if rate < new_min:
            return long(new_min)
        return long(rate)

    @staticmethod
    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer."""
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        if matchobj is None:
            return None
        number = float(matchobj.group(1))
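        # The suffix's position in 'bkmgtpezy' is the power of 1024; an empty
        # suffix also works because str.index('') returns 0 (multiplier 1.0).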
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return long(round(number * multiplier))

    @staticmethod
    def verify_url(url):
        """Verify a URL is valid and data could be downloaded. Return real data URL."""
        request = urllib2.Request(url, None, std_headers)
        data = urllib2.urlopen(request)
        data.read(1)
        url = data.geturl()
        data.close()
        return url

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        ie.set_downloader(self)

    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        self._pps.append(pp)
        pp.set_downloader(self)

    def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
        """Print message to stdout if not in quiet mode."""
        try:
            if not self.params.get('quiet', False):
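                # [u'\n', u''][skip_eol] indexes the pair with a boolean
                # (False is 0, True is 1): append a newline unless skip_eol.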
                print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
                sys.stdout.flush()
        except (UnicodeEncodeError), err:
            if not ignore_encoding_errors:
                raise

    def to_stderr(self, message):
        """Print message to stderr."""
        print >>sys.stderr, message.encode(preferredencoding())

    def fixed_template(self):
        """Checks if the output template is fixed."""
        return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on whether the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        self._download_retcode = 1

    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
            return
        now = time.time()
        elapsed = now - start_time
        if elapsed <= 0.0:
            return
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
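            # Sleep just long enough for the average speed since start_time
            # to drop back down to the rate limit.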
            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

    def report_destination(self, filename):
        """Report destination filename."""
        self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        if self.params.get('noprogress', False):
            return
        self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

    def report_resuming_byte(self, resume_len):
        """Report attempt to resume at given byte."""
        self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        try:
            self.to_stdout(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            self.to_stdout(u'[download] The file has already been downloaded')

    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_stdout(u'[download] Unable to resume')

    def report_finish(self):
        """Report download finished."""
        if self.params.get('noprogress', False):
            self.to_stdout(u'[download] Download completed')
        else:
            self.to_stdout(u'')

    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""
        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            # Verify URL if it's an HTTP one
            if info_dict['url'].startswith('http'):
                try:
                    self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
                except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise UnavailableFormatError

            # Forced printings
            if self.params.get('forcetitle', False):
                print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forceurl', False):
                print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')

            return

        try:
            template_dict = dict(info_dict)
            template_dict['epoch'] = unicode(long(time.time()))
            filename = self.params['outtmpl'] % template_dict
        except (ValueError, KeyError), err:
            self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
            return
        if self.params.get('nooverwrites', False) and os.path.exists(filename):
            self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
            return

        try:
            self.pmkdir(filename)
        except (OSError, IOError), err:
            self.trouble('ERROR: unable to create directories: %s' % str(err))
            return

        try:
            success = self._do_download(filename, info_dict['url'].encode('utf-8'))
        except (OSError, IOError), err:
            raise UnavailableFormatError
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.trouble('ERROR: unable to download video data: %s' % str(err))
            return
        except (ContentTooShortError, ), err:
            self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
            return

        if success:
            try:
                self.post_process(filename, info_dict)
            except (PostProcessingError), err:
                self.trouble('ERROR: postprocessing: %s' % str(err))
                return

    def download(self, url_list):
        """Download a given list of URLs."""
        if len(url_list) > 1 and self.fixed_template():
            raise SameFileError(self.params['outtmpl'])

        for url in url_list:
            suitable_found = False
            for ie in self._ies:
                # Go to next InfoExtractor if not suitable
                if not ie.suitable(url):
                    continue

                # Suitable InfoExtractor found
                suitable_found = True

                # Extract information from URL and process it
                ie.extract(url)

                # Suitable InfoExtractor had been found; go to next URL
                break

            if not suitable_found:
                self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

        return self._download_retcode

    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file."""
        info = dict(ie_info)
        info['filepath'] = filename
        for pp in self._pps:
            info = pp.run(info)
            if info is None:
                break

    def _download_with_rtmpdump(self, filename, url):
        self.report_destination(filename)

        # Check for rtmpdump first
        try:
            subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
            return False

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrupted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q', '-r', url, '-o', filename]
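        # The same boolean-indexing idiom: when continuedl is set, append
        # what appear to be rtmpdump's resume-related flags ['-e', '-k', '1'];
        # otherwise append nothing.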
        retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
        while retval == 2 or retval == 1:
            self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename), skip_eol=True)
            time.sleep(2.0) # This seems to be needed
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
        if retval == 0:
            self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
            return True
        else:
            self.trouble('ERROR: rtmpdump exited with code %d' % retval)
            return False

    def _do_download(self, filename, url):
        # Attempt to download using rtmpdump
        if url.startswith('rtmp'):
            return self._download_with_rtmpdump(filename, url)

        stream = None
        open_mode = 'wb'
        basic_request = urllib2.Request(url, None, std_headers)
        request = urllib2.Request(url, None, std_headers)

        # Establish possible resume length
        if os.path.isfile(filename):
            resume_len = os.path.getsize(filename)
        else:
            resume_len = 0

        # Request parameters in case of being able to resume
        if self.params.get('continuedl', False) and resume_len != 0:
            self.report_resuming_byte(resume_len)
            request.add_header('Range', 'bytes=%d-' % resume_len)
            open_mode = 'ab'

        # Establish connection
        try:
            data = urllib2.urlopen(request)
        except (urllib2.HTTPError, ), err:
            if err.code != 416: # 416 is 'Requested range not satisfiable'
                raise
            # Unable to resume
            data = urllib2.urlopen(basic_request)
            content_length = data.info()['Content-Length']

            if content_length is not None and long(content_length) == resume_len:
                # Because the file had already been fully downloaded
                self.report_file_already_downloaded(filename)
                return True
            else:
                # Because the server didn't let us
                self.report_unable_to_resume()
                open_mode = 'wb'

        data_len = data.info().get('Content-length', None)
        data_len_str = self.format_bytes(data_len)
        byte_counter = 0
        block_size = 1024
        start = time.time()
        while True:
            # Download and write
            before = time.time()
            data_block = data.read(block_size)
            after = time.time()
            data_block_len = len(data_block)
            if data_block_len == 0:
                break
            byte_counter += data_block_len

            # Open file just in time
            if stream is None:
                try:
                    (stream, filename) = sanitize_open(filename, open_mode)
                    self.report_destination(filename)
                except (OSError, IOError), err:
                    self.trouble('ERROR: unable to open for writing: %s' % str(err))
                    return False
            stream.write(data_block)
            block_size = self.best_block_size(after - before, data_block_len)

            # Progress message
            percent_str = self.calc_percent(byte_counter, data_len)
            eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
            speed_str = self.calc_speed(start, time.time(), byte_counter)
            self.report_progress(percent_str, data_len_str, speed_str, eta_str)

            # Apply rate limit
            self.slow_down(start, byte_counter)

        self.report_finish()
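        # data_len still holds the raw Content-length header value (a string
        # or None), hence the comparison against str(byte_counter).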
        if data_len is not None and str(byte_counter) != data_len:
            raise ContentTooShortError(byte_counter, long(data_len))
        return True

class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, the author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information, possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id:       Video identifier.
    url:      Final video URL.
    uploader: Nickname of the video uploader.
    title:    Literal title.
    stitle:   Simplified title.
    ext:      Video filename extension.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    They should probably also be instantiated and added to the main
    downloader.
    """

    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @staticmethod
    def suitable(url):
        """Receives a URL and returns True if suitable for this IE."""
        return False

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""
    _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?[\?#](?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    _available_formats = ['37', '22', '35', '18', '34', '5', '17', '13', None] # listed in order of priority for -b flag
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
    }

    @staticmethod
    def suitable(url):
        return (re.match(YoutubeIE._VALID_URL, url) is not None)

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_stdout(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_stdout(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_stdout(u'[youtube] Confirming age')

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested video format is unavailable."""
        self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_stdout(u'[youtube] RTMP download detected')

    def _real_initialize(self):
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        # Set language
        request = urllib2.Request(self._LANG_URL, None, std_headers)
        try:
            self.report_lang()
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
            'current_form': 'loginForm',
            'next': '/',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
        try:
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Downloader parameters
        best_quality = False
        format_param = None
        quality_index = 0
        if self._downloader is not None:
            params = self._downloader.params
            format_param = params.get('format', None)
            if format_param == '0':
                format_param = self._available_formats[quality_index]
                best_quality = True

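        # With -b (best quality), this loop walks _available_formats in
        # priority order, moving on to the next format whenever the current
        # one raises UnavailableFormatError.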
        while True:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            # Get video info
            video_info_url = 'http://www.youtube.com/get_video_info?&video_id=%s&el=detailpage&ps=default&eurl=&gl=US&hl=en' % video_id
            request = urllib2.Request(video_info_url, None, std_headers)
            try:
                self.report_video_info_webpage_download(video_id)
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                return
            self.report_information_extraction(video_id)

            # "t" param
            if 'token' not in video_info:
                # Attempt to see if YouTube has issued an error message
                if 'reason' not in video_info:
                    self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
                    stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
                    stream.write(video_info_webpage)
                    stream.close()
                else:
                    reason = urllib.unquote_plus(video_info['reason'][0])
                    self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
                return
            token = urllib.unquote_plus(video_info['token'][0])
            video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
            if format_param is not None:
                video_real_url = '%s&fmt=%s' % (video_real_url, format_param)

            # Check possible RTMP download
            if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
                self.report_rtmp_download()
                video_real_url = video_info['conn'][0]

            # uploader
            if 'author' not in video_info:
                self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
                return
            video_uploader = urllib.unquote_plus(video_info['author'][0])

            # title
            if 'title' not in video_info:
                self._downloader.trouble(u'ERROR: unable to extract video title')
                return
            video_title = urllib.unquote_plus(video_info['title'][0])
            video_title = video_title.decode('utf-8')
            video_title = sanitize_title(video_title)

            # simplified title
            simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
            simple_title = simple_title.strip(ur'_')

            try:
                # Process video information
                self._downloader.process_info({
                    'id': video_id.decode('utf-8'),
                    'url': video_real_url.decode('utf-8'),
                    'uploader': video_uploader.decode('utf-8'),
                    'title': video_title,
                    'stitle': simple_title,
                    'ext': video_extension.decode('utf-8'),
                })

                return

            except UnavailableFormatError, err:
                if best_quality:
                    if quality_index == len(self._available_formats) - 1:
                        # I don't ever expect this to happen
                        self._downloader.trouble(u'ERROR: no known formats available for video')
                        return
                    else:
                        self.report_unavailable_format(video_id, format_param)
                        quality_index += 1
                        format_param = self._available_formats[quality_index]
                        continue
                else:
                    self._downloader.trouble('ERROR: format not available for video')
                    return


class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    _youtube_ie = None

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    @staticmethod
    def suitable(url):
        return (re.match(MetacafeIE._VALID_URL, url) is not None)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_stdout(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER, None, std_headers)
        try:
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
        try:
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
            return

        simple_title = mobj.group(2).decode('utf-8')
        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))

        #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        #if mobj is None:
        #    self._downloader.trouble(u'ERROR: unable to extract gdaKey')
        #    return
        #gdaKey = mobj.group(1)
        #
        #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

        video_url = mediaURL

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
            })
        except UnavailableFormatError:
            self._downloader.trouble(u'ERROR: format not available for video')


class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @staticmethod
    def suitable(url):
        return (re.match(GoogleIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)

    def _real_initialize(self):
        return

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        if mobj is None:
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        # Google Video doesn't show uploader nicknames?
        video_uploader = 'NA'

        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
            })
        except UnavailableFormatError:
            self._downloader.trouble(u'ERROR: format not available for video')


class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @staticmethod
    def suitable(url):
        return (re.match(PhotobucketIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)

    def _real_initialize(self):
        return

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        video_uploader = mobj.group(2).decode('utf-8')

        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader,
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
            })
        except UnavailableFormatError:
            self._downloader.trouble(u'ERROR: format not available for video')


class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @staticmethod
    def suitable(url):
        return True

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)

    def _real_initialize(self):
        return

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        request = urllib2.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return
        except ValueError, err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = urllib.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader')
            return
        video_uploader = mobj.group(1).decode('utf-8')

        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader,
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
            })
        except UnavailableFormatError:
            self._downloader.trouble(u'ERROR: format not available for video')


class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
    _youtube_ie = None
    _max_youtube_results = 1000

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    @staticmethod
    def suitable(url):
        return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search results page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, query):
        mobj = re.match(self._VALID_QUERY, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
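        # Strip the literal 'ytsearch' (8 characters); the remainder is '',
        # 'all', or the requested number of results.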
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = long(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
            request = urllib2.Request(result_url, None, std_headers)
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
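                # The match looks like href="/watch?v=ID": split on '=', take
                # the third piece and drop the trailing quote to get the id.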
                video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
                return

            pagenum = pagenum + 1

class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:view_play_list|my_playlists)\?.*?p=([^&]+).*'
    _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
    _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&page=%s'
    _youtube_ie = None

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    @staticmethod
    def suitable(url):
        return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download playlist pages
        playlist_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if (self._MORE_PAGES_INDICATOR % (playlist_id.upper(), pagenum + 1)) not in page:
                break
            pagenum = pagenum + 1

        for id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
        return

class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
    _youtube_ie = None

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    @staticmethod
    def suitable(url):
        return (re.match(YoutubeUserIE._VALID_URL, url) is not None)

    def report_download_page(self, username):
        """Report attempt to download user page."""
        self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download user page
        username = mobj.group(1)
        video_ids = []
        pagenum = 1

        self.report_download_page(username)
        request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
        try:
            page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
            return

        # Extract video identifiers
        ids_in_page = []

        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        for id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
        return

class PostProcessor(object):
    """Post Processor class.

    PostProcessor objects can be added to downloaders with their
    add_post_processor() method. When the downloader has finished a
    successful download, it will take its internal chain of PostProcessors
    and start calling the run() method on each one of them, first with
    an initial argument and then with the returned value of the previous
    PostProcessor.

    The chain will be stopped if one of them ever returns None or the end
    of the chain is reached.

    PostProcessor objects follow a "mutual registration" process similar
    to InfoExtractor objects.
    """

    _downloader = None

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Sets the downloader for this PP."""
        self._downloader = downloader

    def run(self, information):
        """Run the PostProcessor.

        The "information" argument is a dictionary like the ones
        composed by InfoExtractors. The only difference is that this
        one has an extra field called "filepath" that points to the
        downloaded file.

        When this method returns None, the postprocessing chain is
        stopped. However, this method may return an information
        dictionary that will be passed to the next postprocessing
        object in the chain. It can be the one it received after
        changing some fields.

        In addition, this method may raise a PostProcessingError
        exception that will be taken into account by the downloader
        it was called from.
        """
        return information # by default, do nothing

### MAIN PROGRAM ###
if __name__ == '__main__':
    try:
        # Modules needed only when running the main program
        import getpass
        import optparse

        # Function to update the program file with the latest version from bitbucket.org
        def update_self(downloader, filename):
            # Note: downloader only used for options
            if not os.access(filename, os.W_OK):
                sys.exit('ERROR: no write permissions on %s' % filename)

            downloader.to_stdout('Updating to latest stable version...')
            latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
            latest_version = urllib.urlopen(latest_url).read().strip()
            prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
            newcontent = urllib.urlopen(prog_url).read()
            stream = open(filename, 'w')
            stream.write(newcontent)
            stream.close()
            downloader.to_stdout('Updated to version %s' % latest_version)

        # General configuration
        urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
        urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
        socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

        # Parse command line
        parser = optparse.OptionParser(
            usage='Usage: %prog [options] url...',
            version='2010.03.07',
            conflict_handler='resolve',
        )

        parser.add_option('-h', '--help',
                action='help', help='print this help text and exit')
        parser.add_option('-v', '--version',
                action='version', help='print program version and exit')
        parser.add_option('-U', '--update',
                action='store_true', dest='update_self', help='update this program to latest stable version')
        parser.add_option('-i', '--ignore-errors',
                action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
        parser.add_option('-r', '--rate-limit',
                dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')

        authentication = optparse.OptionGroup(parser, 'Authentication Options')
        authentication.add_option('-u', '--username',
                dest='username', metavar='UN', help='account username')
        authentication.add_option('-p', '--password',
                dest='password', metavar='PW', help='account password')
        authentication.add_option('-n', '--netrc',
                action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
        parser.add_option_group(authentication)

        video_format = optparse.OptionGroup(parser, 'Video Format Options')
        video_format.add_option('-f', '--format',
                action='store', dest='format', metavar='FMT', help='video format code')
        video_format.add_option('-b', '--best-quality',
                action='store_const', dest='format', help='download the best quality video possible', const='0')
        video_format.add_option('-m', '--mobile-version',
                action='store_const', dest='format', help='alias for -f 17', const='17')
        video_format.add_option('-d', '--high-def',
                action='store_const', dest='format', help='alias for -f 22', const='22')
        parser.add_option_group(video_format)

        verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
        verbosity.add_option('-q', '--quiet',
                action='store_true', dest='quiet', help='activates quiet mode', default=False)
        verbosity.add_option('-s', '--simulate',
                action='store_true', dest='simulate', help='do not download video', default=False)
        verbosity.add_option('-g', '--get-url',
                action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
        verbosity.add_option('-e', '--get-title',
                action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
        verbosity.add_option('--no-progress',
                action='store_true', dest='noprogress', help='do not print progress bar', default=False)
        parser.add_option_group(verbosity)

        filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
        filesystem.add_option('-t', '--title',
                action='store_true', dest='usetitle', help='use title in file name', default=False)
        filesystem.add_option('-l', '--literal',
                action='store_true', dest='useliteral', help='use literal title in file name', default=False)
        filesystem.add_option('-o', '--output',
                dest='outtmpl', metavar='TPL', help='output filename template')
        filesystem.add_option('-a', '--batch-file',
                dest='batchfile', metavar='F', help='file containing URLs to download')
        filesystem.add_option('-w', '--no-overwrites',
                action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
        filesystem.add_option('-c', '--continue',
                action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
        parser.add_option_group(filesystem)

        (opts, args) = parser.parse_args()

        # Batch file verification
        batchurls = []
        if opts.batchfile is not None:
            try:
                batchurls = open(opts.batchfile, 'r').readlines()
                batchurls = [x.strip() for x in batchurls]
                batchurls = [x for x in batchurls if len(x) > 0]
            except IOError:
                sys.exit(u'ERROR: batch file could not be read')
        all_urls = batchurls + args

        # Conflicting, missing and erroneous options
        if opts.usenetrc and (opts.username is not None or opts.password is not None):
            parser.error(u'using .netrc conflicts with giving username/password')
        if opts.password is not None and opts.username is None:
            parser.error(u'account username missing')
        if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
            parser.error(u'using output template conflicts with using title or literal title')
        if opts.usetitle and opts.useliteral:
            parser.error(u'using title conflicts with using literal title')
        if opts.username is not None and opts.password is None:
            opts.password = getpass.getpass(u'Type account password and press return:')
        if opts.ratelimit is not None:
            numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
            if numeric_limit is None:
                parser.error(u'invalid rate limit specified')
            opts.ratelimit = numeric_limit

        # Information extractors
        youtube_ie = YoutubeIE()
        metacafe_ie = MetacafeIE(youtube_ie)
        youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
        youtube_user_ie = YoutubeUserIE(youtube_ie)
        youtube_search_ie = YoutubeSearchIE(youtube_ie)
        google_ie = GoogleIE()
        photobucket_ie = PhotobucketIE()
        generic_ie = GenericIE()

        # File downloader
        fd = FileDownloader({
            'usenetrc': opts.usenetrc,
            'username': opts.username,
            'password': opts.password,
            'quiet': (opts.quiet or opts.geturl or opts.gettitle),
            'forceurl': opts.geturl,
            'forcetitle': opts.gettitle,
            'simulate': (opts.simulate or opts.geturl or opts.gettitle),
            'format': opts.format,
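            # Python 2 "and/or" ladder: pick the first applicable template,
            # i.e. an explicit -o template, else the -t or -l naming scheme,
            # else the default '%(id)s.%(ext)s'.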
            'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
                or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
                or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
                or u'%(id)s.%(ext)s'),
            'ignoreerrors': opts.ignoreerrors,
            'ratelimit': opts.ratelimit,
            'nooverwrites': opts.nooverwrites,
            'continuedl': opts.continue_dl,
            'noprogress': opts.noprogress,
        })
        fd.add_info_extractor(youtube_search_ie)
        fd.add_info_extractor(youtube_pl_ie)
        fd.add_info_extractor(youtube_user_ie)
        fd.add_info_extractor(metacafe_ie)
        fd.add_info_extractor(youtube_ie)
        fd.add_info_extractor(google_ie)
        fd.add_info_extractor(photobucket_ie)

        # This must come last since it's the
        # fallback if none of the others work
        fd.add_info_extractor(generic_ie)

        # Update version
        if opts.update_self:
            update_self(fd, sys.argv[0])

        # Maybe do nothing
        if len(all_urls) < 1:
            if not opts.update_self:
                parser.error(u'you must provide at least one URL')
            else:
                sys.exit()
        retcode = fd.download(all_urls)
        sys.exit(retcode)

    except DownloadError:
        sys.exit(1)
    except SameFileError:
        sys.exit(u'ERROR: fixed output name but more than one file to download')
    except KeyboardInterrupt:
        sys.exit(u'\nERROR: Interrupted by user')