#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Ricardo Garcia Gonzalez
# Author: Danny Colligan
# Author: Benjamin Johnson
# License: Public domain code
import htmlentitydefs
import httplib
import locale
import math
import netrc
import os
import os.path
import re
import socket
import string
import subprocess
import sys
import time
import urllib
import urllib2

# parse_qs was moved from the cgi module to the urlparse module recently.
try:
    from urlparse import parse_qs
except ImportError:
    from cgi import parse_qs

std_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
    'Accept-Language': 'en-us,en;q=0.5',
}

simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')

def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    def yield_preferredencoding():
        try:
            pref = locale.getpreferredencoding()
            u'TEST'.encode(pref)
        except:
            pref = 'UTF-8'
        while True:
            yield pref
    return yield_preferredencoding().next()

def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Unicode character
    mobj = re.match(ur'(?u)#(x?\d+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        return unichr(long(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)

def sanitize_title(utitle):
    """Sanitizes a video title so it could be used as part of a filename."""
    utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
    return utitle.replace(unicode(os.sep), u'%')

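# Illustrative sketch (not part of the original script): given the rules
# above, a title like u'Foo &amp; Bar/Baz' becomes u'Foo & Bar%Baz' on a
# Unix system, since the named entity is decoded and the path separator
# '/' is mapped to '%':
#
#   >>> sanitize_title(u'Foo &amp; Bar/Baz')
#   u'Foo & Bar%Baz'
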
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        stream = open(filename, open_mode)
        return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(filename, open_mode)
        return (stream, filename)

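# Illustrative sketch: on a filesystem that rejects the characters
# <>:"|?* (as win32 does), the first open() fails and sanitize_open()
# retries once with those characters mapped to '#':
#
#   >>> sanitize_open(u'clip: part?.flv', 'wb')   # hypothetical win32 run
#   (<open file ...>, u'clip# part#.flv')
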
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    pass

class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass

class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    pass

class UnavailableFormatError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass

class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected

class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible for downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. Given a video URL, the downloader doesn't know how to
    extract the needed information itself; that is the task of the
    InfoExtractors, so it has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader hands it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:      Username for authentication purposes.
    password:      Password for authentication purposes.
    usenetrc:      Use netrc for authentication instead.
    quiet:         Do not print messages to stdout.
    forceurl:      Force printing final URL.
    forcetitle:    Force printing title.
    simulate:      Do not download the video files.
    format:        Video format code.
    outtmpl:       Template for output names.
    ignoreerrors:  Do not stop on download errors.
    ratelimit:     Download speed limit, in bytes/sec.
    nooverwrites:  Prevent overwriting files.
    continuedl:    Try to continue downloads if possible.
    """

    params = None
    _ies = []
    _pps = []
    _download_retcode = None

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        self._ies = []
        self._pps = []
        self._download_retcode = 0
        self.params = params

    @staticmethod
    def pmkdir(filename):
        """Create directory components in filename. Similar to Unix "mkdir -p"."""
        components = filename.split(os.sep)
        aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
        aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
        for dir in aggregate:
            if not os.path.exists(dir):
                os.mkdir(dir)

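    # Illustrative sketch: pmkdir() creates every missing directory on the
    # way to the file, but not the file itself. For example,
    # pmkdir('clips/2010/video.flv') creates 'clips/' and 'clips/2010/'
    # if they do not exist yet; 'video.flv' is left for open() to create.
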
    @staticmethod
    def format_bytes(bytes):
        if bytes is None:
            return 'N/A'
        if type(bytes) is str:
            bytes = float(bytes)
        if bytes == 0.0:
            exponent = 0
        else:
            exponent = long(math.log(bytes, 1024.0))
        suffix = 'bkMGTPEZY'[exponent]
        converted = float(bytes) / float(1024**exponent)
        return '%.2f%s' % (converted, suffix)

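    # Illustrative examples, assuming the math above:
    #
    #   >>> FileDownloader.format_bytes(1024)
    #   '1.00k'
    #   >>> FileDownloader.format_bytes(1536)
    #   '1.50k'
    #   >>> FileDownloader.format_bytes(None)
    #   'N/A'
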
    @staticmethod
    def calc_percent(byte_counter, data_len):
        if data_len is None:
            return '---.-%'
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

    @staticmethod
    def calc_eta(start, now, total, current):
        if total is None:
            return '--:--'
        dif = now - start
        if current == 0 or dif < 0.001: # One millisecond
            return '--:--'
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        if eta_mins > 99:
            return '--:--'
        return '%02d:%02d' % (eta_mins, eta_secs)

    @staticmethod
    def calc_speed(start, now, bytes):
        dif = now - start
        if bytes == 0 or dif < 0.001: # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    @staticmethod
    def best_block_size(elapsed_time, bytes):
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
            return long(new_max)
        rate = bytes / elapsed_time
        if rate > new_max:
            return long(new_max)
        if rate < new_min:
            return long(new_min)
        return long(rate)

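    # Illustrative sketch of the adaptive block size: the next read is the
    # measured rate of the previous read (bytes per second), clamped to
    # [bytes/2, bytes*2] and capped at 4 MB. E.g. if 1024 bytes took 0.5s
    # (a rate of 2048 B/s), the next block is 2048 bytes, the upper clamp.
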
    @staticmethod
    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer."""
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        if matchobj is None:
            return None
        number = float(matchobj.group(1))
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return long(round(number * multiplier))

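    # Illustrative examples, assuming the regex and multipliers above:
    #
    #   >>> FileDownloader.parse_bytes('50k')
    #   51200L
    #   >>> FileDownloader.parse_bytes('44.6m')
    #   46766490L
    #   >>> FileDownloader.parse_bytes('fast') is None
    #   True
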
    @staticmethod
    def verify_url(url):
        """Verify a URL is valid and data could be downloaded. Return real data URL."""
        request = urllib2.Request(url, None, std_headers)
        data = urllib2.urlopen(request)
        data.read(1)
        url = data.geturl()
        data.close()
        return url

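    # Note: reading a single byte above checks that data can actually be
    # fetched, and geturl() reports the URL after any redirects, so callers
    # get back the final data URL or see the exception themselves.
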
    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        ie.set_downloader(self)

    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        self._pps.append(pp)
        pp.set_downloader(self)

    def to_stdout(self, message, skip_eol=False):
        """Print message to stdout if not in quiet mode."""
        if not self.params.get('quiet', False):
            print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
            sys.stdout.flush()

    def to_stderr(self, message):
        """Print message to stderr."""
        print >>sys.stderr, message.encode(preferredencoding())

    def fixed_template(self):
        """Checks if the output template is fixed."""
        return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on whether the downloader has been configured to
        ignore download errors or not, this method may throw an
        exception when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        self._download_retcode = 1

    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
            return
        now = time.time()
        elapsed = now - start_time
        if elapsed <= 0.0:
            return
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

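    # Illustrative derivation of the sleep above: at the rate limit, the
    # bytes seen so far should have taken byte_counter / rate_limit
    # seconds; the sleep is that ideal time minus the time actually
    # elapsed, i.e. byte_counter / rate_limit - (now - start_time).
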
    def report_destination(self, filename):
        """Report destination filename."""
        self.to_stdout(u'[download] Destination: %s' % filename)

    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

    def report_resuming_byte(self, resume_len):
        """Report attempt to resume at given byte."""
        self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        self.to_stdout(u'[download] %s has already been downloaded' % file_name)

    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_stdout(u'[download] Unable to resume')

    def report_finish(self):
        """Report download finished."""
        self.to_stdout(u'')

    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""
        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            # Verify URL if it's an HTTP one
            if info_dict['url'].startswith('http'):
                try:
                    info_dict['url'] = self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
                except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
                    raise UnavailableFormatError

            # Forced printings
            if self.params.get('forcetitle', False):
                print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forceurl', False):
                print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')

            return

        try:
            template_dict = dict(info_dict)
            template_dict['epoch'] = unicode(long(time.time()))
            filename = self.params['outtmpl'] % template_dict
        except (ValueError, KeyError), err:
            self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
            return # "filename" is not defined at this point; bail out
        if self.params.get('nooverwrites', False) and os.path.exists(filename):
            self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
            return

        try:
            self.pmkdir(filename)
        except (OSError, IOError), err:
            self.trouble('ERROR: unable to create directories: %s' % str(err))
            return

        try:
            success = self._do_download(filename, info_dict['url'].encode('utf-8'))
        except (OSError, IOError), err:
            raise UnavailableFormatError
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.trouble('ERROR: unable to download video data: %s' % str(err))
            return
        except (ContentTooShortError, ), err:
            self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
            return

        if success:
            try:
                self.post_process(filename, info_dict)
            except (PostProcessingError), err:
                self.trouble('ERROR: postprocessing: %s' % str(err))
                return

    def download(self, url_list):
        """Download a given list of URLs."""
        if len(url_list) > 1 and self.fixed_template():
            raise SameFileError(self.params['outtmpl'])

        for url in url_list:
            suitable_found = False
            for ie in self._ies:
                # Go to next InfoExtractor if not suitable
                if not ie.suitable(url):
                    continue

                # Suitable InfoExtractor found
                suitable_found = True

                # Extract information from URL and process it
                ie.extract(url)

                # Suitable InfoExtractor had been found; go to next URL
                break

            if not suitable_found:
                self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

        return self._download_retcode

    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file."""
        info = dict(ie_info)
        info['filepath'] = filename
        for pp in self._pps:
            info = pp.run(info)
            if info is None:
                break

    def _download_with_rtmpdump(self, filename, url):
        self.report_destination(filename)

        # Check for rtmpdump first
        try:
            subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
            return False

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrupted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q', '-r', url, '-o', filename]
        retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
        while retval == 2 or retval == 1:
            self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename), skip_eol=True)
            time.sleep(2.0) # This seems to be needed
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
        if retval == 0:
            self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
            return True
        else:
            self.trouble('ERROR: rtmpdump exited with code %d' % retval)
            return False

    def _do_download(self, filename, url):
        # Attempt to download using rtmpdump
        if url.startswith('rtmp'):
            return self._download_with_rtmpdump(filename, url)

        stream = None
        open_mode = 'wb'
        basic_request = urllib2.Request(url, None, std_headers)
        request = urllib2.Request(url, None, std_headers)

        # Establish possible resume length
        if os.path.isfile(filename):
            resume_len = os.path.getsize(filename)
        else:
            resume_len = 0

        # Request parameters in case of being able to resume
        if self.params.get('continuedl', False) and resume_len != 0:
            self.report_resuming_byte(resume_len)
            request.add_header('Range', 'bytes=%d-' % resume_len)
            open_mode = 'ab'

        # Establish connection
        try:
            data = urllib2.urlopen(request)
        except (urllib2.HTTPError, ), err:
            if err.code != 416: # 416 is 'Requested range not satisfiable'
                raise
            # Unable to resume
            data = urllib2.urlopen(basic_request)
            content_length = data.info()['Content-Length']

            if content_length is not None and long(content_length) == resume_len:
                # Because the file had already been fully downloaded
                self.report_file_already_downloaded(filename)
                return True
            else:
                # Because the server didn't let us
                self.report_unable_to_resume()
                open_mode = 'wb'

        data_len = data.info().get('Content-length', None)
        data_len_str = self.format_bytes(data_len)
        byte_counter = 0
        block_size = 1024
        start = time.time()
        while True:
            # Download and write
            before = time.time()
            data_block = data.read(block_size)
            after = time.time()
            data_block_len = len(data_block)
            if data_block_len == 0:
                break
            byte_counter += data_block_len

            # Open file just in time
            if stream is None:
                try:
                    (stream, filename) = sanitize_open(filename, open_mode)
                    self.report_destination(filename)
                except (OSError, IOError), err:
                    self.trouble('ERROR: unable to open for writing: %s' % str(err))
                    return False
            stream.write(data_block)
            block_size = self.best_block_size(after - before, data_block_len)

            # Progress message
            percent_str = self.calc_percent(byte_counter, data_len)
            eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
            speed_str = self.calc_speed(start, time.time(), byte_counter)
            self.report_progress(percent_str, data_len_str, speed_str, eta_str)

            # Apply rate limit
            self.slow_down(start, byte_counter)

        self.report_finish()
        if data_len is not None and str(byte_counter) != data_len:
            raise ContentTooShortError(byte_counter, long(data_len))
        return True

class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id:       Video identifier.
    url:      Final video URL.
    uploader: Nickname of the video uploader.
    title:    Literal title.
    stitle:   Simplified title.
    ext:      Video filename extension.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    downloader.
    """

    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @staticmethod
    def suitable(url):
        """Receives a URL and returns True if suitable for this IE."""
        return False

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

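# Illustrative sketch (not part of the original program): a minimal
# InfoExtractor subclass only needs suitable(), _real_initialize() and
# _real_extract(), and hands its result to the registered downloader.
# The names below (ExampleIE, example.com) are made up for illustration:
#
#   class ExampleIE(InfoExtractor):
#       @staticmethod
#       def suitable(url):
#           return url.startswith('http://example.com/')
#
#       def _real_initialize(self):
#           pass # no authentication needed
#
#       def _real_extract(self, url):
#           self._downloader.process_info({
#               'id': u'42', 'url': u'http://example.com/42.flv',
#               'uploader': u'NA', 'title': u'Example video',
#               'stitle': u'Example_video', 'ext': u'flv',
#           })
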
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?[\?#](?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    _available_formats = ['37', '22', '35', '18', '5', '17', '13', None] # listed in order of priority for -b flag
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
    }

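    # For reference, _VALID_URL above accepts the usual URL shapes as well
    # as bare video identifiers, e.g. http://www.youtube.com/watch?v=ID,
    # http://youtube.com/v/ID, http://www.youtube.com/watch.php?v=ID, or
    # simply ID on its own.
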
    @staticmethod
    def suitable(url):
        return (re.match(YoutubeIE._VALID_URL, url) is not None)

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_stdout(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_stdout(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_stdout(u'[youtube] Confirming age')

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested video format is not available."""
        self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_stdout(u'[youtube] RTMP download detected')

    def _real_initialize(self):
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        # Set language
        request = urllib2.Request(self._LANG_URL, None, std_headers)
        try:
            self.report_lang()
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
            'current_form': 'loginForm',
            'next': '/',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
        try:
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Downloader parameters
        best_quality = False
        format_param = None
        quality_index = 0
        if self._downloader is not None:
            params = self._downloader.params
            format_param = params.get('format', None)
            if format_param == '0':
                format_param = self._available_formats[quality_index]
                best_quality = True

        while True:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            # Get video info
            video_info_url = 'http://www.youtube.com/get_video_info?&video_id=%s&el=detailpage&ps=default&eurl=&gl=US&hl=en' % video_id
            request = urllib2.Request(video_info_url, None, std_headers)
            try:
                self.report_video_info_webpage_download(video_id)
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                return
            self.report_information_extraction(video_id)

            # "t" param
            if 'token' not in video_info:
                # Attempt to see if YouTube has issued an error message
                if 'reason' not in video_info:
                    self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
                    stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
                    stream.write(video_info_webpage)
                    stream.close()
                else:
                    reason = urllib.unquote_plus(video_info['reason'][0])
                    self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
                return
            token = urllib.unquote_plus(video_info['token'][0])
            video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
            if format_param is not None:
                video_real_url = '%s&fmt=%s' % (video_real_url, format_param)

            # Check possible RTMP download
            if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
                self.report_rtmp_download()
                video_real_url = video_info['conn'][0]

            # uploader
            if 'author' not in video_info:
                self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
                return
            video_uploader = urllib.unquote_plus(video_info['author'][0])

            # title
            if 'title' not in video_info:
                self._downloader.trouble(u'ERROR: unable to extract video title')
                return
            video_title = urllib.unquote_plus(video_info['title'][0])
            video_title = video_title.decode('utf-8')
            video_title = sanitize_title(video_title)

            # simplified title
            simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
            simple_title = simple_title.strip(ur'_')

            try:
                # Process video information
                self._downloader.process_info({
                    'id': video_id.decode('utf-8'),
                    'url': video_real_url.decode('utf-8'),
                    'uploader': video_uploader.decode('utf-8'),
                    'title': video_title,
                    'stitle': simple_title,
                    'ext': video_extension.decode('utf-8'),
                })

                return

            except UnavailableFormatError, err:
                if best_quality:
                    if quality_index == len(self._available_formats) - 1:
                        # I don't ever expect this to happen
                        self._downloader.trouble(u'ERROR: no known formats available for video')
                        return
                    else:
                        self.report_unavailable_format(video_id, format_param)
                        quality_index += 1
                        format_param = self._available_formats[quality_index]
                        continue
                else:
                    self._downloader.trouble('ERROR: format not available for video')
                    return


class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    _youtube_ie = None

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    @staticmethod
    def suitable(url):
        return (re.match(MetacafeIE._VALID_URL, url) is not None)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_stdout(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER, None, std_headers)
        try:
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
        try:
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
            return

        simple_title = mobj.group(2).decode('utf-8')
        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))

        #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        #if mobj is None:
        #    self._downloader.trouble(u'ERROR: unable to extract gdaKey')
        #    return
        #gdaKey = mobj.group(1)
        #
        #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

        video_url = mediaURL

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
            })
        except UnavailableFormatError:
            self._downloader.trouble(u'ERROR: format not available for video')


class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @staticmethod
    def suitable(url):
        return (re.match(GoogleIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)

    def _real_initialize(self):
        return

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        if mobj is None:
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        # Google Video doesn't show uploader nicknames?
        video_uploader = 'NA'

        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'title': video_title,
                'stitle': video_title,
                'ext': video_extension.decode('utf-8'),
            })
        except UnavailableFormatError:
            self._downloader.trouble(u'ERROR: format not available for video')


class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @staticmethod
    def suitable(url):
        return (re.match(PhotobucketIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)

    def _real_initialize(self):
        return

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        video_uploader = mobj.group(2).decode('utf-8')

        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader,
                'title': video_title,
                'stitle': video_title,
                'ext': video_extension.decode('utf-8'),
            })
        except UnavailableFormatError:
            self._downloader.trouble(u'ERROR: format not available for video')


class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @staticmethod
    def suitable(url):
        return True

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)

    def _real_initialize(self):
        return

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        request = urllib2.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return
        except ValueError, err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
                return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = urllib.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader (domain name)')
            return
        video_uploader = mobj.group(1).decode('utf-8')

        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader,
                'title': video_title,
                'stitle': video_title,
                'ext': video_extension.decode('utf-8'),
            })
        except UnavailableFormatError:
            self._downloader.trouble(u'ERROR: format not available for video')


class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
    _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
    _youtube_ie = None
    _max_youtube_results = 1000

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    @staticmethod
    def suitable(url):
        return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, query):
        mobj = re.match(self._VALID_QUERY, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = long(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
            request = urllib2.Request(result_url, None, std_headers)
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                # The match looks like href="/watch?v=ID"; take the text
                # after the second '=' and drop the closing quote
                video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
                    return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
                return

            pagenum = pagenum + 1

class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube\.com/(?:view_play_list|my_playlists)\?.*?p=([^&]+).*'
    _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
    _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&page=%s'
    _youtube_ie = None

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    @staticmethod
    def suitable(url):
        return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download playlist pages
        playlist_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if (self._MORE_PAGES_INDICATOR % (playlist_id.upper(), pagenum + 1)) not in page:
                break
            pagenum = pagenum + 1

        for id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
        return

class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube\.com/user/(.*)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
    _youtube_ie = None

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    @staticmethod
    def suitable(url):
        return (re.match(YoutubeUserIE._VALID_URL, url) is not None)

    def report_download_page(self, username):
        """Report attempt to download user page."""
        self._downloader.to_stdout(u'[youtube] user %s: Downloading page' % (username))

    def _real_initialize(self):
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download user page
        username = mobj.group(1)
        video_ids = []
        pagenum = 1

        self.report_download_page(username)
        request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
        try:
            page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
            return

        # Extract video identifiers
        ids_in_page = []

        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        for id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
        return

class PostProcessor(object):
    """Post Processor class.

    PostProcessor objects can be added to downloaders with their
    add_post_processor() method. When the downloader has finished a
    successful download, it will take its internal chain of PostProcessors
    and start calling the run() method on each one of them, first with
    an initial argument and then with the returned value of the previous
    PostProcessor.

    The chain will be stopped if one of them ever returns None or the end
    of the chain is reached.

    PostProcessor objects follow a "mutual registration" process similar
    to InfoExtractor objects.
    """

    _downloader = None

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Sets the downloader for this PP."""
        self._downloader = downloader

    def run(self, information):
        """Run the PostProcessor.

        The "information" argument is a dictionary like the ones
        composed by InfoExtractors. The only difference is that this
        one has an extra field called "filepath" that points to the
        downloaded file.

        When this method returns None, the postprocessing chain is
        stopped. However, this method may return an information
        dictionary that will be passed to the next postprocessing
        object in the chain. It can be the one it received after
        changing some fields.

        In addition, this method may raise a PostProcessingError
        exception that will be taken into account by the downloader
        it was called from.
        """
        return information # by default, do nothing

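# Illustrative sketch (not part of the original program): a postprocessor
# that merely reports the final path would override run() and return the
# dictionary unchanged so the chain continues. The class name below is
# made up for illustration:
#
#   class PrintPathPP(PostProcessor):
#       def run(self, information):
#           self._downloader.to_stdout(u'[postprocess] saved to %s' % information['filepath'])
#           return information
#
#   fd.add_post_processor(PrintPathPP())
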
### MAIN PROGRAM ###
if __name__ == '__main__':
    try:
        # Modules needed only when running the main program
        import getpass
        import optparse

        # Function to update the program file with the latest version from bitbucket.org
        def update_self(downloader, filename):
            # Note: downloader only used for options
            if not os.access(filename, os.W_OK):
                sys.exit('ERROR: no write permissions on %s' % filename)

            downloader.to_stdout('Updating to latest stable version...')
            latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
            latest_version = urllib.urlopen(latest_url).read().strip()
            prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
            newcontent = urllib.urlopen(prog_url).read()
            stream = open(filename, 'w')
            stream.write(newcontent)
            stream.close()
            downloader.to_stdout('Updated to version %s' % latest_version)

        # General configuration
        urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
        urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
        socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

        # Parse command line
        parser = optparse.OptionParser(
            usage='Usage: %prog [options] url...',
            version='2010.02.13',
            conflict_handler='resolve',
        )

        parser.add_option('-h', '--help',
                action='help', help='print this help text and exit')
        parser.add_option('-v', '--version',
                action='version', help='print program version and exit')
        parser.add_option('-U', '--update',
                action='store_true', dest='update_self', help='update this program to latest stable version')
        parser.add_option('-i', '--ignore-errors',
                action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
        parser.add_option('-r', '--rate-limit',
                dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')

        authentication = optparse.OptionGroup(parser, 'Authentication Options')
        authentication.add_option('-u', '--username',
                dest='username', metavar='UN', help='account username')
        authentication.add_option('-p', '--password',
                dest='password', metavar='PW', help='account password')
        authentication.add_option('-n', '--netrc',
                action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
        parser.add_option_group(authentication)

        video_format = optparse.OptionGroup(parser, 'Video Format Options')
        video_format.add_option('-f', '--format',
                action='store', dest='format', metavar='FMT', help='video format code')
        video_format.add_option('-b', '--best-quality',
                action='store_const', dest='format', help='download the best quality video possible', const='0')
        video_format.add_option('-m', '--mobile-version',
                action='store_const', dest='format', help='alias for -f 17', const='17')
        video_format.add_option('-d', '--high-def',
                action='store_const', dest='format', help='alias for -f 22', const='22')
        parser.add_option_group(video_format)

        verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
        verbosity.add_option('-q', '--quiet',
                action='store_true', dest='quiet', help='activates quiet mode', default=False)
        verbosity.add_option('-s', '--simulate',
                action='store_true', dest='simulate', help='do not download video', default=False)
        verbosity.add_option('-g', '--get-url',
                action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
        verbosity.add_option('-e', '--get-title',
                action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
        parser.add_option_group(verbosity)

        filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
        filesystem.add_option('-t', '--title',
                action='store_true', dest='usetitle', help='use title in file name', default=False)
        filesystem.add_option('-l', '--literal',
                action='store_true', dest='useliteral', help='use literal title in file name', default=False)
        filesystem.add_option('-o', '--output',
                dest='outtmpl', metavar='TPL', help='output filename template')
        filesystem.add_option('-a', '--batch-file',
                dest='batchfile', metavar='F', help='file containing URLs to download')
        filesystem.add_option('-w', '--no-overwrites',
                action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
        filesystem.add_option('-c', '--continue',
                action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
        parser.add_option_group(filesystem)

        (opts, args) = parser.parse_args()

        # Batch file verification
        batchurls = []
        if opts.batchfile is not None:
            try:
                batchurls = open(opts.batchfile, 'r').readlines()
                batchurls = [x.strip() for x in batchurls]
                batchurls = [x for x in batchurls if len(x) > 0]
            except IOError:
                sys.exit(u'ERROR: batch file could not be read')
        all_urls = batchurls + args

        # Make sure all URLs are in our preferred encoding
        for i in range(0, len(all_urls)):
            all_urls[i] = unicode(all_urls[i], preferredencoding())

        # Conflicting, missing and erroneous options
        if opts.usenetrc and (opts.username is not None or opts.password is not None):
            parser.error(u'using .netrc conflicts with giving username/password')
        if opts.password is not None and opts.username is None:
            parser.error(u'account username missing')
        if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
            parser.error(u'using output template conflicts with using title or literal title')
        if opts.usetitle and opts.useliteral:
            parser.error(u'using title conflicts with using literal title')
        if opts.username is not None and opts.password is None:
            opts.password = getpass.getpass(u'Type account password and press return:')
        if opts.ratelimit is not None:
            numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
            if numeric_limit is None:
                parser.error(u'invalid rate limit specified')
            opts.ratelimit = numeric_limit

        # Information extractors
        youtube_ie = YoutubeIE()
        metacafe_ie = MetacafeIE(youtube_ie)
        youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
        youtube_user_ie = YoutubeUserIE(youtube_ie)
        youtube_search_ie = YoutubeSearchIE(youtube_ie)
        google_ie = GoogleIE()
        photobucket_ie = PhotobucketIE()
        generic_ie = GenericIE()

        # File downloader
        fd = FileDownloader({
            'usenetrc': opts.usenetrc,
            'username': opts.username,
            'password': opts.password,
            'quiet': (opts.quiet or opts.geturl or opts.gettitle),
            'forceurl': opts.geturl,
            'forcetitle': opts.gettitle,
            'simulate': (opts.simulate or opts.geturl or opts.gettitle),
            'format': opts.format,
            'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
                or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
                or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
                or u'%(id)s.%(ext)s'),
            'ignoreerrors': opts.ignoreerrors,
            'ratelimit': opts.ratelimit,
            'nooverwrites': opts.nooverwrites,
            'continuedl': opts.continue_dl,
        })
        fd.add_info_extractor(youtube_search_ie)
        fd.add_info_extractor(youtube_pl_ie)
        fd.add_info_extractor(youtube_user_ie)
        fd.add_info_extractor(metacafe_ie)
        fd.add_info_extractor(youtube_ie)
        fd.add_info_extractor(google_ie)
        fd.add_info_extractor(photobucket_ie)

        # This must come last since it's the
        # fallback if none of the others work
        fd.add_info_extractor(generic_ie)

        # Update version
        if opts.update_self:
            update_self(fd, sys.argv[0])

        # Maybe do nothing
        if len(all_urls) < 1:
            if not opts.update_self:
                parser.error(u'you must provide at least one URL')
            else:
                sys.exit()
        retcode = fd.download(all_urls)
        sys.exit(retcode)

    except DownloadError:
        sys.exit(1)
    except SameFileError:
        sys.exit(u'ERROR: fixed output name but more than one file to download')
    except KeyboardInterrupt:
        sys.exit(u'\nERROR: Interrupted by user')