]> Raphaël G. Git Repositories - youtubedl/blob - youtube-dl
Imported Debian patch 2010.04.04-1
[youtubedl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
7 import htmlentitydefs
8 import httplib
9 import locale
10 import math
11 import netrc
12 import os
13 import os.path
14 import re
15 import socket
16 import string
17 import subprocess
18 import sys
19 import time
20 import urllib
21 import urllib2
22
23 # parse_qs was moved from the cgi module to the urlparse module recently.
24 try:
25 from urlparse import parse_qs
26 except ImportError:
27 from cgi import parse_qs
28
# Standard HTTP headers sent with every request. The User-Agent mimics a
# real browser because some sites serve different pages to unknown clients.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
	'Accept-Language': 'en-us,en;q=0.5',
}

# Characters considered safe for "simplified" titles: ASCII letters and digits.
# The .decode('ascii') calls turn the byte strings into unicode (Python 2).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
37
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks: if the
	reported encoding is unusable (e.g. a broken locale), fall back
	to UTF-8.
	"""
	# The original wrapped this one-shot value in an infinite generator
	# and called .next() on it; a plain try/except is equivalent and
	# does not depend on the Python 2-only .next() method.
	try:
		pref = locale.getpreferredencoding()
		# Sanity check: make sure the codec machinery actually knows
		# the reported name before we commit to using it.
		u'TEST'.encode(pref)
	except Exception:
		pref = 'UTF-8'
	return pref
53
def htmlentity_transform(matchobj):
	"""Transforms an HTML entity to a Unicode character.

	This function receives a match object and is intended to be used with
	the re.sub() function.
	"""
	entity = matchobj.group(1)

	# Known non-numeric HTML entity
	if entity in htmlentitydefs.name2codepoint:
		return unichr(htmlentitydefs.name2codepoint[entity])

	# Unicode character
	# NOTE(review): \d matches only decimal digits, so hex entities that
	# contain the letters a-f (e.g. &#xA0;) do not match here and fall
	# through to the literal branch below -- confirm this is intended.
	mobj = re.match(ur'(?u)#(x?\d+)', entity)
	if mobj is not None:
		numstr = mobj.group(1)
		if numstr.startswith(u'x'):
			base = 16
			# Turn 'x123' into '0x123' so long() accepts it.
			numstr = u'0%s' % numstr
		else:
			base = 10
		return unichr(long(numstr, base))

	# Unknown entity in name, return its literal representation
	return (u'&%s;' % entity)
79
def sanitize_title(utitle):
	"""Sanitizes a video title so it could be used as part of a filename."""
	# First replace HTML entities with their unicode characters...
	utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
	# ...then neutralize the OS path separator so the title cannot
	# escape into another directory.
	return utitle.replace(unicode(os.sep), u'%')
84
def sanitize_open(filename, open_mode):
	"""Try to open the given filename, and slightly tweak it if this fails.

	Attempts to open the given filename. If this fails, it tries to change
	the filename slightly, step by step, until it's either able to open it
	or it fails and raises a final exception, like the standard open()
	function.

	It returns the tuple (stream, definitive_file_name).
	"""
	try:
		# '-' conventionally means "write to standard output".
		if filename == u'-':
			return (sys.stdout, filename)
		stream = open(filename, open_mode)
		return (stream, filename)
	except (IOError, OSError), err:
		# In case of error, try to remove win32 forbidden chars
		filename = re.sub(ur'[<>:"\|\?\*]', u'#', filename)

		# An exception here should be caught in the caller
		stream = open(filename, open_mode)
		return (stream, filename)
107
108
class DownloadError(Exception):
	"""Signals a fatal problem while downloading.

	FileDownloader objects raise this error when they have not been
	configured to keep going after failures; the exception carries the
	relevant error message.
	"""
	pass
117
class SameFileError(Exception):
	"""Signals that several downloads collide on one output file.

	FileDownloader objects raise this error when they notice that more
	than one of the requested videos would have to be written to the
	very same file on disk.
	"""
	pass
125
class PostProcessingError(Exception):
	"""Signals a failure inside a postprocessing step.

	A PostProcessor's .run() method may raise this error to report that
	its postprocessing task could not be completed.
	"""
	pass
133
class UnavailableFormatError(Exception):
	"""Signals that the requested video format does not exist.

	Raised when the user asks for a video in a format that the site
	does not offer for that particular video.
	"""
	pass
141
class ContentTooShortError(Exception):
	"""Signals that fewer bytes arrived than the server announced.

	FileDownloader objects raise this error when a finished download is
	smaller than the size the server advertised first, which usually
	means the connection was interrupted along the way.
	"""
	# Both counters are expressed in bytes.
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		# Record how much actually arrived versus how much was promised.
		self.downloaded, self.expected = downloaded, expected
156
class FileDownloader(object):
	"""File Downloader class.

	File downloader objects are the ones responsible of downloading the
	actual video file and writing it to disk if the user has requested
	it, among some other tasks. In most cases there should be one per
	program. As, given a video URL, the downloader doesn't know how to
	extract all the needed information, task that InfoExtractors do, it
	has to pass the URL to one of them.

	For this, file downloader objects have a method that allows
	InfoExtractors to be registered in a given order. When it is passed
	a URL, the file downloader handles it to the first InfoExtractor it
	finds that reports being able to handle it. The InfoExtractor extracts
	all the information about the video or videos the URL refers to, and
	asks the FileDownloader to process the video information, possibly
	downloading the video.

	File downloaders accept a lot of parameters. In order not to saturate
	the object constructor with arguments, it receives a dictionary of
	options instead. These options are available through the params
	attribute for the InfoExtractors to use. The FileDownloader also
	registers itself as the downloader in charge for the InfoExtractors
	that are added to it, so this is a "mutual registration".

	Available options:

	username: Username for authentication purposes.
	password: Password for authentication purposes.
	usenetrc: Use netrc for authentication instead.
	quiet: Do not print messages to stdout.
	forceurl: Force printing final URL.
	forcetitle: Force printing title.
	simulate: Do not download the video files.
	format: Video format code.
	outtmpl: Template for output names.
	ignoreerrors: Do not stop on download errors.
	ratelimit: Download speed limit, in bytes/sec.
	nooverwrites: Prevent overwriting files.
	continuedl: Try to continue downloads if possible.
	noprogress: Do not print the progress bar.
	"""

	# Class-level defaults; per-instance values are set in __init__.
	params = None			# option dictionary handed to the constructor
	_ies = []			# registered InfoExtractors, tried in order
	_pps = []			# registered PostProcessors, run as a chain
	_download_retcode = None	# overall return code (0 ok, 1 after any error)
	_num_downloads = None		# files downloaded so far (feeds %(ord)s)

	def __init__(self, params):
		"""Create a FileDownloader object with the given options."""
		self._ies = []
		self._pps = []
		self._download_retcode = 0
		self._num_downloads = 0
		self.params = params

	@staticmethod
	def pmkdir(filename):
		"""Create directory components in filename. Similar to Unix "mkdir -p"."""
		components = filename.split(os.sep)
		# Build progressively longer path prefixes (excluding the final
		# component, which is the file itself) and create missing ones.
		aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
		aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
		for dir in aggregate:
			if not os.path.exists(dir):
				os.mkdir(dir)

	@staticmethod
	def format_bytes(bytes):
		"""Return a human-readable string (e.g. '1.17M') for a byte count."""
		if bytes is None:
			return 'N/A'
		if type(bytes) is str:
			bytes = float(bytes)
		if bytes == 0.0:
			exponent = 0
		else:
			# 1024-based magnitude: 0 -> bytes, 1 -> kilo, 2 -> mega, ...
			exponent = long(math.log(bytes, 1024.0))
		suffix = 'bkMGTPEZY'[exponent]
		converted = float(bytes) / float(1024**exponent)
		return '%.2f%s' % (converted, suffix)

	@staticmethod
	def calc_percent(byte_counter, data_len):
		"""Return a fixed-width percentage string for the progress line."""
		if data_len is None:
			return '---.-%'
		return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

	@staticmethod
	def calc_eta(start, now, total, current):
		"""Estimate remaining download time, formatted as MM:SS."""
		if total is None:
			return '--:--'
		dif = now - start
		if current == 0 or dif < 0.001: # One millisecond
			return '--:--'
		rate = float(current) / dif
		eta = long((float(total) - float(current)) / rate)
		(eta_mins, eta_secs) = divmod(eta, 60)
		if eta_mins > 99:
			return '--:--'
		return '%02d:%02d' % (eta_mins, eta_secs)

	@staticmethod
	def calc_speed(start, now, bytes):
		"""Return the average download speed so far as a padded string."""
		dif = now - start
		if bytes == 0 or dif < 0.001: # One millisecond
			return '%10s' % '---b/s'
		return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

	@staticmethod
	def best_block_size(elapsed_time, bytes):
		"""Choose the next read size, adapting to the measured throughput."""
		# Never shrink/grow by more than a factor of two per read.
		new_min = max(bytes / 2.0, 1.0)
		new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
		if elapsed_time < 0.001:
			return long(new_max)
		rate = bytes / elapsed_time
		if rate > new_max:
			return long(new_max)
		if rate < new_min:
			return long(new_min)
		return long(rate)

	@staticmethod
	def parse_bytes(bytestr):
		"""Parse a string indicating a byte quantity into a long integer."""
		matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
		if matchobj is None:
			return None
		number = float(matchobj.group(1))
		# An empty suffix finds index 0 ('b'), i.e. a multiplier of 1.
		multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
		return long(round(number * multiplier))

	@staticmethod
	def verify_url(url):
		"""Verify a URL is valid and data could be downloaded. Return real data URL."""
		request = urllib2.Request(url, None, std_headers)
		data = urllib2.urlopen(request)
		# Force an actual read so broken URLs fail here rather than later.
		data.read(1)
		# geturl() may differ from the input URL after redirects.
		url = data.geturl()
		data.close()
		return url

	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		self._ies.append(ie)
		ie.set_downloader(self)

	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		self._pps.append(pp)
		pp.set_downloader(self)

	def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode."""
		try:
			if not self.params.get('quiet', False):
				# The trailing comma suppresses print's own newline; an
				# explicit u'\n' is appended unless skip_eol is set.
				print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
				sys.stdout.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise

	def to_stderr(self, message):
		"""Print message to stderr."""
		print >>sys.stderr, message.encode(preferredencoding())

	def fixed_template(self):
		"""Checks if the output template is fixed."""
		# A template with no %(...)s placeholders always yields the same
		# filename, which only works for a single download.
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		"""
		if message is not None:
			self.to_stderr(message)
		if not self.params.get('ignoreerrors', False):
			raise DownloadError(message)
		self._download_retcode = 1

	def slow_down(self, start_time, byte_counter):
		"""Sleep if the download speed is over the rate limit."""
		rate_limit = self.params.get('ratelimit', None)
		if rate_limit is None or byte_counter == 0:
			return
		now = time.time()
		elapsed = now - start_time
		if elapsed <= 0.0:
			return
		speed = float(byte_counter) / elapsed
		if speed > rate_limit:
			# Sleep just long enough to bring the average speed back
			# down to the configured limit.
			time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

	def report_destination(self, filename):
		"""Report destination filename."""
		self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress."""
		if self.params.get('noprogress', False):
			return
		# \r rewinds to the line start so the progress line updates in place.
		self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

	def report_resuming_byte(self, resume_len):
		"""Report attempt to resume at given byte."""
		self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)

	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_stdout(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# Fall back to a message without the (unencodable) filename.
			self.to_stdout(u'[download] The file has already been downloaded')

	def report_unable_to_resume(self):
		"""Report it was impossible to resume download."""
		self.to_stdout(u'[download] Unable to resume')

	def report_finish(self):
		"""Report download finished."""
		if self.params.get('noprogress', False):
			self.to_stdout(u'[download] Download completed')
		else:
			# The progress line was printed with skip_eol; just end it.
			self.to_stdout(u'')

	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor."""
		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# Verify URL if it's an HTTP one
			if info_dict['url'].startswith('http'):
				try:
					self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
				except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
					raise UnavailableFormatError

			# Forced printings
			if self.params.get('forcetitle', False):
				print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forceurl', False):
				print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')

			return

		try:
			template_dict = dict(info_dict)
			# Extra template fields: unix timestamp and download ordinal.
			template_dict['epoch'] = unicode(long(time.time()))
			template_dict['ord'] = unicode('%05d' % self._num_downloads)
			filename = self.params['outtmpl'] % template_dict
		except (ValueError, KeyError), err:
			self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
			return

		try:
			self.pmkdir(filename)
		except (OSError, IOError), err:
			self.trouble('ERROR: unable to create directories: %s' % str(err))
			return

		try:
			success = self._do_download(filename, info_dict['url'].encode('utf-8'))
		except (OSError, IOError), err:
			# Local I/O failure is treated as "this format did not work"
			# so callers can try the next available format.
			raise UnavailableFormatError
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self.trouble('ERROR: unable to download video data: %s' % str(err))
			return
		except (ContentTooShortError, ), err:
			self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
			return

		if success:
			try:
				self.post_process(filename, info_dict)
			except (PostProcessingError), err:
				self.trouble('ERROR: postprocessing: %s' % str(err))
				return

	def download(self, url_list):
		"""Download a given list of URLs."""
		if len(url_list) > 1 and self.fixed_template():
			raise SameFileError(self.params['outtmpl'])

		for url in url_list:
			suitable_found = False
			for ie in self._ies:
				# Go to next InfoExtractor if not suitable
				if not ie.suitable(url):
					continue

				# Suitable InfoExtractor found
				suitable_found = True

				# Extract information from URL and process it
				ie.extract(url)

				# Suitable InfoExtractor had been found; go to next URL
				break

			if not suitable_found:
				self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

		return self._download_retcode

	def post_process(self, filename, ie_info):
		"""Run the postprocessing chain on the given file."""
		info = dict(ie_info)
		info['filepath'] = filename
		for pp in self._pps:
			info = pp.run(info)
			if info is None:
				# A postprocessor returning None stops the chain.
				break

	def _download_with_rtmpdump(self, filename, url):
		"""Download an RTMP stream by shelling out to the rtmpdump tool."""
		self.report_destination(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrumpted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		basic_args = ['rtmpdump', '-q', '-r', url, '-o', filename]
		# Boolean-indexed list: append resume flags only when continuedl is set.
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename), skip_eol=True)
			time.sleep(2.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
		if retval == 0:
			self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
			return True
		else:
			self.trouble('\nERROR: rtmpdump exited with code %d' % retval)
			return False

	def _do_download(self, filename, url):
		"""Download url to filename over HTTP, resuming when possible."""
		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url)

		stream = None
		open_mode = 'wb'
		basic_request = urllib2.Request(url, None, std_headers)
		request = urllib2.Request(url, None, std_headers)

		# Establish possible resume length
		if os.path.isfile(filename):
			resume_len = os.path.getsize(filename)
		else:
			resume_len = 0

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range','bytes=%d-' % resume_len)
			open_mode = 'ab'

		# Establish connection
		try:
			data = urllib2.urlopen(request)
		except (urllib2.HTTPError, ), err:
			if err.code != 416: # 416 is 'Requested range not satisfiable'
				raise
			# Unable to resume
			data = urllib2.urlopen(basic_request)
			content_length = data.info()['Content-Length']

			if content_length is not None and long(content_length) == resume_len:
				# Because the file had already been fully downloaded
				self.report_file_already_downloaded(filename)
				return True
			else:
				# Because the server didn't let us
				self.report_unable_to_resume()
				open_mode = 'wb'

		data_len = data.info().get('Content-length', None)
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			data_block_len = len(data_block)
			if data_block_len == 0:
				break
			byte_counter += data_block_len

			# Open file just in time
			if stream is None:
				try:
					(stream, filename) = sanitize_open(filename, open_mode)
					self.report_destination(filename)
					self._num_downloads += 1
				except (OSError, IOError), err:
					self.trouble('ERROR: unable to open for writing: %s' % str(err))
					return False
			stream.write(data_block)
			block_size = self.best_block_size(after - before, data_block_len)

			# Progress message
			percent_str = self.calc_percent(byte_counter, data_len)
			eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
			speed_str = self.calc_speed(start, time.time(), byte_counter)
			self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter)

		self.report_finish()
		# data_len is still the header string here; compare via str().
		if data_len is not None and str(byte_counter) != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		return True
581
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor takes a URL and pulls out everything the
	FileDownloader needs to know about the video (or videos) behind it:
	the real media URL, the title and its simplified form, the uploader
	and so on. The result is a dictionary handed to the FileDownloader,
	which may then download the video to disk, among other outcomes.
	Every dictionary must provide these fields:

	id: Video identifier.
	url: Final video URL.
	uploader: Nickname of the video uploader.
	title: Literal title.
	stitle: Simplified title.
	ext: Video filename extension.
	format: Video format.

	Concrete extractors override _real_initialize(), _real_extract()
	and the suitable() static method, and are normally instantiated and
	registered with the main downloader.
	"""

	# Whether _real_initialize() has already run for this instance.
	_ready = False
	# The FileDownloader in charge of this extractor, if any.
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		return False

	def initialize(self):
		"""Initializes an instance (authentication, etc)."""
		# Guard clause: only ever initialize once per instance.
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
643
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Group 1 matches the optional URL prefix; group 2 is the video id.
	_VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?[\?#](?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
	_LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	_NETRC_MACHINE = 'youtube'
	_available_formats = ['37', '22', '35', '18', '34', '5', '17', '13', None] # listed in order of priority for -b flag
	# Maps a format code to its filename extension; anything else is 'flv'.
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
	}

	@staticmethod
	def suitable(url):
		"""Return True if this IE can handle the given URL."""
		return (re.match(YoutubeIE._VALID_URL, url) is not None)

	def report_lang(self):
		"""Report attempt to set language."""
		self._downloader.to_stdout(u'[youtube] Setting language')

	def report_login(self):
		"""Report attempt to log in."""
		self._downloader.to_stdout(u'[youtube] Logging in')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_stdout(u'[youtube] Confirming age')

	def report_video_info_webpage_download(self, video_id):
		"""Report attempt to download video info webpage."""
		self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)

	def report_information_extraction(self, video_id):
		"""Report attempt to extract video information."""
		self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)

	def report_unavailable_format(self, video_id, format):
		"""Report that the requested format is not available."""
		self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))

	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol."""
		self._downloader.to_stdout(u'[youtube] RTMP download detected')

	def _real_initialize(self):
		"""Set language and, when credentials are available, log in and confirm age."""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language
		request = urllib2.Request(self._LANG_URL, None, std_headers)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
			'current_form': 'loginForm',
			'next': '/',
			'action_login': 'Log In',
			'username': username,
			'password': password,
		}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present, the credentials failed.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
			'next_url': '/',
			'action_confirm': 'Confirm',
		}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract the real video URL and metadata, walking down the
		format priority list when best-quality or all-formats is requested."""
		# Extract video id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Downloader parameters
		best_quality = False
		all_formats = False
		format_param = None
		quality_index = 0
		if self._downloader is not None:
			params = self._downloader.params
			format_param = params.get('format', None)
			# '0' means "best quality", '-1' means "all formats".
			if format_param == '0':
				format_param = self._available_formats[quality_index]
				best_quality = True
			elif format_param == '-1':
				format_param = self._available_formats[quality_index]
				all_formats = True

		# Each iteration tries one format; the loop advances through
		# _available_formats when a format turns out to be unavailable.
		while True:
			# Extension
			video_extension = self._video_extensions.get(format_param, 'flv')

			# Get video info
			self.report_video_info_webpage_download(video_id)
			# Try several 'el' values; the first that yields a token wins.
			for el_type in ['embedded', 'detailpage', 'vevo']:
				video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s&el=%s&ps=default&eurl=&gl=US&hl=en'
						% (video_id, el_type))
				request = urllib2.Request(video_info_url, None, std_headers)
				try:
					video_info_webpage = urllib2.urlopen(request).read()
					video_info = parse_qs(video_info_webpage)
					if 'token' in video_info:
						break
				except (urllib2.URLError, httplib.HTTPException, socket.error), err:
					self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
					return
			self.report_information_extraction(video_id)

			# "t" param
			if 'token' not in video_info:
				# Attempt to see if YouTube has issued an error message
				if 'reason' not in video_info:
					self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
					# Dump the raw response for later bug reporting.
					stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
					stream.write(video_info_webpage)
					stream.close()
				else:
					reason = urllib.unquote_plus(video_info['reason'][0])
					self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
				return
			token = urllib.unquote_plus(video_info['token'][0])
			video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
			if format_param is not None:
				video_real_url = '%s&fmt=%s' % (video_real_url, format_param)

			# Check possible RTMP download
			if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
				self.report_rtmp_download()
				video_real_url = video_info['conn'][0]

			# uploader
			if 'author' not in video_info:
				self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
				return
			video_uploader = urllib.unquote_plus(video_info['author'][0])

			# title
			if 'title' not in video_info:
				self._downloader.trouble(u'ERROR: unable to extract video title')
				return
			video_title = urllib.unquote_plus(video_info['title'][0])
			video_title = video_title.decode('utf-8')
			video_title = sanitize_title(video_title)

			# simplified title
			simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
			simple_title = simple_title.strip(ur'_')

			try:
				# Process video information
				self._downloader.process_info({
					'id': video_id.decode('utf-8'),
					'url': video_real_url.decode('utf-8'),
					'uploader': video_uploader.decode('utf-8'),
					'title': video_title,
					'stitle': simple_title,
					'ext': video_extension.decode('utf-8'),
					'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
				})

				if all_formats:
					if quality_index == len(self._available_formats) - 1:
						# None left to get
						return
					else:
						quality_index += 1
						format_param = self._available_formats[quality_index]
						# The trailing None sentinel means "no more formats".
						if format_param == None:
							return
						continue

				return

			except UnavailableFormatError, err:
				if best_quality or all_formats:
					if quality_index == len(self._available_formats) - 1:
						# I don't ever expect this to happen
						if not all_formats:
							self._downloader.trouble(u'ERROR: no known formats available for video')
						return
					else:
						# Fall back to the next format in priority order.
						self.report_unavailable_format(video_id, format_param)
						quality_index += 1
						format_param = self._available_formats[quality_index]
						if format_param == None:
							return
						continue
				else:
					self._downloader.trouble('ERROR: format not available for video')
					return
887
888
889 class MetacafeIE(InfoExtractor):
890 """Information Extractor for metacafe.com."""
891
892 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
893 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
894 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
895 _youtube_ie = None
896
897 def __init__(self, youtube_ie, downloader=None):
898 InfoExtractor.__init__(self, downloader)
899 self._youtube_ie = youtube_ie
900
901 @staticmethod
902 def suitable(url):
903 return (re.match(MetacafeIE._VALID_URL, url) is not None)
904
905 def report_disclaimer(self):
906 """Report disclaimer retrieval."""
907 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
908
909 def report_age_confirmation(self):
910 """Report attempt to confirm age."""
911 self._downloader.to_stdout(u'[metacafe] Confirming age')
912
913 def report_download_webpage(self, video_id):
914 """Report webpage download."""
915 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
916
917 def report_extraction(self, video_id):
918 """Report information extraction."""
919 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
920
921 def _real_initialize(self):
922 # Retrieve disclaimer
923 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
924 try:
925 self.report_disclaimer()
926 disclaimer = urllib2.urlopen(request).read()
927 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
928 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
929 return
930
931 # Confirm age
932 disclaimer_form = {
933 'filters': '0',
934 'submit': "Continue - I'm over 18",
935 }
936 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
937 try:
938 self.report_age_confirmation()
939 disclaimer = urllib2.urlopen(request).read()
940 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
941 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
942 return
943
944 def _real_extract(self, url):
945 # Extract id and simplified title from URL
946 mobj = re.match(self._VALID_URL, url)
947 if mobj is None:
948 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
949 return
950
951 video_id = mobj.group(1)
952
953 # Check if video comes from YouTube
954 mobj2 = re.match(r'^yt-(.*)$', video_id)
955 if mobj2 is not None:
956 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
957 return
958
959 simple_title = mobj.group(2).decode('utf-8')
960 video_extension = 'flv'
961
962 # Retrieve video webpage to extract further information
963 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
964 try:
965 self.report_download_webpage(video_id)
966 webpage = urllib2.urlopen(request).read()
967 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
968 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
969 return
970
971 # Extract URL, uploader and title from webpage
972 self.report_extraction(video_id)
973 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
974 if mobj is None:
975 self._downloader.trouble(u'ERROR: unable to extract media URL')
976 return
977 mediaURL = urllib.unquote(mobj.group(1))
978
979 #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
980 #if mobj is None:
981 # self._downloader.trouble(u'ERROR: unable to extract gdaKey')
982 # return
983 #gdaKey = mobj.group(1)
984 #
985 #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
986
987 video_url = mediaURL
988
989 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
990 if mobj is None:
991 self._downloader.trouble(u'ERROR: unable to extract title')
992 return
993 video_title = mobj.group(1).decode('utf-8')
994 video_title = sanitize_title(video_title)
995
996 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
997 if mobj is None:
998 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
999 return
1000 video_uploader = mobj.group(1)
1001
1002 try:
1003 # Process video information
1004 self._downloader.process_info({
1005 'id': video_id.decode('utf-8'),
1006 'url': video_url.decode('utf-8'),
1007 'uploader': video_uploader.decode('utf-8'),
1008 'title': video_title,
1009 'stitle': simple_title,
1010 'ext': video_extension.decode('utf-8'),
1011 'format': u'NA',
1012 })
1013 except UnavailableFormatError:
1014 self._downloader.trouble(u'ERROR: format not available for video')
1015
1016
1017 class GoogleIE(InfoExtractor):
1018 """Information extractor for video.google.com."""
1019
1020 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1021
1022 def __init__(self, downloader=None):
1023 InfoExtractor.__init__(self, downloader)
1024
1025 @staticmethod
1026 def suitable(url):
1027 return (re.match(GoogleIE._VALID_URL, url) is not None)
1028
1029 def report_download_webpage(self, video_id):
1030 """Report webpage download."""
1031 self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
1032
1033 def report_extraction(self, video_id):
1034 """Report information extraction."""
1035 self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
1036
1037 def _real_initialize(self):
1038 return
1039
1040 def _real_extract(self, url):
1041 # Extract id from URL
1042 mobj = re.match(self._VALID_URL, url)
1043 if mobj is None:
1044 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1045 return
1046
1047 video_id = mobj.group(1)
1048
1049 video_extension = 'mp4'
1050
1051 # Retrieve video webpage to extract further information
1052 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1053 try:
1054 self.report_download_webpage(video_id)
1055 webpage = urllib2.urlopen(request).read()
1056 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1057 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1058 return
1059
1060 # Extract URL, uploader, and title from webpage
1061 self.report_extraction(video_id)
1062 mobj = re.search(r"download_url:'([^']+)'", webpage)
1063 if mobj is None:
1064 video_extension = 'flv'
1065 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1066 if mobj is None:
1067 self._downloader.trouble(u'ERROR: unable to extract media URL')
1068 return
1069 mediaURL = urllib.unquote(mobj.group(1))
1070 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1071 mediaURL = mediaURL.replace('\\x26', '\x26')
1072
1073 video_url = mediaURL
1074
1075 mobj = re.search(r'<title>(.*)</title>', webpage)
1076 if mobj is None:
1077 self._downloader.trouble(u'ERROR: unable to extract title')
1078 return
1079 video_title = mobj.group(1).decode('utf-8')
1080 video_title = sanitize_title(video_title)
1081 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1082
1083 try:
1084 # Process video information
1085 self._downloader.process_info({
1086 'id': video_id.decode('utf-8'),
1087 'url': video_url.decode('utf-8'),
1088 'uploader': u'NA',
1089 'title': video_title,
1090 'stitle': simple_title,
1091 'ext': video_extension.decode('utf-8'),
1092 'format': u'NA',
1093 })
1094 except UnavailableFormatError:
1095 self._downloader.trouble(u'ERROR: format not available for video')
1096
1097
1098 class PhotobucketIE(InfoExtractor):
1099 """Information extractor for photobucket.com."""
1100
1101 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1102
1103 def __init__(self, downloader=None):
1104 InfoExtractor.__init__(self, downloader)
1105
1106 @staticmethod
1107 def suitable(url):
1108 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1109
1110 def report_download_webpage(self, video_id):
1111 """Report webpage download."""
1112 self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1113
1114 def report_extraction(self, video_id):
1115 """Report information extraction."""
1116 self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
1117
1118 def _real_initialize(self):
1119 return
1120
1121 def _real_extract(self, url):
1122 # Extract id from URL
1123 mobj = re.match(self._VALID_URL, url)
1124 if mobj is None:
1125 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1126 return
1127
1128 video_id = mobj.group(1)
1129
1130 video_extension = 'flv'
1131
1132 # Retrieve video webpage to extract further information
1133 request = urllib2.Request(url)
1134 try:
1135 self.report_download_webpage(video_id)
1136 webpage = urllib2.urlopen(request).read()
1137 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1138 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1139 return
1140
1141 # Extract URL, uploader, and title from webpage
1142 self.report_extraction(video_id)
1143 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1144 if mobj is None:
1145 self._downloader.trouble(u'ERROR: unable to extract media URL')
1146 return
1147 mediaURL = urllib.unquote(mobj.group(1))
1148
1149 video_url = mediaURL
1150
1151 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1152 if mobj is None:
1153 self._downloader.trouble(u'ERROR: unable to extract title')
1154 return
1155 video_title = mobj.group(1).decode('utf-8')
1156 video_title = sanitize_title(video_title)
1157 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1158
1159 video_uploader = mobj.group(2).decode('utf-8')
1160
1161 try:
1162 # Process video information
1163 self._downloader.process_info({
1164 'id': video_id.decode('utf-8'),
1165 'url': video_url.decode('utf-8'),
1166 'uploader': video_uploader,
1167 'title': video_title,
1168 'stitle': simple_title,
1169 'ext': video_extension.decode('utf-8'),
1170 'format': u'NA',
1171 })
1172 except UnavailableFormatError:
1173 self._downloader.trouble(u'ERROR: format not available for video')
1174
1175
1176 class YahooIE(InfoExtractor):
1177 """Information extractor for video.yahoo.com."""
1178
1179 # _VALID_URL matches all Yahoo! Video URLs
1180 # _VPAGE_URL matches only the extractable '/watch/' URLs
1181 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1182 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1183
1184 def __init__(self, downloader=None):
1185 InfoExtractor.__init__(self, downloader)
1186
1187 @staticmethod
1188 def suitable(url):
1189 return (re.match(YahooIE._VALID_URL, url) is not None)
1190
1191 def report_download_webpage(self, video_id):
1192 """Report webpage download."""
1193 self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)
1194
1195 def report_extraction(self, video_id):
1196 """Report information extraction."""
1197 self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)
1198
1199 def _real_initialize(self):
1200 return
1201
1202 def _real_extract(self, url):
1203 # Extract ID from URL
1204 mobj = re.match(self._VALID_URL, url)
1205 if mobj is None:
1206 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1207 return
1208
1209 video_id = mobj.group(2)
1210 video_extension = 'flv'
1211
1212 # Rewrite valid but non-extractable URLs as
1213 # extractable English language /watch/ URLs
1214 if re.match(self._VPAGE_URL, url) is None:
1215 request = urllib2.Request(url)
1216 try:
1217 webpage = urllib2.urlopen(request).read()
1218 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1219 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1220 return
1221
1222 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1223 if mobj is None:
1224 self._downloader.trouble(u'ERROR: Unable to extract id field')
1225 return
1226 yahoo_id = mobj.group(1)
1227
1228 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1229 if mobj is None:
1230 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1231 return
1232 yahoo_vid = mobj.group(1)
1233
1234 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1235 return self._real_extract(url)
1236
1237 # Retrieve video webpage to extract further information
1238 request = urllib2.Request(url)
1239 try:
1240 self.report_download_webpage(video_id)
1241 webpage = urllib2.urlopen(request).read()
1242 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1243 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1244 return
1245
1246 # Extract uploader and title from webpage
1247 self.report_extraction(video_id)
1248 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1249 if mobj is None:
1250 self._downloader.trouble(u'ERROR: unable to extract video title')
1251 return
1252 video_title = mobj.group(1).decode('utf-8')
1253 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1254
1255 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1256 if mobj is None:
1257 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1258 return
1259 video_uploader = mobj.group(1).decode('utf-8')
1260
1261 # Extract video height and width
1262 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1263 if mobj is None:
1264 self._downloader.trouble(u'ERROR: unable to extract video height')
1265 return
1266 yv_video_height = mobj.group(1)
1267
1268 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1269 if mobj is None:
1270 self._downloader.trouble(u'ERROR: unable to extract video width')
1271 return
1272 yv_video_width = mobj.group(1)
1273
1274 # Retrieve video playlist to extract media URL
1275 # I'm not completely sure what all these options are, but we
1276 # seem to need most of them, otherwise the server sends a 401.
1277 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1278 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1279 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1280 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1281 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1282 try:
1283 self.report_download_webpage(video_id)
1284 webpage = urllib2.urlopen(request).read()
1285 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1286 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1287 return
1288
1289 # Extract media URL from playlist XML
1290 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1291 if mobj is None:
1292 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1293 return
1294 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1295 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1296
1297 try:
1298 # Process video information
1299 self._downloader.process_info({
1300 'id': video_id.decode('utf-8'),
1301 'url': video_url,
1302 'uploader': video_uploader,
1303 'title': video_title,
1304 'stitle': simple_title,
1305 'ext': video_extension.decode('utf-8'),
1306 })
1307 except UnavailableFormatError:
1308 self._downloader.trouble(u'ERROR: format not available for video')
1309
1310
1311 class GenericIE(InfoExtractor):
1312 """Generic last-resort information extractor."""
1313
1314 def __init__(self, downloader=None):
1315 InfoExtractor.__init__(self, downloader)
1316
1317 @staticmethod
1318 def suitable(url):
1319 return True
1320
1321 def report_download_webpage(self, video_id):
1322 """Report webpage download."""
1323 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1324 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1325
1326 def report_extraction(self, video_id):
1327 """Report information extraction."""
1328 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1329
1330 def _real_initialize(self):
1331 return
1332
1333 def _real_extract(self, url):
1334 video_id = url.split('/')[-1]
1335 request = urllib2.Request(url)
1336 try:
1337 self.report_download_webpage(video_id)
1338 webpage = urllib2.urlopen(request).read()
1339 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1340 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1341 return
1342 except ValueError, err:
1343 # since this is the last-resort InfoExtractor, if
1344 # this error is thrown, it'll be thrown here
1345 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1346 return
1347
1348 # Start with something easy: JW Player in SWFObject
1349 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1350 if mobj is None:
1351 # Broaden the search a little bit
1352 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1353 if mobj is None:
1354 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1355 return
1356
1357 # It's possible that one of the regexes
1358 # matched, but returned an empty group:
1359 if mobj.group(1) is None:
1360 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1361 return
1362
1363 video_url = urllib.unquote(mobj.group(1))
1364 video_id = os.path.basename(video_url)
1365
1366 # here's a fun little line of code for you:
1367 video_extension = os.path.splitext(video_id)[1][1:]
1368 video_id = os.path.splitext(video_id)[0]
1369
1370 # it's tempting to parse this further, but you would
1371 # have to take into account all the variations like
1372 # Video Title - Site Name
1373 # Site Name | Video Title
1374 # Video Title - Tagline | Site Name
1375 # and so on and so forth; it's just not practical
1376 mobj = re.search(r'<title>(.*)</title>', webpage)
1377 if mobj is None:
1378 self._downloader.trouble(u'ERROR: unable to extract title')
1379 return
1380 video_title = mobj.group(1).decode('utf-8')
1381 video_title = sanitize_title(video_title)
1382 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1383
1384 # video uploader is domain name
1385 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1386 if mobj is None:
1387 self._downloader.trouble(u'ERROR: unable to extract title')
1388 return
1389 video_uploader = mobj.group(1).decode('utf-8')
1390
1391 try:
1392 # Process video information
1393 self._downloader.process_info({
1394 'id': video_id.decode('utf-8'),
1395 'url': video_url.decode('utf-8'),
1396 'uploader': video_uploader,
1397 'title': video_title,
1398 'stitle': simple_title,
1399 'ext': video_extension.decode('utf-8'),
1400 'format': u'NA',
1401 })
1402 except UnavailableFormatError:
1403 self._downloader.trouble(u'ERROR: format not available for video')
1404
1405
1406 class YoutubeSearchIE(InfoExtractor):
1407 """Information Extractor for YouTube search queries."""
1408 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1409 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1410 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1411 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1412 _youtube_ie = None
1413 _max_youtube_results = 1000
1414
1415 def __init__(self, youtube_ie, downloader=None):
1416 InfoExtractor.__init__(self, downloader)
1417 self._youtube_ie = youtube_ie
1418
1419 @staticmethod
1420 def suitable(url):
1421 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1422
1423 def report_download_page(self, query, pagenum):
1424 """Report attempt to download playlist page with given number."""
1425 query = query.decode(preferredencoding())
1426 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1427
1428 def _real_initialize(self):
1429 self._youtube_ie.initialize()
1430
1431 def _real_extract(self, query):
1432 mobj = re.match(self._VALID_QUERY, query)
1433 if mobj is None:
1434 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1435 return
1436
1437 prefix, query = query.split(':')
1438 prefix = prefix[8:]
1439 query = query.encode('utf-8')
1440 if prefix == '':
1441 self._download_n_results(query, 1)
1442 return
1443 elif prefix == 'all':
1444 self._download_n_results(query, self._max_youtube_results)
1445 return
1446 else:
1447 try:
1448 n = long(prefix)
1449 if n <= 0:
1450 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1451 return
1452 elif n > self._max_youtube_results:
1453 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1454 n = self._max_youtube_results
1455 self._download_n_results(query, n)
1456 return
1457 except ValueError: # parsing prefix as integer fails
1458 self._download_n_results(query, 1)
1459 return
1460
1461 def _download_n_results(self, query, n):
1462 """Downloads a specified number of results for a query"""
1463
1464 video_ids = []
1465 already_seen = set()
1466 pagenum = 1
1467
1468 while True:
1469 self.report_download_page(query, pagenum)
1470 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1471 request = urllib2.Request(result_url, None, std_headers)
1472 try:
1473 page = urllib2.urlopen(request).read()
1474 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1475 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1476 return
1477
1478 # Extract video identifiers
1479 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1480 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1481 if video_id not in already_seen:
1482 video_ids.append(video_id)
1483 already_seen.add(video_id)
1484 if len(video_ids) == n:
1485 # Specified n videos reached
1486 for id in video_ids:
1487 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1488 return
1489
1490 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1491 for id in video_ids:
1492 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1493 return
1494
1495 pagenum = pagenum + 1
1496
1497 class YoutubePlaylistIE(InfoExtractor):
1498 """Information Extractor for YouTube playlists."""
1499
1500 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1501 _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1502 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1503 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1504 _youtube_ie = None
1505
1506 def __init__(self, youtube_ie, downloader=None):
1507 InfoExtractor.__init__(self, downloader)
1508 self._youtube_ie = youtube_ie
1509
1510 @staticmethod
1511 def suitable(url):
1512 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1513
1514 def report_download_page(self, playlist_id, pagenum):
1515 """Report attempt to download playlist page with given number."""
1516 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1517
1518 def _real_initialize(self):
1519 self._youtube_ie.initialize()
1520
1521 def _real_extract(self, url):
1522 # Extract playlist id
1523 mobj = re.match(self._VALID_URL, url)
1524 if mobj is None:
1525 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1526 return
1527
1528 # Download playlist pages
1529 playlist_id = mobj.group(1)
1530 video_ids = []
1531 pagenum = 1
1532
1533 while True:
1534 self.report_download_page(playlist_id, pagenum)
1535 request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1536 try:
1537 page = urllib2.urlopen(request).read()
1538 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1539 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1540 return
1541
1542 # Extract video identifiers
1543 ids_in_page = []
1544 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1545 if mobj.group(1) not in ids_in_page:
1546 ids_in_page.append(mobj.group(1))
1547 video_ids.extend(ids_in_page)
1548
1549 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1550 break
1551 pagenum = pagenum + 1
1552
1553 for id in video_ids:
1554 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1555 return
1556
1557 class YoutubeUserIE(InfoExtractor):
1558 """Information Extractor for YouTube users."""
1559
1560 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1561 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1562 _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1563 _youtube_ie = None
1564
1565 def __init__(self, youtube_ie, downloader=None):
1566 InfoExtractor.__init__(self, downloader)
1567 self._youtube_ie = youtube_ie
1568
1569 @staticmethod
1570 def suitable(url):
1571 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1572
1573 def report_download_page(self, username):
1574 """Report attempt to download user page."""
1575 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1576
1577 def _real_initialize(self):
1578 self._youtube_ie.initialize()
1579
1580 def _real_extract(self, url):
1581 # Extract username
1582 mobj = re.match(self._VALID_URL, url)
1583 if mobj is None:
1584 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1585 return
1586
1587 # Download user page
1588 username = mobj.group(1)
1589 video_ids = []
1590 pagenum = 1
1591
1592 self.report_download_page(username)
1593 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1594 try:
1595 page = urllib2.urlopen(request).read()
1596 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1597 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1598 return
1599
1600 # Extract video identifiers
1601 ids_in_page = []
1602
1603 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1604 if mobj.group(1) not in ids_in_page:
1605 ids_in_page.append(mobj.group(1))
1606 video_ids.extend(ids_in_page)
1607
1608 for id in video_ids:
1609 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1610 return
1611
class PostProcessor(object):
	"""Base class for post processors.

	A PostProcessor is registered with a downloader through its
	add_post_processor() method. After every successful download the
	downloader walks its chain of post processors, calling run() on
	each one: the first receives the downloader's information
	dictionary (extended with a "filepath" key for the downloaded
	file), and every later one receives whatever the previous run()
	returned.

	Returning None from run() stops the chain; returning an
	information dictionary (possibly with changed fields) passes it on
	to the next processor. run() may also raise PostProcessingError,
	which the calling downloader takes into account.

	Like InfoExtractor, this class takes part in a "mutual
	registration" handshake with its downloader.
	"""

	# Downloader this post processor is attached to (set lazily).
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach this post processor to the given downloader."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		The default implementation is the identity: it forwards the
		information dictionary unchanged to the next processor in the
		chain. Subclasses override this to do real work.
		"""
		return information
1657
1658 ### MAIN PROGRAM ###
1659 if __name__ == '__main__':
1660 try:
1661 # Modules needed only when running the main program
1662 import getpass
1663 import optparse
1664
1665 # Function to update the program file with the latest version from bitbucket.org
1666 def update_self(downloader, filename):
1667 # Note: downloader only used for options
1668 if not os.access (filename, os.W_OK):
1669 sys.exit('ERROR: no write permissions on %s' % filename)
1670
1671 downloader.to_stdout('Updating to latest stable version...')
1672 latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
1673 latest_version = urllib.urlopen(latest_url).read().strip()
1674 prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
1675 newcontent = urllib.urlopen(prog_url).read()
1676 stream = open(filename, 'w')
1677 stream.write(newcontent)
1678 stream.close()
1679 downloader.to_stdout('Updated to version %s' % latest_version)
1680
1681 # General configuration
1682 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
1683 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
1684 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
1685
1686 # Parse command line
1687 parser = optparse.OptionParser(
1688 usage='Usage: %prog [options] url...',
1689 version='2010.04.04',
1690 conflict_handler='resolve',
1691 )
1692
1693 parser.add_option('-h', '--help',
1694 action='help', help='print this help text and exit')
1695 parser.add_option('-v', '--version',
1696 action='version', help='print program version and exit')
1697 parser.add_option('-U', '--update',
1698 action='store_true', dest='update_self', help='update this program to latest stable version')
1699 parser.add_option('-i', '--ignore-errors',
1700 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
1701 parser.add_option('-r', '--rate-limit',
1702 dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
1703
1704 authentication = optparse.OptionGroup(parser, 'Authentication Options')
1705 authentication.add_option('-u', '--username',
1706 dest='username', metavar='UN', help='account username')
1707 authentication.add_option('-p', '--password',
1708 dest='password', metavar='PW', help='account password')
1709 authentication.add_option('-n', '--netrc',
1710 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
1711 parser.add_option_group(authentication)
1712
1713 video_format = optparse.OptionGroup(parser, 'Video Format Options')
1714 video_format.add_option('-f', '--format',
1715 action='store', dest='format', metavar='FMT', help='video format code')
1716 video_format.add_option('-b', '--best-quality',
1717 action='store_const', dest='format', help='download the best quality video possible', const='0')
1718 video_format.add_option('-m', '--mobile-version',
1719 action='store_const', dest='format', help='alias for -f 17', const='17')
1720 video_format.add_option('-d', '--high-def',
1721 action='store_const', dest='format', help='alias for -f 22', const='22')
1722 video_format.add_option('--all-formats',
1723 action='store_const', dest='format', help='download all available video formats', const='-1')
1724 parser.add_option_group(video_format)
1725
1726 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
1727 verbosity.add_option('-q', '--quiet',
1728 action='store_true', dest='quiet', help='activates quiet mode', default=False)
1729 verbosity.add_option('-s', '--simulate',
1730 action='store_true', dest='simulate', help='do not download video', default=False)
1731 verbosity.add_option('-g', '--get-url',
1732 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
1733 verbosity.add_option('-e', '--get-title',
1734 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
1735 verbosity.add_option('--no-progress',
1736 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
1737 parser.add_option_group(verbosity)
1738
1739 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
1740 filesystem.add_option('-t', '--title',
1741 action='store_true', dest='usetitle', help='use title in file name', default=False)
1742 filesystem.add_option('-l', '--literal',
1743 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
1744 filesystem.add_option('-o', '--output',
1745 dest='outtmpl', metavar='TPL', help='output filename template')
1746 filesystem.add_option('-a', '--batch-file',
1747 dest='batchfile', metavar='F', help='file containing URLs to download')
1748 filesystem.add_option('-w', '--no-overwrites',
1749 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
1750 filesystem.add_option('-c', '--continue',
1751 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
1752 parser.add_option_group(filesystem)
1753
# Actual command-line parsing; everything above only declared the options.
(opts, args) = parser.parse_args()

# Batch file verification: each non-empty line of the batch file is treated
# as one URL and prepended to the URLs given on the command line.
batchurls = []
if opts.batchfile is not None:
	try:
		batchfd = open(opts.batchfile, 'r')
		try:
			batchurls = [x.strip() for x in batchfd.readlines()]
			batchurls = [x for x in batchurls if len(x) > 0]
		finally:
			# Close explicitly instead of leaking the handle until
			# garbage collection (the previous code never closed it).
			batchfd.close()
	except IOError:
		sys.exit(u'ERROR: batch file could not be read')
all_urls = batchurls + args
1766
1767 # Conflicting, missing and erroneous options
# parser.error() prints the message and terminates the process (optparse
# exits with status 2), so each failed check below ends the program.
1768 if opts.usenetrc and (opts.username is not None or opts.password is not None):
1769 parser.error(u'using .netrc conflicts with giving username/password')
1770 if opts.password is not None and opts.username is None:
1771 parser.error(u'account username missing')
1772 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
1773 parser.error(u'using output template conflicts with using title or literal title')
1774 if opts.usetitle and opts.useliteral:
1775 parser.error(u'using title conflicts with using literal title')
# Username given without a password: prompt interactively instead of failing.
# NOTE(review): getpass is not among the top-of-file imports visible in this
# excerpt; presumably it is imported earlier inside the __main__ block
# (like optparse, which is also used here) -- confirm.
1776 if opts.username is not None and opts.password is None:
1777 opts.password = getpass.getpass(u'Type account password and press return:')
# Convert the textual rate limit to a numeric value; parse_bytes signals an
# unparseable argument by returning None.
1778 if opts.ratelimit is not None:
1779 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
1780 if numeric_limit is None:
1781 parser.error(u'invalid rate limit specified')
1782 opts.ratelimit = numeric_limit
1783
1784 # Information extractors
# A single YoutubeIE instance is shared: the Metacafe, playlist, user and
# search extractors all receive it as a constructor argument (presumably to
# delegate the actual per-video extraction to it -- confirm in their defs).
1785 youtube_ie = YoutubeIE()
1786 metacafe_ie = MetacafeIE(youtube_ie)
1787 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
1788 youtube_user_ie = YoutubeUserIE(youtube_ie)
1789 youtube_search_ie = YoutubeSearchIE(youtube_ie)
1790 google_ie = GoogleIE()
1791 photobucket_ie = PhotobucketIE()
1792 yahoo_ie = YahooIE()
# GenericIE is the catch-all fallback; it is registered last (see below).
1793 generic_ie = GenericIE()
1794
1795 # File downloader
# Build the FileDownloader configuration dict from the parsed options.
1796 fd = FileDownloader({
1797 'usenetrc': opts.usenetrc,
1798 'username': opts.username,
1799 'password': opts.password,
# -g (print URL) and -e (print title) imply both quiet and simulate.
1800 'quiet': (opts.quiet or opts.geturl or opts.gettitle),
1801 'forceurl': opts.geturl,
1802 'forcetitle': opts.gettitle,
1803 'simulate': (opts.simulate or opts.geturl or opts.gettitle),
1804 'format': opts.format,
# Output filename template: the first truthy alternative of this or-chain
# wins.  An explicit -o template is decoded using the locale's preferred
# encoding; otherwise a default is chosen based on --all-formats ('-1',
# which adds %(format)s to the name) and -t / -l.  %(stitle)s vs %(title)s:
# presumably the sanitized vs literal title -- confirm against the
# extractors' result dicts.
1805 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
1806 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
1807 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
1808 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
1809 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
1810 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
1811 or u'%(id)s.%(ext)s'),
1812 'ignoreerrors': opts.ignoreerrors,
1813 'ratelimit': opts.ratelimit,
1814 'nooverwrites': opts.nooverwrites,
1815 'continuedl': opts.continue_dl,
1816 'noprogress': opts.noprogress,
1817 })
# Registration order matters: the specific YouTube matchers (search,
# playlist, user) are registered before the plain video extractor, and the
# generic fallback goes last (see the original comment below).
1818 fd.add_info_extractor(youtube_search_ie)
1819 fd.add_info_extractor(youtube_pl_ie)
1820 fd.add_info_extractor(youtube_user_ie)
1821 fd.add_info_extractor(metacafe_ie)
1822 fd.add_info_extractor(youtube_ie)
1823 fd.add_info_extractor(google_ie)
1824 fd.add_info_extractor(photobucket_ie)
1825 fd.add_info_extractor(yahoo_ie)
1826
1827 # This must come last since it's the
1828 # fallback if none of the others work
1829 fd.add_info_extractor(generic_ie)
1830
1831 # Update version
# Self-update first; downloads (if any URLs were given) still proceed after.
1832 if opts.update_self:
1833 update_self(fd, sys.argv[0])
1834
1835 # Maybe do nothing
# No URLs: --update-self alone is a valid invocation (plain sys.exit(),
# i.e. success); otherwise it is a usage error.
1836 if len(all_urls) < 1:
1837 if not opts.update_self:
1838 parser.error(u'you must provide at least one URL')
1839 else:
1840 sys.exit()
# Exit with whatever status fd.download() reports for the whole URL list.
1841 retcode = fd.download(all_urls)
1842 sys.exit(retcode)
1843
# Handlers for the try: that wraps this whole __main__ section (its start is
# above this excerpt).  sys.exit() with a unicode argument prints the
# message to stderr and exits with status 1.
# DownloadError: presumably already reported by the downloader, so only the
# nonzero status is needed here -- confirm where it is raised.
1844 except DownloadError:
1845 sys.exit(1)
1846 except SameFileError:
1847 sys.exit(u'ERROR: fixed output name but more than one file to download')
1848 except KeyboardInterrupt:
1849 sys.exit(u'\nERROR: Interrupted by user')