]> Raphaƫl G. Git Repositories - youtubedl/blob - youtube-dl
Imported Upstream version 2010.06.06
[youtubedl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
7 import htmlentitydefs
8 import httplib
9 import locale
10 import math
11 import netrc
12 import os
13 import os.path
14 import re
15 import socket
16 import string
17 import subprocess
18 import sys
19 import time
20 import urllib
21 import urllib2
22
23 # parse_qs was moved from the cgi module to the urlparse module recently.
24 try:
25 from urlparse import parse_qs
26 except ImportError:
27 from cgi import parse_qs
28
29 std_headers = {
30 'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6',
31 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
32 'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
33 'Accept-Language': 'en-us,en;q=0.5',
34 }
35
36 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
37
38 def preferredencoding():
39 """Get preferred encoding.
40
41 Returns the best encoding scheme for the system, based on
42 locale.getpreferredencoding() and some further tweaks.
43 """
44 def yield_preferredencoding():
45 try:
46 pref = locale.getpreferredencoding()
47 u'TEST'.encode(pref)
48 except:
49 pref = 'UTF-8'
50 while True:
51 yield pref
52 return yield_preferredencoding().next()
53
54 def htmlentity_transform(matchobj):
55 """Transforms an HTML entity to a Unicode character.
56
57 This function receives a match object and is intended to be used with
58 the re.sub() function.
59 """
60 entity = matchobj.group(1)
61
62 # Known non-numeric HTML entity
63 if entity in htmlentitydefs.name2codepoint:
64 return unichr(htmlentitydefs.name2codepoint[entity])
65
66 # Unicode character
67 mobj = re.match(ur'(?u)#(x?\d+)', entity)
68 if mobj is not None:
69 numstr = mobj.group(1)
70 if numstr.startswith(u'x'):
71 base = 16
72 numstr = u'0%s' % numstr
73 else:
74 base = 10
75 return unichr(long(numstr, base))
76
77 # Unknown entity in name, return its literal representation
78 return (u'&%s;' % entity)
79
80 def sanitize_title(utitle):
81 """Sanitizes a video title so it could be used as part of a filename."""
82 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
83 return utitle.replace(unicode(os.sep), u'%')
84
85 def sanitize_open(filename, open_mode):
86 """Try to open the given filename, and slightly tweak it if this fails.
87
88 Attempts to open the given filename. If this fails, it tries to change
89 the filename slightly, step by step, until it's either able to open it
90 or it fails and raises a final exception, like the standard open()
91 function.
92
93 It returns the tuple (stream, definitive_file_name).
94 """
95 try:
96 if filename == u'-':
97 return (sys.stdout, filename)
98 stream = open(filename, open_mode)
99 return (stream, filename)
100 except (IOError, OSError), err:
101 # In case of error, try to remove win32 forbidden chars
102 filename = re.sub(ur'[<>:"\|\?\*]', u'#', filename)
103
104 # An exception here should be caught in the caller
105 stream = open(filename, open_mode)
106 return (stream, filename)
107
108
109 class DownloadError(Exception):
110 """Download Error exception.
111
112 This exception may be thrown by FileDownloader objects if they are not
113 configured to continue on errors. They will contain the appropriate
114 error message.
115 """
116 pass
117
118 class SameFileError(Exception):
119 """Same File exception.
120
121 This exception will be thrown by FileDownloader objects if they detect
122 multiple files would have to be downloaded to the same file on disk.
123 """
124 pass
125
126 class PostProcessingError(Exception):
127 """Post Processing exception.
128
129 This exception may be raised by PostProcessor's .run() method to
130 indicate an error in the postprocessing task.
131 """
132 pass
133
134 class UnavailableFormatError(Exception):
135 """Unavailable Format exception.
136
137 This exception will be thrown when a video is requested
138 in a format that is not available for that video.
139 """
140 pass
141
142 class ContentTooShortError(Exception):
143 """Content Too Short exception.
144
145 This exception may be raised by FileDownloader objects when a file they
146 download is too small for what the server announced first, indicating
147 the connection was probably interrupted.
148 """
149 # Both in bytes
150 downloaded = None
151 expected = None
152
153 def __init__(self, downloaded, expected):
154 self.downloaded = downloaded
155 self.expected = expected
156
157 class FileDownloader(object):
158 """File Downloader class.
159
160 File downloader objects are the ones responsible of downloading the
161 actual video file and writing it to disk if the user has requested
162 it, among some other tasks. In most cases there should be one per
163 program. As, given a video URL, the downloader doesn't know how to
164 extract all the needed information, task that InfoExtractors do, it
165 has to pass the URL to one of them.
166
167 For this, file downloader objects have a method that allows
168 InfoExtractors to be registered in a given order. When it is passed
169 a URL, the file downloader handles it to the first InfoExtractor it
170 finds that reports being able to handle it. The InfoExtractor extracts
171 all the information about the video or videos the URL refers to, and
172 asks the FileDownloader to process the video information, possibly
173 downloading the video.
174
175 File downloaders accept a lot of parameters. In order not to saturate
176 the object constructor with arguments, it receives a dictionary of
177 options instead. These options are available through the params
178 attribute for the InfoExtractors to use. The FileDownloader also
179 registers itself as the downloader in charge for the InfoExtractors
180 that are added to it, so this is a "mutual registration".
181
182 Available options:
183
184 username: Username for authentication purposes.
185 password: Password for authentication purposes.
186 usenetrc: Use netrc for authentication instead.
187 quiet: Do not print messages to stdout.
188 forceurl: Force printing final URL.
189 forcetitle: Force printing title.
190 simulate: Do not download the video files.
191 format: Video format code.
192 outtmpl: Template for output names.
193 ignoreerrors: Do not stop on download errors.
194 ratelimit: Download speed limit, in bytes/sec.
195 nooverwrites: Prevent overwriting files.
196 retries: Number of times to retry for HTTP error 503
197 continuedl: Try to continue downloads if possible.
198 noprogress: Do not print the progress bar.
199 """
200
201 params = None
202 _ies = []
203 _pps = []
204 _download_retcode = None
205 _num_downloads = None
206
207 def __init__(self, params):
208 """Create a FileDownloader object with the given options."""
209 self._ies = []
210 self._pps = []
211 self._download_retcode = 0
212 self._num_downloads = 0
213 self.params = params
214
215 @staticmethod
216 def pmkdir(filename):
217 """Create directory components in filename. Similar to Unix "mkdir -p"."""
218 components = filename.split(os.sep)
219 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
220 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
221 for dir in aggregate:
222 if not os.path.exists(dir):
223 os.mkdir(dir)
224
225 @staticmethod
226 def format_bytes(bytes):
227 if bytes is None:
228 return 'N/A'
229 if type(bytes) is str:
230 bytes = float(bytes)
231 if bytes == 0.0:
232 exponent = 0
233 else:
234 exponent = long(math.log(bytes, 1024.0))
235 suffix = 'bkMGTPEZY'[exponent]
236 converted = float(bytes) / float(1024**exponent)
237 return '%.2f%s' % (converted, suffix)
238
239 @staticmethod
240 def calc_percent(byte_counter, data_len):
241 if data_len is None:
242 return '---.-%'
243 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
244
245 @staticmethod
246 def calc_eta(start, now, total, current):
247 if total is None:
248 return '--:--'
249 dif = now - start
250 if current == 0 or dif < 0.001: # One millisecond
251 return '--:--'
252 rate = float(current) / dif
253 eta = long((float(total) - float(current)) / rate)
254 (eta_mins, eta_secs) = divmod(eta, 60)
255 if eta_mins > 99:
256 return '--:--'
257 return '%02d:%02d' % (eta_mins, eta_secs)
258
259 @staticmethod
260 def calc_speed(start, now, bytes):
261 dif = now - start
262 if bytes == 0 or dif < 0.001: # One millisecond
263 return '%10s' % '---b/s'
264 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
265
266 @staticmethod
267 def best_block_size(elapsed_time, bytes):
268 new_min = max(bytes / 2.0, 1.0)
269 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
270 if elapsed_time < 0.001:
271 return long(new_max)
272 rate = bytes / elapsed_time
273 if rate > new_max:
274 return long(new_max)
275 if rate < new_min:
276 return long(new_min)
277 return long(rate)
278
279 @staticmethod
280 def parse_bytes(bytestr):
281 """Parse a string indicating a byte quantity into a long integer."""
282 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
283 if matchobj is None:
284 return None
285 number = float(matchobj.group(1))
286 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
287 return long(round(number * multiplier))
288
289 @staticmethod
290 def verify_url(url):
291 """Verify a URL is valid and data could be downloaded. Return real data URL."""
292 request = urllib2.Request(url, None, std_headers)
293 data = urllib2.urlopen(request)
294 data.read(1)
295 url = data.geturl()
296 data.close()
297 return url
298
299 def add_info_extractor(self, ie):
300 """Add an InfoExtractor object to the end of the list."""
301 self._ies.append(ie)
302 ie.set_downloader(self)
303
304 def add_post_processor(self, pp):
305 """Add a PostProcessor object to the end of the chain."""
306 self._pps.append(pp)
307 pp.set_downloader(self)
308
309 def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
310 """Print message to stdout if not in quiet mode."""
311 try:
312 if not self.params.get('quiet', False):
313 print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
314 sys.stdout.flush()
315 except (UnicodeEncodeError), err:
316 if not ignore_encoding_errors:
317 raise
318
319 def to_stderr(self, message):
320 """Print message to stderr."""
321 print >>sys.stderr, message.encode(preferredencoding())
322
323 def fixed_template(self):
324 """Checks if the output template is fixed."""
325 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
326
327 def trouble(self, message=None):
328 """Determine action to take when a download problem appears.
329
330 Depending on if the downloader has been configured to ignore
331 download errors or not, this method may throw an exception or
332 not when errors are found, after printing the message.
333 """
334 if message is not None:
335 self.to_stderr(message)
336 if not self.params.get('ignoreerrors', False):
337 raise DownloadError(message)
338 self._download_retcode = 1
339
340 def slow_down(self, start_time, byte_counter):
341 """Sleep if the download speed is over the rate limit."""
342 rate_limit = self.params.get('ratelimit', None)
343 if rate_limit is None or byte_counter == 0:
344 return
345 now = time.time()
346 elapsed = now - start_time
347 if elapsed <= 0.0:
348 return
349 speed = float(byte_counter) / elapsed
350 if speed > rate_limit:
351 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
352
353 def report_destination(self, filename):
354 """Report destination filename."""
355 self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
356
357 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
358 """Report download progress."""
359 if self.params.get('noprogress', False):
360 return
361 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
362 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
363
364 def report_resuming_byte(self, resume_len):
365 """Report attemtp to resume at given byte."""
366 self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)
367
368 def report_retry(self, count, retries):
369 """Report retry in case of HTTP error 503"""
370 self.to_stdout(u'[download] Got HTTP error 503. Retrying (attempt %d of %d)...' % (count, retries))
371
372 def report_file_already_downloaded(self, file_name):
373 """Report file has already been fully downloaded."""
374 try:
375 self.to_stdout(u'[download] %s has already been downloaded' % file_name)
376 except (UnicodeEncodeError), err:
377 self.to_stdout(u'[download] The file has already been downloaded')
378
379 def report_unable_to_resume(self):
380 """Report it was impossible to resume download."""
381 self.to_stdout(u'[download] Unable to resume')
382
383 def report_finish(self):
384 """Report download finished."""
385 if self.params.get('noprogress', False):
386 self.to_stdout(u'[download] Download completed')
387 else:
388 self.to_stdout(u'')
389
390 def process_info(self, info_dict):
391 """Process a single dictionary returned by an InfoExtractor."""
392 # Do nothing else if in simulate mode
393 if self.params.get('simulate', False):
394 # Verify URL if it's an HTTP one
395 if info_dict['url'].startswith('http'):
396 try:
397 self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
398 except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
399 raise UnavailableFormatError
400
401 # Forced printings
402 if self.params.get('forcetitle', False):
403 print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
404 if self.params.get('forceurl', False):
405 print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
406 if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
407 print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
408 if self.params.get('forcedescription', False) and 'description' in info_dict:
409 print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
410
411 return
412
413 try:
414 template_dict = dict(info_dict)
415 template_dict['epoch'] = unicode(long(time.time()))
416 template_dict['ord'] = unicode('%05d' % self._num_downloads)
417 filename = self.params['outtmpl'] % template_dict
418 except (ValueError, KeyError), err:
419 self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
420 if self.params.get('nooverwrites', False) and os.path.exists(filename):
421 self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
422 return
423
424 try:
425 self.pmkdir(filename)
426 except (OSError, IOError), err:
427 self.trouble('ERROR: unable to create directories: %s' % str(err))
428 return
429
430 try:
431 success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
432 except (OSError, IOError), err:
433 raise UnavailableFormatError
434 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
435 self.trouble('ERROR: unable to download video data: %s' % str(err))
436 return
437 except (ContentTooShortError, ), err:
438 self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
439 return
440
441 if success:
442 try:
443 self.post_process(filename, info_dict)
444 except (PostProcessingError), err:
445 self.trouble('ERROR: postprocessing: %s' % str(err))
446 return
447
448 def download(self, url_list):
449 """Download a given list of URLs."""
450 if len(url_list) > 1 and self.fixed_template():
451 raise SameFileError(self.params['outtmpl'])
452
453 for url in url_list:
454 suitable_found = False
455 for ie in self._ies:
456 # Go to next InfoExtractor if not suitable
457 if not ie.suitable(url):
458 continue
459
460 # Suitable InfoExtractor found
461 suitable_found = True
462
463 # Extract information from URL and process it
464 ie.extract(url)
465
466 # Suitable InfoExtractor had been found; go to next URL
467 break
468
469 if not suitable_found:
470 self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
471
472 return self._download_retcode
473
474 def post_process(self, filename, ie_info):
475 """Run the postprocessing chain on the given file."""
476 info = dict(ie_info)
477 info['filepath'] = filename
478 for pp in self._pps:
479 info = pp.run(info)
480 if info is None:
481 break
482
483 def _download_with_rtmpdump(self, filename, url, player_url):
484 self.report_destination(filename)
485
486 # Check for rtmpdump first
487 try:
488 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
489 except (OSError, IOError):
490 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
491 return False
492
493 # Download using rtmpdump. rtmpdump returns exit code 2 when
494 # the connection was interrumpted and resuming appears to be
495 # possible. This is part of rtmpdump's normal usage, AFAIK.
496 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', filename]
497 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
498 while retval == 2 or retval == 1:
499 prevsize = os.path.getsize(filename)
500 self.to_stdout(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
501 time.sleep(5.0) # This seems to be needed
502 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
503 cursize = os.path.getsize(filename)
504 if prevsize == cursize and retval == 1:
505 break
506 if retval == 0:
507 self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
508 return True
509 else:
510 self.trouble('\nERROR: rtmpdump exited with code %d' % retval)
511 return False
512
513 def _do_download(self, filename, url, player_url):
514 # Attempt to download using rtmpdump
515 if url.startswith('rtmp'):
516 return self._download_with_rtmpdump(filename, url, player_url)
517
518 stream = None
519 open_mode = 'wb'
520 basic_request = urllib2.Request(url, None, std_headers)
521 request = urllib2.Request(url, None, std_headers)
522
523 # Establish possible resume length
524 if os.path.isfile(filename):
525 resume_len = os.path.getsize(filename)
526 else:
527 resume_len = 0
528
529 # Request parameters in case of being able to resume
530 if self.params.get('continuedl', False) and resume_len != 0:
531 self.report_resuming_byte(resume_len)
532 request.add_header('Range','bytes=%d-' % resume_len)
533 open_mode = 'ab'
534
535 count = 0
536 retries = self.params.get('retries', 0)
537 while True:
538 # Establish connection
539 try:
540 data = urllib2.urlopen(request)
541 break
542 except (urllib2.HTTPError, ), err:
543 if err.code == 503:
544 # Retry in case of HTTP error 503
545 count += 1
546 if count <= retries:
547 self.report_retry(count, retries)
548 continue
549 if err.code != 416: # 416 is 'Requested range not satisfiable'
550 raise
551 # Unable to resume
552 data = urllib2.urlopen(basic_request)
553 content_length = data.info()['Content-Length']
554
555 if content_length is not None and long(content_length) == resume_len:
556 # Because the file had already been fully downloaded
557 self.report_file_already_downloaded(filename)
558 self._num_downloads += 1
559 return True
560 else:
561 # Because the server didn't let us
562 self.report_unable_to_resume()
563 open_mode = 'wb'
564
565 data_len = data.info().get('Content-length', None)
566 data_len_str = self.format_bytes(data_len)
567 byte_counter = 0
568 block_size = 1024
569 start = time.time()
570 while True:
571 # Download and write
572 before = time.time()
573 data_block = data.read(block_size)
574 after = time.time()
575 data_block_len = len(data_block)
576 if data_block_len == 0:
577 break
578 byte_counter += data_block_len
579
580 # Open file just in time
581 if stream is None:
582 try:
583 (stream, filename) = sanitize_open(filename, open_mode)
584 self.report_destination(filename)
585 self._num_downloads += 1
586 except (OSError, IOError), err:
587 self.trouble('ERROR: unable to open for writing: %s' % str(err))
588 return False
589 try:
590 stream.write(data_block)
591 except (IOError, OSError), err:
592 self.trouble('\nERROR: unable to write data: %s' % str(err))
593 block_size = self.best_block_size(after - before, data_block_len)
594
595 # Progress message
596 percent_str = self.calc_percent(byte_counter, data_len)
597 eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
598 speed_str = self.calc_speed(start, time.time(), byte_counter)
599 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
600
601 # Apply rate limit
602 self.slow_down(start, byte_counter)
603
604 self.report_finish()
605 if data_len is not None and str(byte_counter) != data_len:
606 raise ContentTooShortError(byte_counter, long(data_len))
607 return True
608
609 class InfoExtractor(object):
610 """Information Extractor class.
611
612 Information extractors are the classes that, given a URL, extract
613 information from the video (or videos) the URL refers to. This
614 information includes the real video URL, the video title and simplified
615 title, author and others. The information is stored in a dictionary
616 which is then passed to the FileDownloader. The FileDownloader
617 processes this information possibly downloading the video to the file
618 system, among other possible outcomes. The dictionaries must include
619 the following fields:
620
621 id: Video identifier.
622 url: Final video URL.
623 uploader: Nickname of the video uploader.
624 title: Literal title.
625 stitle: Simplified title.
626 ext: Video filename extension.
627 format: Video format.
628 player_url: SWF Player URL (may be None).
629
630 The following fields are optional. Their primary purpose is to allow
631 youtube-dl to serve as the backend for a video search function, such
632 as the one in youtube2mp3. They are only used when their respective
633 forced printing functions are called:
634
635 thumbnail: Full URL to a video thumbnail image.
636 description: One-line video description.
637
638 Subclasses of this one should re-define the _real_initialize() and
639 _real_extract() methods, as well as the suitable() static method.
640 Probably, they should also be instantiated and added to the main
641 downloader.
642 """
643
644 _ready = False
645 _downloader = None
646
647 def __init__(self, downloader=None):
648 """Constructor. Receives an optional downloader."""
649 self._ready = False
650 self.set_downloader(downloader)
651
652 @staticmethod
653 def suitable(url):
654 """Receives a URL and returns True if suitable for this IE."""
655 return False
656
657 def initialize(self):
658 """Initializes an instance (authentication, etc)."""
659 if not self._ready:
660 self._real_initialize()
661 self._ready = True
662
663 def extract(self, url):
664 """Extracts URL information and returns it in list of dicts."""
665 self.initialize()
666 return self._real_extract(url)
667
668 def set_downloader(self, downloader):
669 """Sets the downloader for this IE."""
670 self._downloader = downloader
671
672 def _real_initialize(self):
673 """Real initialization process. Redefine in subclasses."""
674 pass
675
676 def _real_extract(self, url):
677 """Real extraction process. Redefine in subclasses."""
678 pass
679
680 class YoutubeIE(InfoExtractor):
681 """Information extractor for youtube.com."""
682
683 _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?[\?#](?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
684 _LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
685 _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
686 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
687 _NETRC_MACHINE = 'youtube'
688 _available_formats = ['37', '22', '35', '18', '34', '5', '17', '13', None] # listed in order of priority for -b flag
689 _video_extensions = {
690 '13': '3gp',
691 '17': 'mp4',
692 '18': 'mp4',
693 '22': 'mp4',
694 '37': 'mp4',
695 '43': 'webm',
696 '45': 'webm',
697 }
698
699 @staticmethod
700 def suitable(url):
701 return (re.match(YoutubeIE._VALID_URL, url) is not None)
702
703 def report_lang(self):
704 """Report attempt to set language."""
705 self._downloader.to_stdout(u'[youtube] Setting language')
706
707 def report_login(self):
708 """Report attempt to log in."""
709 self._downloader.to_stdout(u'[youtube] Logging in')
710
711 def report_age_confirmation(self):
712 """Report attempt to confirm age."""
713 self._downloader.to_stdout(u'[youtube] Confirming age')
714
715 def report_video_webpage_download(self, video_id):
716 """Report attempt to download video webpage."""
717 self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
718
719 def report_video_info_webpage_download(self, video_id):
720 """Report attempt to download video info webpage."""
721 self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)
722
723 def report_information_extraction(self, video_id):
724 """Report attempt to extract video information."""
725 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
726
727 def report_unavailable_format(self, video_id, format):
728 """Report extracted video URL."""
729 self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
730
731 def report_rtmp_download(self):
732 """Indicate the download will use the RTMP protocol."""
733 self._downloader.to_stdout(u'[youtube] RTMP download detected')
734
735 def _real_initialize(self):
736 if self._downloader is None:
737 return
738
739 username = None
740 password = None
741 downloader_params = self._downloader.params
742
743 # Attempt to use provided username and password or .netrc data
744 if downloader_params.get('username', None) is not None:
745 username = downloader_params['username']
746 password = downloader_params['password']
747 elif downloader_params.get('usenetrc', False):
748 try:
749 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
750 if info is not None:
751 username = info[0]
752 password = info[2]
753 else:
754 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
755 except (IOError, netrc.NetrcParseError), err:
756 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
757 return
758
759 # Set language
760 request = urllib2.Request(self._LANG_URL, None, std_headers)
761 try:
762 self.report_lang()
763 urllib2.urlopen(request).read()
764 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
765 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
766 return
767
768 # No authentication to be performed
769 if username is None:
770 return
771
772 # Log in
773 login_form = {
774 'current_form': 'loginForm',
775 'next': '/',
776 'action_login': 'Log In',
777 'username': username,
778 'password': password,
779 }
780 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
781 try:
782 self.report_login()
783 login_results = urllib2.urlopen(request).read()
784 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
785 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
786 return
787 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
788 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
789 return
790
791 # Confirm age
792 age_form = {
793 'next_url': '/',
794 'action_confirm': 'Confirm',
795 }
796 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
797 try:
798 self.report_age_confirmation()
799 age_results = urllib2.urlopen(request).read()
800 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
801 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
802 return
803
804 def _real_extract(self, url):
805 # Extract video id from URL
806 mobj = re.match(self._VALID_URL, url)
807 if mobj is None:
808 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
809 return
810 video_id = mobj.group(2)
811
812 # Downloader parameters
813 best_quality = False
814 all_formats = False
815 format_param = None
816 quality_index = 0
817 if self._downloader is not None:
818 params = self._downloader.params
819 format_param = params.get('format', None)
820 if format_param == '0':
821 format_param = self._available_formats[quality_index]
822 best_quality = True
823 elif format_param == '-1':
824 format_param = self._available_formats[quality_index]
825 all_formats = True
826
827 while True:
828 # Extension
829 video_extension = self._video_extensions.get(format_param, 'flv')
830
831 # Get video webpage
832 self.report_video_webpage_download(video_id)
833 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id, None, std_headers)
834 try:
835 video_webpage = urllib2.urlopen(request).read()
836 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
837 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
838 return
839
840 # Attempt to extract SWF player URL
841 mobj = re.search(r'swfConfig.*"(http://.*?watch-.*?\.swf)"', video_webpage)
842 if mobj is not None:
843 player_url = mobj.group(1)
844 else:
845 player_url = None
846
847 # Get video info
848 self.report_video_info_webpage_download(video_id)
849 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
850 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
851 % (video_id, el_type))
852 request = urllib2.Request(video_info_url, None, std_headers)
853 try:
854 video_info_webpage = urllib2.urlopen(request).read()
855 video_info = parse_qs(video_info_webpage)
856 if 'token' in video_info:
857 break
858 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
859 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
860 return
861 self.report_information_extraction(video_id)
862
863 # "t" param
864 if 'token' not in video_info:
865 # Attempt to see if YouTube has issued an error message
866 if 'reason' not in video_info:
867 self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
868 stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
869 stream.write(video_info_webpage)
870 stream.close()
871 else:
872 reason = urllib.unquote_plus(video_info['reason'][0])
873 self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
874 return
875 token = urllib.unquote_plus(video_info['token'][0])
876 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
877 if format_param is not None:
878 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
879
880 # Check possible RTMP download
881 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
882 self.report_rtmp_download()
883 video_real_url = video_info['conn'][0]
884
885 # uploader
886 if 'author' not in video_info:
887 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
888 return
889 video_uploader = urllib.unquote_plus(video_info['author'][0])
890
891 # title
892 if 'title' not in video_info:
893 self._downloader.trouble(u'ERROR: unable to extract video title')
894 return
895 video_title = urllib.unquote_plus(video_info['title'][0])
896 video_title = video_title.decode('utf-8')
897 video_title = sanitize_title(video_title)
898
899 # simplified title
900 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
901 simple_title = simple_title.strip(ur'_')
902
903 # thumbnail image
904 if 'thumbnail_url' not in video_info:
905 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
906 video_thumbnail = ''
907 else: # don't panic if we can't find it
908 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
909
910 # description
911 video_description = 'No description available.'
912 if self._downloader.params.get('forcedescription', False):
913 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
914 if mobj is not None:
915 video_description = mobj.group(1)
916
917 try:
918 # Process video information
919 self._downloader.process_info({
920 'id': video_id.decode('utf-8'),
921 'url': video_real_url.decode('utf-8'),
922 'uploader': video_uploader.decode('utf-8'),
923 'title': video_title,
924 'stitle': simple_title,
925 'ext': video_extension.decode('utf-8'),
926 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
927 'thumbnail': video_thumbnail.decode('utf-8'),
928 'description': video_description.decode('utf-8'),
929 'player_url': player_url,
930 })
931
932 if all_formats:
933 quality_index += 1
934 if quality_index == len(self._available_formats):
935 # None left to get
936 return
937 else:
938 format_param = self._available_formats[quality_index]
939 continue
940 return
941
942 except UnavailableFormatError, err:
943 if best_quality or all_formats:
944 quality_index += 1
945 if quality_index == len(self._available_formats):
946 # I don't ever expect this to happen
947 if not all_formats:
948 self._downloader.trouble(u'ERROR: no known formats available for video')
949 return
950 else:
951 self.report_unavailable_format(video_id, format_param)
952 format_param = self._available_formats[quality_index]
953 continue
954 else:
955 self._downloader.trouble('ERROR: format not available for video')
956 return
957
958
959 class MetacafeIE(InfoExtractor):
960 """Information Extractor for metacafe.com."""
961
962 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
963 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
964 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
965 _youtube_ie = None
966
967 def __init__(self, youtube_ie, downloader=None):
968 InfoExtractor.__init__(self, downloader)
969 self._youtube_ie = youtube_ie
970
971 @staticmethod
972 def suitable(url):
973 return (re.match(MetacafeIE._VALID_URL, url) is not None)
974
975 def report_disclaimer(self):
976 """Report disclaimer retrieval."""
977 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
978
979 def report_age_confirmation(self):
980 """Report attempt to confirm age."""
981 self._downloader.to_stdout(u'[metacafe] Confirming age')
982
983 def report_download_webpage(self, video_id):
984 """Report webpage download."""
985 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
986
987 def report_extraction(self, video_id):
988 """Report information extraction."""
989 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
990
991 def _real_initialize(self):
992 # Retrieve disclaimer
993 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
994 try:
995 self.report_disclaimer()
996 disclaimer = urllib2.urlopen(request).read()
997 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
998 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
999 return
1000
1001 # Confirm age
1002 disclaimer_form = {
1003 'filters': '0',
1004 'submit': "Continue - I'm over 18",
1005 }
1006 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
1007 try:
1008 self.report_age_confirmation()
1009 disclaimer = urllib2.urlopen(request).read()
1010 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1011 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1012 return
1013
1014 def _real_extract(self, url):
1015 # Extract id and simplified title from URL
1016 mobj = re.match(self._VALID_URL, url)
1017 if mobj is None:
1018 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1019 return
1020
1021 video_id = mobj.group(1)
1022
1023 # Check if video comes from YouTube
1024 mobj2 = re.match(r'^yt-(.*)$', video_id)
1025 if mobj2 is not None:
1026 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1027 return
1028
1029 simple_title = mobj.group(2).decode('utf-8')
1030 video_extension = 'flv'
1031
1032 # Retrieve video webpage to extract further information
1033 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1034 try:
1035 self.report_download_webpage(video_id)
1036 webpage = urllib2.urlopen(request).read()
1037 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1038 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1039 return
1040
1041 # Extract URL, uploader and title from webpage
1042 self.report_extraction(video_id)
1043 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1044 if mobj is None:
1045 self._downloader.trouble(u'ERROR: unable to extract media URL')
1046 return
1047 mediaURL = urllib.unquote(mobj.group(1))
1048
1049 #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1050 #if mobj is None:
1051 # self._downloader.trouble(u'ERROR: unable to extract gdaKey')
1052 # return
1053 #gdaKey = mobj.group(1)
1054 #
1055 #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1056
1057 video_url = mediaURL
1058
1059 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1060 if mobj is None:
1061 self._downloader.trouble(u'ERROR: unable to extract title')
1062 return
1063 video_title = mobj.group(1).decode('utf-8')
1064 video_title = sanitize_title(video_title)
1065
1066 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1067 if mobj is None:
1068 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1069 return
1070 video_uploader = mobj.group(1)
1071
1072 try:
1073 # Process video information
1074 self._downloader.process_info({
1075 'id': video_id.decode('utf-8'),
1076 'url': video_url.decode('utf-8'),
1077 'uploader': video_uploader.decode('utf-8'),
1078 'title': video_title,
1079 'stitle': simple_title,
1080 'ext': video_extension.decode('utf-8'),
1081 'format': u'NA',
1082 'player_url': None,
1083 })
1084 except UnavailableFormatError:
1085 self._downloader.trouble(u'ERROR: format not available for video')
1086
1087
1088 class GoogleIE(InfoExtractor):
1089 """Information extractor for video.google.com."""
1090
1091 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1092
1093 def __init__(self, downloader=None):
1094 InfoExtractor.__init__(self, downloader)
1095
1096 @staticmethod
1097 def suitable(url):
1098 return (re.match(GoogleIE._VALID_URL, url) is not None)
1099
1100 def report_download_webpage(self, video_id):
1101 """Report webpage download."""
1102 self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
1103
1104 def report_extraction(self, video_id):
1105 """Report information extraction."""
1106 self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
1107
1108 def _real_initialize(self):
1109 return
1110
1111 def _real_extract(self, url):
1112 # Extract id from URL
1113 mobj = re.match(self._VALID_URL, url)
1114 if mobj is None:
1115 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1116 return
1117
1118 video_id = mobj.group(1)
1119
1120 video_extension = 'mp4'
1121
1122 # Retrieve video webpage to extract further information
1123 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1124 try:
1125 self.report_download_webpage(video_id)
1126 webpage = urllib2.urlopen(request).read()
1127 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1128 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1129 return
1130
1131 # Extract URL, uploader, and title from webpage
1132 self.report_extraction(video_id)
1133 mobj = re.search(r"download_url:'([^']+)'", webpage)
1134 if mobj is None:
1135 video_extension = 'flv'
1136 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1137 if mobj is None:
1138 self._downloader.trouble(u'ERROR: unable to extract media URL')
1139 return
1140 mediaURL = urllib.unquote(mobj.group(1))
1141 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1142 mediaURL = mediaURL.replace('\\x26', '\x26')
1143
1144 video_url = mediaURL
1145
1146 mobj = re.search(r'<title>(.*)</title>', webpage)
1147 if mobj is None:
1148 self._downloader.trouble(u'ERROR: unable to extract title')
1149 return
1150 video_title = mobj.group(1).decode('utf-8')
1151 video_title = sanitize_title(video_title)
1152 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1153
1154 # Extract video description
1155 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1156 if mobj is None:
1157 self._downloader.trouble(u'ERROR: unable to extract video description')
1158 return
1159 video_description = mobj.group(1).decode('utf-8')
1160 if not video_description:
1161 video_description = 'No description available.'
1162
1163 # Extract video thumbnail
1164 if self._downloader.params.get('forcethumbnail', False):
1165 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1166 try:
1167 webpage = urllib2.urlopen(request).read()
1168 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1169 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1170 return
1171 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1172 if mobj is None:
1173 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1174 return
1175 video_thumbnail = mobj.group(1)
1176 else: # we need something to pass to process_info
1177 video_thumbnail = ''
1178
1179
1180 try:
1181 # Process video information
1182 self._downloader.process_info({
1183 'id': video_id.decode('utf-8'),
1184 'url': video_url.decode('utf-8'),
1185 'uploader': u'NA',
1186 'title': video_title,
1187 'stitle': simple_title,
1188 'ext': video_extension.decode('utf-8'),
1189 'format': u'NA',
1190 'player_url': None,
1191 })
1192 except UnavailableFormatError:
1193 self._downloader.trouble(u'ERROR: format not available for video')
1194
1195
1196 class PhotobucketIE(InfoExtractor):
1197 """Information extractor for photobucket.com."""
1198
1199 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1200
1201 def __init__(self, downloader=None):
1202 InfoExtractor.__init__(self, downloader)
1203
1204 @staticmethod
1205 def suitable(url):
1206 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1207
1208 def report_download_webpage(self, video_id):
1209 """Report webpage download."""
1210 self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1211
1212 def report_extraction(self, video_id):
1213 """Report information extraction."""
1214 self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
1215
1216 def _real_initialize(self):
1217 return
1218
1219 def _real_extract(self, url):
1220 # Extract id from URL
1221 mobj = re.match(self._VALID_URL, url)
1222 if mobj is None:
1223 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1224 return
1225
1226 video_id = mobj.group(1)
1227
1228 video_extension = 'flv'
1229
1230 # Retrieve video webpage to extract further information
1231 request = urllib2.Request(url)
1232 try:
1233 self.report_download_webpage(video_id)
1234 webpage = urllib2.urlopen(request).read()
1235 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1236 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1237 return
1238
1239 # Extract URL, uploader, and title from webpage
1240 self.report_extraction(video_id)
1241 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1242 if mobj is None:
1243 self._downloader.trouble(u'ERROR: unable to extract media URL')
1244 return
1245 mediaURL = urllib.unquote(mobj.group(1))
1246
1247 video_url = mediaURL
1248
1249 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1250 if mobj is None:
1251 self._downloader.trouble(u'ERROR: unable to extract title')
1252 return
1253 video_title = mobj.group(1).decode('utf-8')
1254 video_title = sanitize_title(video_title)
1255 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1256
1257 video_uploader = mobj.group(2).decode('utf-8')
1258
1259 try:
1260 # Process video information
1261 self._downloader.process_info({
1262 'id': video_id.decode('utf-8'),
1263 'url': video_url.decode('utf-8'),
1264 'uploader': video_uploader,
1265 'title': video_title,
1266 'stitle': simple_title,
1267 'ext': video_extension.decode('utf-8'),
1268 'format': u'NA',
1269 'player_url': None,
1270 })
1271 except UnavailableFormatError:
1272 self._downloader.trouble(u'ERROR: format not available for video')
1273
1274
1275 class YahooIE(InfoExtractor):
1276 """Information extractor for video.yahoo.com."""
1277
1278 # _VALID_URL matches all Yahoo! Video URLs
1279 # _VPAGE_URL matches only the extractable '/watch/' URLs
1280 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1281 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1282
1283 def __init__(self, downloader=None):
1284 InfoExtractor.__init__(self, downloader)
1285
1286 @staticmethod
1287 def suitable(url):
1288 return (re.match(YahooIE._VALID_URL, url) is not None)
1289
1290 def report_download_webpage(self, video_id):
1291 """Report webpage download."""
1292 self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)
1293
1294 def report_extraction(self, video_id):
1295 """Report information extraction."""
1296 self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)
1297
1298 def _real_initialize(self):
1299 return
1300
1301 def _real_extract(self, url):
1302 # Extract ID from URL
1303 mobj = re.match(self._VALID_URL, url)
1304 if mobj is None:
1305 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1306 return
1307
1308 video_id = mobj.group(2)
1309 video_extension = 'flv'
1310
1311 # Rewrite valid but non-extractable URLs as
1312 # extractable English language /watch/ URLs
1313 if re.match(self._VPAGE_URL, url) is None:
1314 request = urllib2.Request(url)
1315 try:
1316 webpage = urllib2.urlopen(request).read()
1317 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1318 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1319 return
1320
1321 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1322 if mobj is None:
1323 self._downloader.trouble(u'ERROR: Unable to extract id field')
1324 return
1325 yahoo_id = mobj.group(1)
1326
1327 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1328 if mobj is None:
1329 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1330 return
1331 yahoo_vid = mobj.group(1)
1332
1333 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1334 return self._real_extract(url)
1335
1336 # Retrieve video webpage to extract further information
1337 request = urllib2.Request(url)
1338 try:
1339 self.report_download_webpage(video_id)
1340 webpage = urllib2.urlopen(request).read()
1341 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1342 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1343 return
1344
1345 # Extract uploader and title from webpage
1346 self.report_extraction(video_id)
1347 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1348 if mobj is None:
1349 self._downloader.trouble(u'ERROR: unable to extract video title')
1350 return
1351 video_title = mobj.group(1).decode('utf-8')
1352 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1353
1354 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1355 if mobj is None:
1356 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1357 return
1358 video_uploader = mobj.group(1).decode('utf-8')
1359
1360 # Extract video thumbnail
1361 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1362 if mobj is None:
1363 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1364 return
1365 video_thumbnail = mobj.group(1).decode('utf-8')
1366
1367 # Extract video description
1368 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1369 if mobj is None:
1370 self._downloader.trouble(u'ERROR: unable to extract video description')
1371 return
1372 video_description = mobj.group(1).decode('utf-8')
1373 if not video_description: video_description = 'No description available.'
1374
1375 # Extract video height and width
1376 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1377 if mobj is None:
1378 self._downloader.trouble(u'ERROR: unable to extract video height')
1379 return
1380 yv_video_height = mobj.group(1)
1381
1382 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1383 if mobj is None:
1384 self._downloader.trouble(u'ERROR: unable to extract video width')
1385 return
1386 yv_video_width = mobj.group(1)
1387
1388 # Retrieve video playlist to extract media URL
1389 # I'm not completely sure what all these options are, but we
1390 # seem to need most of them, otherwise the server sends a 401.
1391 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1392 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1393 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1394 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1395 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1396 try:
1397 self.report_download_webpage(video_id)
1398 webpage = urllib2.urlopen(request).read()
1399 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1400 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1401 return
1402
1403 # Extract media URL from playlist XML
1404 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1405 if mobj is None:
1406 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1407 return
1408 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1409 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1410
1411 try:
1412 # Process video information
1413 self._downloader.process_info({
1414 'id': video_id.decode('utf-8'),
1415 'url': video_url,
1416 'uploader': video_uploader,
1417 'title': video_title,
1418 'stitle': simple_title,
1419 'ext': video_extension.decode('utf-8'),
1420 'thumbnail': video_thumbnail.decode('utf-8'),
1421 'description': video_description,
1422 'thumbnail': video_thumbnail,
1423 'description': video_description,
1424 'player_url': None,
1425 })
1426 except UnavailableFormatError:
1427 self._downloader.trouble(u'ERROR: format not available for video')
1428
1429
1430 class GenericIE(InfoExtractor):
1431 """Generic last-resort information extractor."""
1432
1433 def __init__(self, downloader=None):
1434 InfoExtractor.__init__(self, downloader)
1435
1436 @staticmethod
1437 def suitable(url):
1438 return True
1439
1440 def report_download_webpage(self, video_id):
1441 """Report webpage download."""
1442 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1443 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1444
1445 def report_extraction(self, video_id):
1446 """Report information extraction."""
1447 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1448
1449 def _real_initialize(self):
1450 return
1451
1452 def _real_extract(self, url):
1453 video_id = url.split('/')[-1]
1454 request = urllib2.Request(url)
1455 try:
1456 self.report_download_webpage(video_id)
1457 webpage = urllib2.urlopen(request).read()
1458 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1459 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1460 return
1461 except ValueError, err:
1462 # since this is the last-resort InfoExtractor, if
1463 # this error is thrown, it'll be thrown here
1464 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1465 return
1466
1467 # Start with something easy: JW Player in SWFObject
1468 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1469 if mobj is None:
1470 # Broaden the search a little bit
1471 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1472 if mobj is None:
1473 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1474 return
1475
1476 # It's possible that one of the regexes
1477 # matched, but returned an empty group:
1478 if mobj.group(1) is None:
1479 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1480 return
1481
1482 video_url = urllib.unquote(mobj.group(1))
1483 video_id = os.path.basename(video_url)
1484
1485 # here's a fun little line of code for you:
1486 video_extension = os.path.splitext(video_id)[1][1:]
1487 video_id = os.path.splitext(video_id)[0]
1488
1489 # it's tempting to parse this further, but you would
1490 # have to take into account all the variations like
1491 # Video Title - Site Name
1492 # Site Name | Video Title
1493 # Video Title - Tagline | Site Name
1494 # and so on and so forth; it's just not practical
1495 mobj = re.search(r'<title>(.*)</title>', webpage)
1496 if mobj is None:
1497 self._downloader.trouble(u'ERROR: unable to extract title')
1498 return
1499 video_title = mobj.group(1).decode('utf-8')
1500 video_title = sanitize_title(video_title)
1501 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1502
1503 # video uploader is domain name
1504 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1505 if mobj is None:
1506 self._downloader.trouble(u'ERROR: unable to extract title')
1507 return
1508 video_uploader = mobj.group(1).decode('utf-8')
1509
1510 try:
1511 # Process video information
1512 self._downloader.process_info({
1513 'id': video_id.decode('utf-8'),
1514 'url': video_url.decode('utf-8'),
1515 'uploader': video_uploader,
1516 'title': video_title,
1517 'stitle': simple_title,
1518 'ext': video_extension.decode('utf-8'),
1519 'format': u'NA',
1520 'player_url': None,
1521 })
1522 except UnavailableFormatError:
1523 self._downloader.trouble(u'ERROR: format not available for video')
1524
1525
1526 class YoutubeSearchIE(InfoExtractor):
1527 """Information Extractor for YouTube search queries."""
1528 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1529 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1530 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1531 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1532 _youtube_ie = None
1533 _max_youtube_results = 1000
1534
1535 def __init__(self, youtube_ie, downloader=None):
1536 InfoExtractor.__init__(self, downloader)
1537 self._youtube_ie = youtube_ie
1538
1539 @staticmethod
1540 def suitable(url):
1541 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1542
1543 def report_download_page(self, query, pagenum):
1544 """Report attempt to download playlist page with given number."""
1545 query = query.decode(preferredencoding())
1546 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1547
1548 def _real_initialize(self):
1549 self._youtube_ie.initialize()
1550
1551 def _real_extract(self, query):
1552 mobj = re.match(self._VALID_QUERY, query)
1553 if mobj is None:
1554 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1555 return
1556
1557 prefix, query = query.split(':')
1558 prefix = prefix[8:]
1559 query = query.encode('utf-8')
1560 if prefix == '':
1561 self._download_n_results(query, 1)
1562 return
1563 elif prefix == 'all':
1564 self._download_n_results(query, self._max_youtube_results)
1565 return
1566 else:
1567 try:
1568 n = long(prefix)
1569 if n <= 0:
1570 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1571 return
1572 elif n > self._max_youtube_results:
1573 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1574 n = self._max_youtube_results
1575 self._download_n_results(query, n)
1576 return
1577 except ValueError: # parsing prefix as integer fails
1578 self._download_n_results(query, 1)
1579 return
1580
1581 def _download_n_results(self, query, n):
1582 """Downloads a specified number of results for a query"""
1583
1584 video_ids = []
1585 already_seen = set()
1586 pagenum = 1
1587
1588 while True:
1589 self.report_download_page(query, pagenum)
1590 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1591 request = urllib2.Request(result_url, None, std_headers)
1592 try:
1593 page = urllib2.urlopen(request).read()
1594 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1595 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1596 return
1597
1598 # Extract video identifiers
1599 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1600 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1601 if video_id not in already_seen:
1602 video_ids.append(video_id)
1603 already_seen.add(video_id)
1604 if len(video_ids) == n:
1605 # Specified n videos reached
1606 for id in video_ids:
1607 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1608 return
1609
1610 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1611 for id in video_ids:
1612 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1613 return
1614
1615 pagenum = pagenum + 1
1616
1617 class GoogleSearchIE(InfoExtractor):
1618 """Information Extractor for Google Video search queries."""
1619 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1620 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1621 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1622 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1623 _google_ie = None
1624 _max_google_results = 1000
1625
1626 def __init__(self, google_ie, downloader=None):
1627 InfoExtractor.__init__(self, downloader)
1628 self._google_ie = google_ie
1629
1630 @staticmethod
1631 def suitable(url):
1632 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1633
1634 def report_download_page(self, query, pagenum):
1635 """Report attempt to download playlist page with given number."""
1636 query = query.decode(preferredencoding())
1637 self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1638
1639 def _real_initialize(self):
1640 self._google_ie.initialize()
1641
1642 def _real_extract(self, query):
1643 mobj = re.match(self._VALID_QUERY, query)
1644 if mobj is None:
1645 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1646 return
1647
1648 prefix, query = query.split(':')
1649 prefix = prefix[8:]
1650 query = query.encode('utf-8')
1651 if prefix == '':
1652 self._download_n_results(query, 1)
1653 return
1654 elif prefix == 'all':
1655 self._download_n_results(query, self._max_google_results)
1656 return
1657 else:
1658 try:
1659 n = long(prefix)
1660 if n <= 0:
1661 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1662 return
1663 elif n > self._max_google_results:
1664 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1665 n = self._max_google_results
1666 self._download_n_results(query, n)
1667 return
1668 except ValueError: # parsing prefix as integer fails
1669 self._download_n_results(query, 1)
1670 return
1671
1672 def _download_n_results(self, query, n):
1673 """Downloads a specified number of results for a query"""
1674
1675 video_ids = []
1676 already_seen = set()
1677 pagenum = 1
1678
1679 while True:
1680 self.report_download_page(query, pagenum)
1681 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1682 request = urllib2.Request(result_url, None, std_headers)
1683 try:
1684 page = urllib2.urlopen(request).read()
1685 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1686 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1687 return
1688
1689 # Extract video identifiers
1690 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1691 video_id = mobj.group(1)
1692 if video_id not in already_seen:
1693 video_ids.append(video_id)
1694 already_seen.add(video_id)
1695 if len(video_ids) == n:
1696 # Specified n videos reached
1697 for id in video_ids:
1698 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1699 return
1700
1701 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1702 for id in video_ids:
1703 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1704 return
1705
1706 pagenum = pagenum + 1
1707
1708 class YahooSearchIE(InfoExtractor):
1709 """Information Extractor for Yahoo! Video search queries."""
1710 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1711 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1712 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1713 _MORE_PAGES_INDICATOR = r'\s*Next'
1714 _yahoo_ie = None
1715 _max_yahoo_results = 1000
1716
1717 def __init__(self, yahoo_ie, downloader=None):
1718 InfoExtractor.__init__(self, downloader)
1719 self._yahoo_ie = yahoo_ie
1720
1721 @staticmethod
1722 def suitable(url):
1723 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1724
1725 def report_download_page(self, query, pagenum):
1726 """Report attempt to download playlist page with given number."""
1727 query = query.decode(preferredencoding())
1728 self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1729
1730 def _real_initialize(self):
1731 self._yahoo_ie.initialize()
1732
1733 def _real_extract(self, query):
1734 mobj = re.match(self._VALID_QUERY, query)
1735 if mobj is None:
1736 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1737 return
1738
1739 prefix, query = query.split(':')
1740 prefix = prefix[8:]
1741 query = query.encode('utf-8')
1742 if prefix == '':
1743 self._download_n_results(query, 1)
1744 return
1745 elif prefix == 'all':
1746 self._download_n_results(query, self._max_yahoo_results)
1747 return
1748 else:
1749 try:
1750 n = long(prefix)
1751 if n <= 0:
1752 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1753 return
1754 elif n > self._max_yahoo_results:
1755 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1756 n = self._max_yahoo_results
1757 self._download_n_results(query, n)
1758 return
1759 except ValueError: # parsing prefix as integer fails
1760 self._download_n_results(query, 1)
1761 return
1762
1763 def _download_n_results(self, query, n):
1764 """Downloads a specified number of results for a query"""
1765
1766 video_ids = []
1767 already_seen = set()
1768 pagenum = 1
1769
1770 while True:
1771 self.report_download_page(query, pagenum)
1772 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1773 request = urllib2.Request(result_url, None, std_headers)
1774 try:
1775 page = urllib2.urlopen(request).read()
1776 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1777 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1778 return
1779
1780 # Extract video identifiers
1781 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1782 video_id = mobj.group(1)
1783 if video_id not in already_seen:
1784 video_ids.append(video_id)
1785 already_seen.add(video_id)
1786 if len(video_ids) == n:
1787 # Specified n videos reached
1788 for id in video_ids:
1789 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1790 return
1791
1792 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1793 for id in video_ids:
1794 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1795 return
1796
1797 pagenum = pagenum + 1
1798
1799 class YoutubePlaylistIE(InfoExtractor):
1800 """Information Extractor for YouTube playlists."""
1801
1802 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1803 _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1804 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1805 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1806 _youtube_ie = None
1807
1808 def __init__(self, youtube_ie, downloader=None):
1809 InfoExtractor.__init__(self, downloader)
1810 self._youtube_ie = youtube_ie
1811
1812 @staticmethod
1813 def suitable(url):
1814 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1815
1816 def report_download_page(self, playlist_id, pagenum):
1817 """Report attempt to download playlist page with given number."""
1818 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1819
1820 def _real_initialize(self):
1821 self._youtube_ie.initialize()
1822
1823 def _real_extract(self, url):
1824 # Extract playlist id
1825 mobj = re.match(self._VALID_URL, url)
1826 if mobj is None:
1827 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1828 return
1829
1830 # Download playlist pages
1831 playlist_id = mobj.group(1)
1832 video_ids = []
1833 pagenum = 1
1834
1835 while True:
1836 self.report_download_page(playlist_id, pagenum)
1837 request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1838 try:
1839 page = urllib2.urlopen(request).read()
1840 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1841 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1842 return
1843
1844 # Extract video identifiers
1845 ids_in_page = []
1846 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1847 if mobj.group(1) not in ids_in_page:
1848 ids_in_page.append(mobj.group(1))
1849 video_ids.extend(ids_in_page)
1850
1851 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1852 break
1853 pagenum = pagenum + 1
1854
1855 for id in video_ids:
1856 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1857 return
1858
1859 class YoutubeUserIE(InfoExtractor):
1860 """Information Extractor for YouTube users."""
1861
1862 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1863 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1864 _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1865 _youtube_ie = None
1866
1867 def __init__(self, youtube_ie, downloader=None):
1868 InfoExtractor.__init__(self, downloader)
1869 self._youtube_ie = youtube_ie
1870
1871 @staticmethod
1872 def suitable(url):
1873 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1874
1875 def report_download_page(self, username):
1876 """Report attempt to download user page."""
1877 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1878
1879 def _real_initialize(self):
1880 self._youtube_ie.initialize()
1881
1882 def _real_extract(self, url):
1883 # Extract username
1884 mobj = re.match(self._VALID_URL, url)
1885 if mobj is None:
1886 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1887 return
1888
1889 # Download user page
1890 username = mobj.group(1)
1891 video_ids = []
1892 pagenum = 1
1893
1894 self.report_download_page(username)
1895 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1896 try:
1897 page = urllib2.urlopen(request).read()
1898 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1899 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1900 return
1901
1902 # Extract video identifiers
1903 ids_in_page = []
1904
1905 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1906 if mobj.group(1) not in ids_in_page:
1907 ids_in_page.append(mobj.group(1))
1908 video_ids.extend(ids_in_page)
1909
1910 for id in video_ids:
1911 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1912 return
1913
1914 class PostProcessor(object):
1915 """Post Processor class.
1916
1917 PostProcessor objects can be added to downloaders with their
1918 add_post_processor() method. When the downloader has finished a
1919 successful download, it will take its internal chain of PostProcessors
1920 and start calling the run() method on each one of them, first with
1921 an initial argument and then with the returned value of the previous
1922 PostProcessor.
1923
1924 The chain will be stopped if one of them ever returns None or the end
1925 of the chain is reached.
1926
1927 PostProcessor objects follow a "mutual registration" process similar
1928 to InfoExtractor objects.
1929 """
1930
1931 _downloader = None
1932
1933 def __init__(self, downloader=None):
1934 self._downloader = downloader
1935
1936 def set_downloader(self, downloader):
1937 """Sets the downloader for this PP."""
1938 self._downloader = downloader
1939
1940 def run(self, information):
1941 """Run the PostProcessor.
1942
1943 The "information" argument is a dictionary like the ones
1944 composed by InfoExtractors. The only difference is that this
1945 one has an extra field called "filepath" that points to the
1946 downloaded file.
1947
1948 When this method returns None, the postprocessing chain is
1949 stopped. However, this method may return an information
1950 dictionary that will be passed to the next postprocessing
1951 object in the chain. It can be the one it received after
1952 changing some fields.
1953
1954 In addition, this method may raise a PostProcessingError
1955 exception that will be taken into account by the downloader
1956 it was called from.
1957 """
1958 return information # by default, do nothing
1959
1960 ### MAIN PROGRAM ###
1961 if __name__ == '__main__':
1962 try:
1963 # Modules needed only when running the main program
1964 import getpass
1965 import optparse
1966
1967 # Function to update the program file with the latest version from bitbucket.org
1968 def update_self(downloader, filename):
1969 # Note: downloader only used for options
1970 if not os.access (filename, os.W_OK):
1971 sys.exit('ERROR: no write permissions on %s' % filename)
1972
1973 downloader.to_stdout('Updating to latest stable version...')
1974 latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
1975 latest_version = urllib.urlopen(latest_url).read().strip()
1976 prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
1977 newcontent = urllib.urlopen(prog_url).read()
1978 stream = open(filename, 'w')
1979 stream.write(newcontent)
1980 stream.close()
1981 downloader.to_stdout('Updated to version %s' % latest_version)
1982
1983 # General configuration
1984 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
1985 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
1986 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
1987
1988 # Parse command line
1989 parser = optparse.OptionParser(
1990 usage='Usage: %prog [options] url...',
1991 version='2010.06.06',
1992 conflict_handler='resolve',
1993 )
1994
1995 parser.add_option('-h', '--help',
1996 action='help', help='print this help text and exit')
1997 parser.add_option('-v', '--version',
1998 action='version', help='print program version and exit')
1999 parser.add_option('-U', '--update',
2000 action='store_true', dest='update_self', help='update this program to latest stable version')
2001 parser.add_option('-i', '--ignore-errors',
2002 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2003 parser.add_option('-r', '--rate-limit',
2004 dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
2005 parser.add_option('-R', '--retries',
2006 dest='retries', metavar='T', help='number of retries (default is 10)', default=10)
2007
2008 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2009 authentication.add_option('-u', '--username',
2010 dest='username', metavar='UN', help='account username')
2011 authentication.add_option('-p', '--password',
2012 dest='password', metavar='PW', help='account password')
2013 authentication.add_option('-n', '--netrc',
2014 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2015 parser.add_option_group(authentication)
2016
2017 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2018 video_format.add_option('-f', '--format',
2019 action='store', dest='format', metavar='FMT', help='video format code')
2020 video_format.add_option('-b', '--best-quality',
2021 action='store_const', dest='format', help='download the best quality video possible', const='0')
2022 video_format.add_option('-m', '--mobile-version',
2023 action='store_const', dest='format', help='alias for -f 17', const='17')
2024 video_format.add_option('-d', '--high-def',
2025 action='store_const', dest='format', help='alias for -f 22', const='22')
2026 video_format.add_option('--all-formats',
2027 action='store_const', dest='format', help='download all available video formats', const='-1')
2028 parser.add_option_group(video_format)
2029
2030 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2031 verbosity.add_option('-q', '--quiet',
2032 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2033 verbosity.add_option('-s', '--simulate',
2034 action='store_true', dest='simulate', help='do not download video', default=False)
2035 verbosity.add_option('-g', '--get-url',
2036 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2037 verbosity.add_option('-e', '--get-title',
2038 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2039 verbosity.add_option('--get-thumbnail',
2040 action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
2041 verbosity.add_option('--get-description',
2042 action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
2043 verbosity.add_option('--no-progress',
2044 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2045 parser.add_option_group(verbosity)
2046
2047 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2048 filesystem.add_option('-t', '--title',
2049 action='store_true', dest='usetitle', help='use title in file name', default=False)
2050 filesystem.add_option('-l', '--literal',
2051 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2052 filesystem.add_option('-o', '--output',
2053 dest='outtmpl', metavar='TPL', help='output filename template')
2054 filesystem.add_option('-a', '--batch-file',
2055 dest='batchfile', metavar='F', help='file containing URLs to download (\'-\' for stdin)')
2056 filesystem.add_option('-w', '--no-overwrites',
2057 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2058 filesystem.add_option('-c', '--continue',
2059 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2060 parser.add_option_group(filesystem)
2061
2062 (opts, args) = parser.parse_args()
2063
2064 # Batch file verification
2065 batchurls = []
2066 if opts.batchfile is not None:
2067 try:
2068 if opts.batchfile == '-':
2069 batchfd = sys.stdin
2070 else:
2071 batchfd = open(opts.batchfile, 'r')
2072 batchurls = batchfd.readlines()
2073 batchurls = [x.strip() for x in batchurls]
2074 batchurls = [x for x in batchurls if len(x) > 0]
2075 except IOError:
2076 sys.exit(u'ERROR: batch file could not be read')
2077 all_urls = batchurls + args
2078
2079 # Conflicting, missing and erroneous options
2080 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2081 parser.error(u'using .netrc conflicts with giving username/password')
2082 if opts.password is not None and opts.username is None:
2083 parser.error(u'account username missing')
2084 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
2085 parser.error(u'using output template conflicts with using title or literal title')
2086 if opts.usetitle and opts.useliteral:
2087 parser.error(u'using title conflicts with using literal title')
2088 if opts.username is not None and opts.password is None:
2089 opts.password = getpass.getpass(u'Type account password and press return:')
2090 if opts.ratelimit is not None:
2091 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2092 if numeric_limit is None:
2093 parser.error(u'invalid rate limit specified')
2094 opts.ratelimit = numeric_limit
2095 if opts.retries is not None:
2096 try:
2097 opts.retries = long(opts.retries)
2098 except (TypeError, ValueError), err:
2099 parser.error(u'invalid retry count specified')
2100
2101 # Information extractors
2102 youtube_ie = YoutubeIE()
2103 metacafe_ie = MetacafeIE(youtube_ie)
2104 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
2105 youtube_user_ie = YoutubeUserIE(youtube_ie)
2106 youtube_search_ie = YoutubeSearchIE(youtube_ie)
2107 google_ie = GoogleIE()
2108 google_search_ie = GoogleSearchIE(google_ie)
2109 photobucket_ie = PhotobucketIE()
2110 yahoo_ie = YahooIE()
2111 yahoo_search_ie = YahooSearchIE(yahoo_ie)
2112 generic_ie = GenericIE()
2113
2114 # File downloader
2115 fd = FileDownloader({
2116 'usenetrc': opts.usenetrc,
2117 'username': opts.username,
2118 'password': opts.password,
2119 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2120 'forceurl': opts.geturl,
2121 'forcetitle': opts.gettitle,
2122 'forcethumbnail': opts.getthumbnail,
2123 'forcedescription': opts.getdescription,
2124 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2125 'format': opts.format,
2126 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
2127 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2128 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2129 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
2130 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2131 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
2132 or u'%(id)s.%(ext)s'),
2133 'ignoreerrors': opts.ignoreerrors,
2134 'ratelimit': opts.ratelimit,
2135 'nooverwrites': opts.nooverwrites,
2136 'retries': opts.retries,
2137 'continuedl': opts.continue_dl,
2138 'noprogress': opts.noprogress,
2139 })
2140 fd.add_info_extractor(youtube_search_ie)
2141 fd.add_info_extractor(youtube_pl_ie)
2142 fd.add_info_extractor(youtube_user_ie)
2143 fd.add_info_extractor(metacafe_ie)
2144 fd.add_info_extractor(youtube_ie)
2145 fd.add_info_extractor(google_ie)
2146 fd.add_info_extractor(google_search_ie)
2147 fd.add_info_extractor(photobucket_ie)
2148 fd.add_info_extractor(yahoo_ie)
2149 fd.add_info_extractor(yahoo_search_ie)
2150
2151 # This must come last since it's the
2152 # fallback if none of the others work
2153 fd.add_info_extractor(generic_ie)
2154
2155 # Update version
2156 if opts.update_self:
2157 update_self(fd, sys.argv[0])
2158
2159 # Maybe do nothing
2160 if len(all_urls) < 1:
2161 if not opts.update_self:
2162 parser.error(u'you must provide at least one URL')
2163 else:
2164 sys.exit()
2165 retcode = fd.download(all_urls)
2166 sys.exit(retcode)
2167
2168 except DownloadError:
2169 sys.exit(1)
2170 except SameFileError:
2171 sys.exit(u'ERROR: fixed output name but more than one file to download')
2172 except KeyboardInterrupt:
2173 sys.exit(u'\nERROR: Interrupted by user')