]> Raphaƫl G. Git Repositories - youtubedl/blob - youtube-dl
Imported Upstream version 2010.08.04
[youtubedl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
7 import htmlentitydefs
8 import httplib
9 import locale
10 import math
11 import netrc
12 import os
13 import os.path
14 import re
15 import socket
16 import string
17 import subprocess
18 import sys
19 import time
20 import urllib
21 import urllib2
22
23 # parse_qs was moved from the cgi module to the urlparse module recently.
24 try:
25 from urlparse import parse_qs
26 except ImportError:
27 from cgi import parse_qs
28
# Headers sent with every HTTP request made by this script. The
# User-Agent value identifies the client as a desktop Firefox browser.
std_headers = dict([
    ('User-Agent', 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.8) Gecko/20100723 Firefox/3.6.8'),
    ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
    ('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.7'),
    ('Accept-Language', 'en-us,en;q=0.5'),
])
35
# Unicode string holding every character allowed in a "simplified" title:
# the ASCII letters followed by the ASCII digits.
simple_title_chars = u''.join([string.ascii_letters, string.digits])
37
def preferredencoding():
    """Return the name of the encoding to use for console output.

    The value of locale.getpreferredencoding() is returned when a short
    test string can be encoded with it; in every other case the function
    falls back to 'UTF-8'.
    """
    try:
        encoding = locale.getpreferredencoding()
        # Probe the codec: an unknown or unusable encoding name fails here.
        u'TEST'.encode(encoding)
    except:
        # Deliberately broad: whatever went wrong, UTF-8 is the fallback.
        encoding = 'UTF-8'
    return encoding
53
def htmlentity_transform(matchobj):
    """Transform an HTML entity into the Unicode character it stands for.

    This function receives a match object and is intended to be used as
    the replacement callback of re.sub(). Group 1 of the match must hold
    the entity text without the leading '&' and the trailing ';'.

    Named entities (amp, eacute, ...) and numeric character references,
    both decimal (#233) and hexadecimal (#xE9, #Xe9), are decoded. Anything
    else, including a numeric reference this interpreter cannot represent
    as a single character, is returned in its literal '&...;' form.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Numeric character reference. The pattern is anchored so that trailing
    # garbage is never silently dropped, and the hexadecimal branch accepts
    # the digits a-f, which a plain \d+ would not match.
    mobj = re.match(u'#(?:[xX]([0-9a-fA-F]+)|([0-9]+))$', entity)
    if mobj is not None:
        (hexstr, decstr) = mobj.groups()
        if hexstr is not None:
            codepoint = long(hexstr, 16)
        else:
            codepoint = long(decstr, 10)
        try:
            return unichr(codepoint)
        except (ValueError, OverflowError):
            # Code point out of range for unichr(); keep the literal text
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
79
def sanitize_title(utitle):
    """Make a video title usable as part of a file name.

    HTML entities found in the Unicode title are passed through
    htmlentity_transform(), and every occurrence of the platform path
    separator is then replaced with a percent sign.
    """
    decoded = re.sub(u'(?u)&(.+?);', htmlentity_transform, utitle)
    separator = unicode(os.sep)
    return decoded.replace(separator, u'%')
84
def sanitize_open(filename, open_mode):
    """Open filename, retrying once with a tamer name if that fails.

    The special name u'-' stands for standard output and is returned
    without opening anything. Otherwise the file is opened with the given
    mode; when that raises IOError or OSError, the characters that are
    forbidden in win32 file names are replaced with '#' and the open is
    attempted a second time. A failure of the second attempt propagates
    to the caller, like a failure of the standard open() function.

    Returns the tuple (stream, definitive_file_name).
    """
    if filename == u'-':
        return (sys.stdout, filename)

    try:
        return (open(filename, open_mode), filename)
    except (IOError, OSError):
        # In case of error, try to remove win32 forbidden chars
        tamed_name = re.sub(u'[/<>:"\\|\\?\\*]', u'#', filename)

        # An exception here should be caught in the caller
        return (open(tamed_name, open_mode), tamed_name)
107
108
class DownloadError(Exception):
    """Signals that a download problem stopped the program.

    FileDownloader.trouble() raises it, carrying the error message, when
    the downloader has not been configured to ignore errors.
    """
117
class SameFileError(Exception):
    """Signals that several downloads would end up in the same file.

    FileDownloader.download() raises it, with the output template as its
    argument, when more than one URL is given and the template is fixed.
    """
125
class PostProcessingError(Exception):
    """Signals a failure in a postprocessing task.

    It is meant to be raised by the run() method of a post processor;
    FileDownloader.process_info() catches it around the postprocessing
    chain and reports it as an error.
    """
133
class UnavailableVideoError(Exception):
    """Signals that a video is not available in the requested format.

    FileDownloader.process_info() raises it, without arguments, when the
    download attempt fails with an IOError or OSError.
    """
141
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when the number
    of bytes they downloaded differs from the length the server announced
    first, indicating the connection was probably interrupted.

    Attributes:
    downloaded: Number of bytes actually received.
    expected:   Number of bytes announced by the server.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Initialise the base class as well, so that args, repr() and
        # tracebacks of an uncaught error are not empty.
        Exception.__init__(self, downloaded, expected)
        self.downloaded = downloaded
        self.expected = expected

    def __str__(self):
        return 'content too short (expected %s bytes and served %s)' % (self.expected, self.downloaded)
156
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader hands it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:         Username for authentication purposes.
    password:         Password for authentication purposes.
    usenetrc:         Use netrc for authentication instead.
    quiet:            Do not print messages to stdout.
    forceurl:         Force printing final URL.
    forcetitle:       Force printing title.
    forcethumbnail:   Force printing thumbnail URL.
    forcedescription: Force printing description.
    simulate:         Do not download the video files.
    format:           Video format code.
    format_limit:     Highest quality format to try.
    outtmpl:          Template for output names.
    ignoreerrors:     Do not stop on download errors.
    ratelimit:        Download speed limit, in bytes/sec.
    nooverwrites:     Prevent overwriting files.
    retries:          Number of times to retry for HTTP error 503
    continuedl:       Try to continue downloads if possible.
    noprogress:       Do not print the progress bar.
    """

    params = None
    _ies = []
    _pps = []
    _download_retcode = None
    _num_downloads = None

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        self._ies = []
        self._pps = []
        self._download_retcode = 0
        self._num_downloads = 0
        self.params = params

    @staticmethod
    def pmkdir(filename):
        """Create directory components in filename. Similar to Unix "mkdir -p".

        The last path component is taken to be the file itself and is not
        created. OSError is raised when a directory cannot be created.
        """
        directory = os.path.dirname(filename)
        if directory and not os.path.isdir(directory):
            os.makedirs(directory)

    @staticmethod
    def format_bytes(bytes):
        """Format a byte quantity as a short string such as '1.50k'.

        Accepts a number, a numeric string or None (shown as 'N/A').
        """
        if bytes is None:
            return 'N/A'
        if isinstance(bytes, (str, type(u''))):
            bytes = float(bytes)
        suffixes = 'bkMGTPEZY'
        if bytes < 1.0:
            # Also keeps math.log() away from zero and from tiny rates,
            # which would otherwise yield a negative suffix index
            exponent = 0
        else:
            # Clamp so huge values cannot index past the last suffix
            exponent = min(long(math.log(bytes, 1024.0)), len(suffixes) - 1)
        converted = float(bytes) / float(1024 ** exponent)
        return '%.2f%s' % (converted, suffixes[exponent])

    @staticmethod
    def calc_percent(byte_counter, data_len):
        """Return the completed percentage as a 6-character string.

        '---.-%' is returned when the total length is unknown or zero.
        """
        if data_len is None:
            return '---.-%'
        total = float(data_len)
        if total == 0.0:
            # A zero Content-Length must not end in a ZeroDivisionError
            return '---.-%'
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / total * 100.0))

    @staticmethod
    def calc_eta(start, now, total, current):
        """Return the estimated remaining time as 'MM:SS'.

        '--:--' is returned when no estimate can be made: unknown total,
        nothing downloaded yet, less than a millisecond elapsed, or an
        estimate above 99 minutes.
        """
        if total is None:
            return '--:--'
        dif = now - start
        if current == 0 or dif < 0.001: # One millisecond
            return '--:--'
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        if eta_mins > 99:
            return '--:--'
        return '%02d:%02d' % (eta_mins, eta_secs)

    @staticmethod
    def calc_speed(start, now, bytes):
        """Return the average speed as a right-aligned 10-character string."""
        dif = now - start
        if bytes == 0 or dif < 0.001: # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    @staticmethod
    def best_block_size(elapsed_time, bytes):
        """Choose the size of the next read from the speed of the last one.

        The result stays between half and twice the size of the block
        just read, and never exceeds 4 MB.
        """
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
            return long(new_max)
        rate = bytes / elapsed_time
        if rate > new_max:
            return long(new_max)
        if rate < new_min:
            return long(new_min)
        return long(rate)

    @staticmethod
    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer.

        The string is a decimal number optionally followed by one of the
        suffixes k, M, G, T, P, E, Z, Y (case-insensitive), each a power
        of 1024. None is returned for any other input.
        """
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        if matchobj is None:
            return None
        number = float(matchobj.group(1))
        # An empty suffix is found at index 0, giving a multiplier of 1
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return long(round(number * multiplier))

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        ie.set_downloader(self)

    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        self._pps.append(pp)
        pp.set_downloader(self)

    def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
        """Print message to stdout if not in quiet mode.

        No end of line is written when skip_eol is true. Encoding errors
        are swallowed when ignore_encoding_errors is true.
        """
        try:
            if not self.params.get('quiet', False):
                if skip_eol:
                    terminator = u''
                else:
                    terminator = u'\n'
                # Written directly: a print statement ending in a comma
                # makes the next print emit a stray leading space
                sys.stdout.write((u'%s%s' % (message, terminator)).encode(preferredencoding()))
            sys.stdout.flush()
        except UnicodeEncodeError:
            if not ignore_encoding_errors:
                raise

    def to_stderr(self, message):
        """Print message to stderr."""
        sys.stderr.write(message.encode(preferredencoding()) + '\n')

    def fixed_template(self):
        """Checks if the output template is fixed."""
        return (re.search(u'(?u)%\\(.+?\\)s', self.params['outtmpl']) is None)

    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        self._download_retcode = 1

    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
            return
        elapsed = time.time() - start_time
        if elapsed <= 0.0:
            return
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            # Sleep long enough for the average speed to drop to the limit
            time.sleep((byte_counter - rate_limit * elapsed) / rate_limit)

    def report_destination(self, filename):
        """Report destination filename."""
        self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        if self.params.get('noprogress', False):
            return
        self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

    def report_resuming_byte(self, resume_len):
        """Report attempt to resume at given byte."""
        self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)

    def report_retry(self, count, retries):
        """Report retry in case of HTTP error 503"""
        self.to_stdout(u'[download] Got HTTP error 503. Retrying (attempt %d of %d)...' % (count, retries))

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        try:
            self.to_stdout(u'[download] %s has already been downloaded' % file_name)
        except UnicodeEncodeError:
            # The name cannot be shown in this console; leave it out
            self.to_stdout(u'[download] The file has already been downloaded')

    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_stdout(u'[download] Unable to resume')

    def report_finish(self):
        """Report download finished."""
        if self.params.get('noprogress', False):
            self.to_stdout(u'[download] Download completed')
        else:
            # Just terminate the progress line
            self.to_stdout(u'')

    def increment_downloads(self):
        """Increment the ordinal that assigns a number to each file."""
        self._num_downloads += 1

    def _print_forced(self, text):
        """Write one forced-printing field to stdout on a line of its own."""
        sys.stdout.write(text.encode(preferredencoding(), 'xmlcharrefreplace') + '\n')

    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor.

        In simulate mode only the forced printings take place. Otherwise
        the output file name is built from the template, the video is
        downloaded and the postprocessing chain is run on the result.
        Problems are reported through trouble(); UnavailableVideoError is
        raised when the download fails with an IOError or OSError.
        """
        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            # Forced printings
            if self.params.get('forcetitle', False):
                self._print_forced(info_dict['title'])
            if self.params.get('forceurl', False):
                self._print_forced(info_dict['url'])
            if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                self._print_forced(info_dict['thumbnail'])
            if self.params.get('forcedescription', False) and 'description' in info_dict:
                self._print_forced(info_dict['description'])

            return

        try:
            template_dict = dict(info_dict)
            template_dict['epoch'] = unicode(long(time.time()))
            template_dict['ord'] = unicode('%05d' % self._num_downloads)
            filename = self.params['outtmpl'] % template_dict
        except (ValueError, KeyError):
            err = sys.exc_info()[1]
            self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
            # Without a file name there is nothing left to do. This point
            # is reached when errors are being ignored.
            return
        if self.params.get('nooverwrites', False) and os.path.exists(filename):
            self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
            return

        try:
            self.pmkdir(filename)
        except (OSError, IOError):
            err = sys.exc_info()[1]
            self.trouble('ERROR: unable to create directories: %s' % str(err))
            return

        try:
            success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
        except (OSError, IOError):
            raise UnavailableVideoError
        except (urllib2.URLError, httplib.HTTPException, socket.error):
            err = sys.exc_info()[1]
            self.trouble('ERROR: unable to download video data: %s' % str(err))
            return
        except (ContentTooShortError, ):
            err = sys.exc_info()[1]
            self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
            return

        if success:
            try:
                self.post_process(filename, info_dict)
            except PostProcessingError:
                err = sys.exc_info()[1]
                self.trouble('ERROR: postprocessing: %s' % str(err))
                return

    def download(self, url_list):
        """Download a given list of URLs.

        Returns the accumulated return code: 0, or 1 if some problem was
        reported while errors were being ignored. SameFileError is raised
        when several URLs would be written to one fixed file name.
        """
        if len(url_list) > 1 and self.fixed_template():
            raise SameFileError(self.params['outtmpl'])

        for url in url_list:
            suitable_found = False
            for ie in self._ies:
                # Go to next InfoExtractor if not suitable
                if not ie.suitable(url):
                    continue

                # Suitable InfoExtractor found
                suitable_found = True

                # Extract information from URL and process it
                ie.extract(url)

                # Suitable InfoExtractor had been found; go to next URL
                break

            if not suitable_found:
                self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

        return self._download_retcode

    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file.

        Each post processor receives the dictionary returned by the
        previous one; the chain stops as soon as one returns None.
        """
        info = dict(ie_info)
        info['filepath'] = filename
        for pp in self._pps:
            info = pp.run(info)
            if info is None:
                break

    def _download_with_rtmpdump(self, filename, url, player_url):
        """Download an RTMP stream with the external rtmpdump program.

        Returns True on success and False otherwise.
        """
        self.report_destination(filename)

        # Check for rtmpdump first
        try:
            devnull = open(os.path.devnull, 'w')
            try:
                subprocess.call(['rtmpdump', '-h'], stdout=devnull, stderr=subprocess.STDOUT)
            finally:
                devnull.close()
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
            return False

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrupted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q']
        if player_url is not None:
            basic_args += ['-W', player_url]
        basic_args += ['-r', url, '-o', filename]

        first_args = list(basic_args)
        if self.params.get('continuedl', False):
            first_args += ['-e', '-k', '1']
        retval = subprocess.call(first_args)
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(filename)
            self.to_stdout(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            resume_args = basic_args + ['-e']
            if retval == 1:
                resume_args += ['-k', '1']
            retval = subprocess.call(resume_args)
            cursize = os.path.getsize(filename)
            # No progress and still failing: stop trying
            if prevsize == cursize and retval == 1:
                break
        if retval == 0:
            self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
            return True
        else:
            self.trouble('\nERROR: rtmpdump exited with code %d' % retval)
            return False

    def _do_download(self, filename, url, player_url):
        """Download url into filename, resuming if requested and possible.

        Returns True when the file is complete and False when the problem
        was already reported through trouble(). ContentTooShortError is
        raised when fewer bytes than announced were received; unexpected
        HTTP errors and I/O errors from the connection propagate.
        """
        # Attempt to download using rtmpdump
        if url.startswith('rtmp'):
            return self._download_with_rtmpdump(filename, url, player_url)

        stream = None
        open_mode = 'wb'
        basic_request = urllib2.Request(url, None, std_headers)
        request = urllib2.Request(url, None, std_headers)

        # Establish possible resume length
        if os.path.isfile(filename):
            resume_len = os.path.getsize(filename)
        else:
            resume_len = 0

        # Request parameters in case of being able to resume
        if self.params.get('continuedl', False) and resume_len != 0:
            self.report_resuming_byte(resume_len)
            request.add_header('Range','bytes=%d-' % resume_len)
            open_mode = 'ab'

        count = 0
        retries = self.params.get('retries', 0)
        while count <= retries:
            # Establish connection
            try:
                data = urllib2.urlopen(request)
                break
            except (urllib2.HTTPError, ):
                err = sys.exc_info()[1]
                if err.code != 503 and err.code != 416:
                    # Unexpected HTTP error
                    raise
                elif err.code == 416:
                    # Unable to resume (requested range not satisfiable)
                    try:
                        # Open the connection again without the range header
                        data = urllib2.urlopen(basic_request)
                        # get(): a missing header must give None, not KeyError
                        content_length = data.info().get('Content-Length', None)
                    except (urllib2.HTTPError, ):
                        err = sys.exc_info()[1]
                        if err.code != 503:
                            raise
                    else:
                        # Examine the reported length
                        if (content_length is not None and
                            (resume_len - 100 < long(content_length) < resume_len + 100)):
                            # The file had already been fully downloaded.
                            # Explanation to the above condition: in issue #175 it was revealed that
                            # YouTube sometimes adds or removes a few bytes from the end of the file,
                            # changing the file size slightly and causing problems for some users. So
                            # I decided to implement a suggested change and consider the file
                            # completely downloaded if the file size differs less than 100 bytes from
                            # the one in the hard drive.
                            data.close()
                            self.report_file_already_downloaded(filename)
                            return True
                        else:
                            # The length does not match, we start the download over
                            self.report_unable_to_resume()
                            open_mode = 'wb'
                            break
            # Retry
            count += 1
            if count <= retries:
                self.report_retry(count, retries)

        if count > retries:
            self.trouble(u'ERROR: giving up after %s retries' % retries)
            return False

        data_len = data.info().get('Content-length', None)
        data_len_str = self.format_bytes(data_len)
        byte_counter = 0
        block_size = 1024
        start = time.time()
        try:
            while True:
                # Download and write
                before = time.time()
                data_block = data.read(block_size)
                after = time.time()
                data_block_len = len(data_block)
                if data_block_len == 0:
                    break
                byte_counter += data_block_len

                # Open file just in time
                if stream is None:
                    try:
                        (stream, filename) = sanitize_open(filename, open_mode)
                        self.report_destination(filename)
                    except (OSError, IOError):
                        err = sys.exc_info()[1]
                        self.trouble('ERROR: unable to open for writing: %s' % str(err))
                        return False
                try:
                    stream.write(data_block)
                except (IOError, OSError):
                    err = sys.exc_info()[1]
                    self.trouble('\nERROR: unable to write data: %s' % str(err))
                    # The file is incomplete; do not go on and report success
                    return False
                block_size = self.best_block_size(after - before, data_block_len)

                # Progress message
                percent_str = self.calc_percent(byte_counter, data_len)
                eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
                speed_str = self.calc_speed(start, time.time(), byte_counter)
                self.report_progress(percent_str, data_len_str, speed_str, eta_str)

                # Apply rate limit
                self.slow_down(start, byte_counter)
        finally:
            # Release the file and the connection on every way out; the
            # stream may be sys.stdout, which must stay open
            if stream is not None and stream is not sys.stdout:
                stream.close()
            data.close()

        self.report_finish()
        # The header value is a string, hence the string comparison
        if data_len is not None and str(byte_counter) != data_len:
            raise ContentTooShortError(byte_counter, long(data_len))
        return True
612
class InfoExtractor(object):
    """Base class of the information extractors.

    An information extractor takes a URL and obtains the data of the
    video (or videos) it refers to: the real video URL, the title and
    simplified title, the uploader and so on. That data is put in a
    dictionary and handed over to the FileDownloader, which processes it
    and possibly downloads the video to the file system. Every dictionary
    must contain these fields:

    id:          Video identifier.
    url:         Final video URL.
    uploader:    Nickname of the video uploader.
    title:       Literal title.
    stitle:      Simplified title.
    ext:         Video filename extension.
    format:      Video format.
    player_url:  SWF Player URL (may be None).

    These fields are optional. They exist mainly so that youtube-dl can
    act as the backend of a video search function, such as the one in
    youtube2mp3, and are only used by the forced printing options:

    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses are expected to redefine _real_initialize() and
    _real_extract(), as well as the suitable() static method, and they
    will normally be instantiated and added to the main downloader.
    """

    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Create the extractor, optionally attached to a downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @staticmethod
    def suitable(url):
        """Tell whether this extractor can handle url; never for the base class."""
        return False

    def initialize(self):
        """Run the real initialization (authentication, etc) only once."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then return the result of _real_extract(url)."""
        self.initialize()
        result = self._real_extract(url)
        return result

    def set_downloader(self, downloader):
        """Remember the downloader this extractor reports to."""
        self._downloader = downloader

    def _real_initialize(self):
        """Hook for the actual initialization. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Hook for the actual extraction. Redefine in subclasses."""
        pass
683
684 class YoutubeIE(InfoExtractor):
685 """Information extractor for youtube.com."""
686
687 _VALID_URL = r'^((?:http://)?(?:youtu\.be/|(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?[\?#](?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
688 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
689 _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
690 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
691 _NETRC_MACHINE = 'youtube'
692 # Listed in order of quality
693 _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
694 _video_extensions = {
695 '13': '3gp',
696 '17': 'mp4',
697 '18': 'mp4',
698 '22': 'mp4',
699 '37': 'mp4',
700 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
701 '43': 'webm',
702 '45': 'webm',
703 }
704
@staticmethod
def suitable(url):
    """Tell whether url matches the YoutubeIE._VALID_URL pattern."""
    matched = re.match(YoutubeIE._VALID_URL, url)
    return matched is not None
708
def report_lang(self):
    """Tell the user the language is about to be set."""
    notice = u'[youtube] Setting language'
    self._downloader.to_stdout(notice)
712
def report_login(self):
    """Tell the user a login is about to be attempted."""
    notice = u'[youtube] Logging in'
    self._downloader.to_stdout(notice)
716
def report_age_confirmation(self):
    """Tell the user the age is about to be confirmed."""
    notice = u'[youtube] Confirming age'
    self._downloader.to_stdout(notice)
720
def report_video_webpage_download(self, video_id):
    """Tell the user the webpage of the given video is being downloaded."""
    notice = u'[youtube] %s: Downloading video webpage' % video_id
    self._downloader.to_stdout(notice)
724
def report_video_info_webpage_download(self, video_id):
    """Tell the user the info webpage of the given video is being downloaded."""
    notice = u'[youtube] %s: Downloading video info webpage' % video_id
    self._downloader.to_stdout(notice)
728
def report_information_extraction(self, video_id):
    """Tell the user the information of the given video is being extracted."""
    notice = u'[youtube] %s: Extracting video information' % video_id
    self._downloader.to_stdout(notice)
732
def report_unavailable_format(self, video_id, format):
    """Tell the user the given format is not available for the given video."""
    notice = u'[youtube] %s: Format %s not available' % (video_id, format)
    self._downloader.to_stdout(notice)
736
def report_rtmp_download(self):
    """Tell the user the download is going to use the RTMP protocol."""
    notice = u'[youtube] RTMP download detected'
    self._downloader.to_stdout(notice)
740
741 def _real_initialize(self):
742 if self._downloader is None:
743 return
744
745 username = None
746 password = None
747 downloader_params = self._downloader.params
748
749 # Attempt to use provided username and password or .netrc data
750 if downloader_params.get('username', None) is not None:
751 username = downloader_params['username']
752 password = downloader_params['password']
753 elif downloader_params.get('usenetrc', False):
754 try:
755 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
756 if info is not None:
757 username = info[0]
758 password = info[2]
759 else:
760 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
761 except (IOError, netrc.NetrcParseError), err:
762 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
763 return
764
765 # Set language
766 request = urllib2.Request(self._LANG_URL, None, std_headers)
767 try:
768 self.report_lang()
769 urllib2.urlopen(request).read()
770 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
771 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
772 return
773
774 # No authentication to be performed
775 if username is None:
776 return
777
778 # Log in
779 login_form = {
780 'current_form': 'loginForm',
781 'next': '/',
782 'action_login': 'Log In',
783 'username': username,
784 'password': password,
785 }
786 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
787 try:
788 self.report_login()
789 login_results = urllib2.urlopen(request).read()
790 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
791 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
792 return
793 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
794 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
795 return
796
797 # Confirm age
798 age_form = {
799 'next_url': '/',
800 'action_confirm': 'Confirm',
801 }
802 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
803 try:
804 self.report_age_confirmation()
805 age_results = urllib2.urlopen(request).read()
806 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
807 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
808 return
809
810 def _real_extract(self, url):
811 # Extract video id from URL
812 mobj = re.match(self._VALID_URL, url)
813 if mobj is None:
814 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
815 return
816 video_id = mobj.group(2)
817
818 # Get video webpage
819 self.report_video_webpage_download(video_id)
820 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id, None, std_headers)
821 try:
822 video_webpage = urllib2.urlopen(request).read()
823 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
824 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
825 return
826
827 # Attempt to extract SWF player URL
828 mobj = re.search(r'swfConfig.*"(http://.*?watch.*?-.*?\.swf)"', video_webpage)
829 if mobj is not None:
830 player_url = mobj.group(1)
831 else:
832 player_url = None
833
834 # Get video info
835 self.report_video_info_webpage_download(video_id)
836 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
837 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
838 % (video_id, el_type))
839 request = urllib2.Request(video_info_url, None, std_headers)
840 try:
841 video_info_webpage = urllib2.urlopen(request).read()
842 video_info = parse_qs(video_info_webpage)
843 if 'token' in video_info:
844 break
845 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
846 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
847 return
848 if 'token' not in video_info:
849 if 'reason' in video_info:
850 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
851 else:
852 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
853 return
854
855 # Start extracting information
856 self.report_information_extraction(video_id)
857
858 # uploader
859 if 'author' not in video_info:
860 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
861 return
862 video_uploader = urllib.unquote_plus(video_info['author'][0])
863
864 # title
865 if 'title' not in video_info:
866 self._downloader.trouble(u'ERROR: unable to extract video title')
867 return
868 video_title = urllib.unquote_plus(video_info['title'][0])
869 video_title = video_title.decode('utf-8')
870 video_title = sanitize_title(video_title)
871
872 # simplified title
873 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
874 simple_title = simple_title.strip(ur'_')
875
876 # thumbnail image
877 if 'thumbnail_url' not in video_info:
878 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
879 video_thumbnail = ''
880 else: # don't panic if we can't find it
881 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
882
883 # description
884 video_description = 'No description available.'
885 if self._downloader.params.get('forcedescription', False):
886 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
887 if mobj is not None:
888 video_description = mobj.group(1)
889
890 # token
891 video_token = urllib.unquote_plus(video_info['token'][0])
892
893 # Decide which formats to download
894 requested_format = self._downloader.params.get('format', None)
895 get_video_template = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=&ps=&asv=&fmt=%%s' % (video_id, video_token)
896
897 if 'fmt_url_map' in video_info:
898 url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
899 format_limit = self._downloader.params.get('format_limit', None)
900 if format_limit is not None and format_limit in self._available_formats:
901 format_list = self._available_formats[self._available_formats.index(format_limit):]
902 else:
903 format_list = self._available_formats
904 existing_formats = [x for x in format_list if x in url_map]
905 if len(existing_formats) == 0:
906 self._downloader.trouble(u'ERROR: no known formats available for video')
907 return
908 if requested_format is None:
909 video_url_list = [(existing_formats[0], get_video_template % existing_formats[0])] # Best quality
910 elif requested_format == '-1':
911 video_url_list = [(f, get_video_template % f) for f in existing_formats] # All formats
912 else:
913 video_url_list = [(requested_format, get_video_template % requested_format)] # Specific format
914
915 elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
916 self.report_rtmp_download()
917 video_url_list = [(None, video_info['conn'][0])]
918
919 else:
920 self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
921 return
922
923 for format_param, video_real_url in video_url_list:
924 # At this point we have a new video
925 self._downloader.increment_downloads()
926
927 # Extension
928 video_extension = self._video_extensions.get(format_param, 'flv')
929
930 # Find the video URL in fmt_url_map or conn paramters
931 try:
932 # Process video information
933 self._downloader.process_info({
934 'id': video_id.decode('utf-8'),
935 'url': video_real_url.decode('utf-8'),
936 'uploader': video_uploader.decode('utf-8'),
937 'title': video_title,
938 'stitle': simple_title,
939 'ext': video_extension.decode('utf-8'),
940 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
941 'thumbnail': video_thumbnail.decode('utf-8'),
942 'description': video_description.decode('utf-8'),
943 'player_url': player_url,
944 })
945 except UnavailableVideoError, err:
946 self._downloader.trouble(u'ERROR: unable to download video (format may not be available)')
947
948
949 class MetacafeIE(InfoExtractor):
950 """Information Extractor for metacafe.com."""
951
952 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
953 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
954 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
955 _youtube_ie = None
956
957 def __init__(self, youtube_ie, downloader=None):
958 InfoExtractor.__init__(self, downloader)
959 self._youtube_ie = youtube_ie
960
961 @staticmethod
962 def suitable(url):
963 return (re.match(MetacafeIE._VALID_URL, url) is not None)
964
965 def report_disclaimer(self):
966 """Report disclaimer retrieval."""
967 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
968
969 def report_age_confirmation(self):
970 """Report attempt to confirm age."""
971 self._downloader.to_stdout(u'[metacafe] Confirming age')
972
973 def report_download_webpage(self, video_id):
974 """Report webpage download."""
975 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
976
977 def report_extraction(self, video_id):
978 """Report information extraction."""
979 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
980
981 def _real_initialize(self):
982 # Retrieve disclaimer
983 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
984 try:
985 self.report_disclaimer()
986 disclaimer = urllib2.urlopen(request).read()
987 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
988 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
989 return
990
991 # Confirm age
992 disclaimer_form = {
993 'filters': '0',
994 'submit': "Continue - I'm over 18",
995 }
996 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
997 try:
998 self.report_age_confirmation()
999 disclaimer = urllib2.urlopen(request).read()
1000 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1001 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1002 return
1003
1004 def _real_extract(self, url):
1005 # Extract id and simplified title from URL
1006 mobj = re.match(self._VALID_URL, url)
1007 if mobj is None:
1008 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1009 return
1010
1011 video_id = mobj.group(1)
1012
1013 # Check if video comes from YouTube
1014 mobj2 = re.match(r'^yt-(.*)$', video_id)
1015 if mobj2 is not None:
1016 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1017 return
1018
1019 # At this point we have a new video
1020 self._downloader.increment_downloads()
1021
1022 simple_title = mobj.group(2).decode('utf-8')
1023 video_extension = 'flv'
1024
1025 # Retrieve video webpage to extract further information
1026 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1027 try:
1028 self.report_download_webpage(video_id)
1029 webpage = urllib2.urlopen(request).read()
1030 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1031 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1032 return
1033
1034 # Extract URL, uploader and title from webpage
1035 self.report_extraction(video_id)
1036 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1037 if mobj is None:
1038 self._downloader.trouble(u'ERROR: unable to extract media URL')
1039 return
1040 mediaURL = urllib.unquote(mobj.group(1))
1041
1042 # Extract gdaKey if available
1043 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1044 if mobj is None:
1045 video_url = mediaURL
1046 #self._downloader.trouble(u'ERROR: unable to extract gdaKey')
1047 #return
1048 else:
1049 gdaKey = mobj.group(1)
1050 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1051
1052 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1053 if mobj is None:
1054 self._downloader.trouble(u'ERROR: unable to extract title')
1055 return
1056 video_title = mobj.group(1).decode('utf-8')
1057 video_title = sanitize_title(video_title)
1058
1059 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1060 if mobj is None:
1061 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1062 return
1063 video_uploader = mobj.group(1)
1064
1065 try:
1066 # Process video information
1067 self._downloader.process_info({
1068 'id': video_id.decode('utf-8'),
1069 'url': video_url.decode('utf-8'),
1070 'uploader': video_uploader.decode('utf-8'),
1071 'title': video_title,
1072 'stitle': simple_title,
1073 'ext': video_extension.decode('utf-8'),
1074 'format': u'NA',
1075 'player_url': None,
1076 })
1077 except UnavailableVideoError:
1078 self._downloader.trouble(u'ERROR: unable to download video')
1079
1080
1081 class DailymotionIE(InfoExtractor):
1082 """Information Extractor for Dailymotion"""
1083
1084 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1085
1086 def __init__(self, downloader=None):
1087 InfoExtractor.__init__(self, downloader)
1088
1089 @staticmethod
1090 def suitable(url):
1091 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1092
1093 def report_download_webpage(self, video_id):
1094 """Report webpage download."""
1095 self._downloader.to_stdout(u'[dailymotion] %s: Downloading webpage' % video_id)
1096
1097 def report_extraction(self, video_id):
1098 """Report information extraction."""
1099 self._downloader.to_stdout(u'[dailymotion] %s: Extracting information' % video_id)
1100
1101 def _real_initialize(self):
1102 return
1103
1104 def _real_extract(self, url):
1105 # Extract id and simplified title from URL
1106 mobj = re.match(self._VALID_URL, url)
1107 if mobj is None:
1108 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1109 return
1110
1111 # At this point we have a new video
1112 self._downloader.increment_downloads()
1113 video_id = mobj.group(1)
1114
1115 simple_title = mobj.group(2).decode('utf-8')
1116 video_extension = 'flv'
1117
1118 # Retrieve video webpage to extract further information
1119 request = urllib2.Request(url)
1120 try:
1121 self.report_download_webpage(video_id)
1122 webpage = urllib2.urlopen(request).read()
1123 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1124 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1125 return
1126
1127 # Extract URL, uploader and title from webpage
1128 self.report_extraction(video_id)
1129 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1130 if mobj is None:
1131 self._downloader.trouble(u'ERROR: unable to extract media URL')
1132 return
1133 mediaURL = urllib.unquote(mobj.group(1))
1134
1135 # if needed add http://www.dailymotion.com/ if relative URL
1136
1137 video_url = mediaURL
1138
1139 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1140 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1141 if mobj is None:
1142 self._downloader.trouble(u'ERROR: unable to extract title')
1143 return
1144 video_title = mobj.group(1).decode('utf-8')
1145 video_title = sanitize_title(video_title)
1146
1147 mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a></div>', webpage)
1148 if mobj is None:
1149 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1150 return
1151 video_uploader = mobj.group(1)
1152
1153 try:
1154 # Process video information
1155 self._downloader.process_info({
1156 'id': video_id.decode('utf-8'),
1157 'url': video_url.decode('utf-8'),
1158 'uploader': video_uploader.decode('utf-8'),
1159 'title': video_title,
1160 'stitle': simple_title,
1161 'ext': video_extension.decode('utf-8'),
1162 'format': u'NA',
1163 'player_url': None,
1164 })
1165 except UnavailableVideoError:
1166 self._downloader.trouble(u'ERROR: unable to download video')
1167
1168 class GoogleIE(InfoExtractor):
1169 """Information extractor for video.google.com."""
1170
1171 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1172
1173 def __init__(self, downloader=None):
1174 InfoExtractor.__init__(self, downloader)
1175
1176 @staticmethod
1177 def suitable(url):
1178 return (re.match(GoogleIE._VALID_URL, url) is not None)
1179
1180 def report_download_webpage(self, video_id):
1181 """Report webpage download."""
1182 self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
1183
1184 def report_extraction(self, video_id):
1185 """Report information extraction."""
1186 self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
1187
1188 def _real_initialize(self):
1189 return
1190
1191 def _real_extract(self, url):
1192 # Extract id from URL
1193 mobj = re.match(self._VALID_URL, url)
1194 if mobj is None:
1195 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1196 return
1197
1198 # At this point we have a new video
1199 self._downloader.increment_downloads()
1200 video_id = mobj.group(1)
1201
1202 video_extension = 'mp4'
1203
1204 # Retrieve video webpage to extract further information
1205 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1206 try:
1207 self.report_download_webpage(video_id)
1208 webpage = urllib2.urlopen(request).read()
1209 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1210 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1211 return
1212
1213 # Extract URL, uploader, and title from webpage
1214 self.report_extraction(video_id)
1215 mobj = re.search(r"download_url:'([^']+)'", webpage)
1216 if mobj is None:
1217 video_extension = 'flv'
1218 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1219 if mobj is None:
1220 self._downloader.trouble(u'ERROR: unable to extract media URL')
1221 return
1222 mediaURL = urllib.unquote(mobj.group(1))
1223 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1224 mediaURL = mediaURL.replace('\\x26', '\x26')
1225
1226 video_url = mediaURL
1227
1228 mobj = re.search(r'<title>(.*)</title>', webpage)
1229 if mobj is None:
1230 self._downloader.trouble(u'ERROR: unable to extract title')
1231 return
1232 video_title = mobj.group(1).decode('utf-8')
1233 video_title = sanitize_title(video_title)
1234 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1235
1236 # Extract video description
1237 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1238 if mobj is None:
1239 self._downloader.trouble(u'ERROR: unable to extract video description')
1240 return
1241 video_description = mobj.group(1).decode('utf-8')
1242 if not video_description:
1243 video_description = 'No description available.'
1244
1245 # Extract video thumbnail
1246 if self._downloader.params.get('forcethumbnail', False):
1247 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1248 try:
1249 webpage = urllib2.urlopen(request).read()
1250 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1251 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1252 return
1253 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1254 if mobj is None:
1255 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1256 return
1257 video_thumbnail = mobj.group(1)
1258 else: # we need something to pass to process_info
1259 video_thumbnail = ''
1260
1261
1262 try:
1263 # Process video information
1264 self._downloader.process_info({
1265 'id': video_id.decode('utf-8'),
1266 'url': video_url.decode('utf-8'),
1267 'uploader': u'NA',
1268 'title': video_title,
1269 'stitle': simple_title,
1270 'ext': video_extension.decode('utf-8'),
1271 'format': u'NA',
1272 'player_url': None,
1273 })
1274 except UnavailableVideoError:
1275 self._downloader.trouble(u'ERROR: unable to download video')
1276
1277
1278 class PhotobucketIE(InfoExtractor):
1279 """Information extractor for photobucket.com."""
1280
1281 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1282
1283 def __init__(self, downloader=None):
1284 InfoExtractor.__init__(self, downloader)
1285
1286 @staticmethod
1287 def suitable(url):
1288 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1289
1290 def report_download_webpage(self, video_id):
1291 """Report webpage download."""
1292 self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1293
1294 def report_extraction(self, video_id):
1295 """Report information extraction."""
1296 self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
1297
1298 def _real_initialize(self):
1299 return
1300
1301 def _real_extract(self, url):
1302 # Extract id from URL
1303 mobj = re.match(self._VALID_URL, url)
1304 if mobj is None:
1305 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1306 return
1307
1308 # At this point we have a new video
1309 self._downloader.increment_downloads()
1310 video_id = mobj.group(1)
1311
1312 video_extension = 'flv'
1313
1314 # Retrieve video webpage to extract further information
1315 request = urllib2.Request(url)
1316 try:
1317 self.report_download_webpage(video_id)
1318 webpage = urllib2.urlopen(request).read()
1319 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1320 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1321 return
1322
1323 # Extract URL, uploader, and title from webpage
1324 self.report_extraction(video_id)
1325 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1326 if mobj is None:
1327 self._downloader.trouble(u'ERROR: unable to extract media URL')
1328 return
1329 mediaURL = urllib.unquote(mobj.group(1))
1330
1331 video_url = mediaURL
1332
1333 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1334 if mobj is None:
1335 self._downloader.trouble(u'ERROR: unable to extract title')
1336 return
1337 video_title = mobj.group(1).decode('utf-8')
1338 video_title = sanitize_title(video_title)
1339 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1340
1341 video_uploader = mobj.group(2).decode('utf-8')
1342
1343 try:
1344 # Process video information
1345 self._downloader.process_info({
1346 'id': video_id.decode('utf-8'),
1347 'url': video_url.decode('utf-8'),
1348 'uploader': video_uploader,
1349 'title': video_title,
1350 'stitle': simple_title,
1351 'ext': video_extension.decode('utf-8'),
1352 'format': u'NA',
1353 'player_url': None,
1354 })
1355 except UnavailableVideoError:
1356 self._downloader.trouble(u'ERROR: unable to download video')
1357
1358
1359 class YahooIE(InfoExtractor):
1360 """Information extractor for video.yahoo.com."""
1361
1362 # _VALID_URL matches all Yahoo! Video URLs
1363 # _VPAGE_URL matches only the extractable '/watch/' URLs
1364 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1365 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1366
1367 def __init__(self, downloader=None):
1368 InfoExtractor.__init__(self, downloader)
1369
1370 @staticmethod
1371 def suitable(url):
1372 return (re.match(YahooIE._VALID_URL, url) is not None)
1373
1374 def report_download_webpage(self, video_id):
1375 """Report webpage download."""
1376 self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)
1377
1378 def report_extraction(self, video_id):
1379 """Report information extraction."""
1380 self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)
1381
1382 def _real_initialize(self):
1383 return
1384
1385 def _real_extract(self, url, new_video=True):
1386 # Extract ID from URL
1387 mobj = re.match(self._VALID_URL, url)
1388 if mobj is None:
1389 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1390 return
1391
1392 # At this point we have a new video
1393 self._downloader.increment_downloads()
1394 video_id = mobj.group(2)
1395 video_extension = 'flv'
1396
1397 # Rewrite valid but non-extractable URLs as
1398 # extractable English language /watch/ URLs
1399 if re.match(self._VPAGE_URL, url) is None:
1400 request = urllib2.Request(url)
1401 try:
1402 webpage = urllib2.urlopen(request).read()
1403 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1404 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1405 return
1406
1407 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1408 if mobj is None:
1409 self._downloader.trouble(u'ERROR: Unable to extract id field')
1410 return
1411 yahoo_id = mobj.group(1)
1412
1413 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1414 if mobj is None:
1415 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1416 return
1417 yahoo_vid = mobj.group(1)
1418
1419 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1420 return self._real_extract(url, new_video=False)
1421
1422 # Retrieve video webpage to extract further information
1423 request = urllib2.Request(url)
1424 try:
1425 self.report_download_webpage(video_id)
1426 webpage = urllib2.urlopen(request).read()
1427 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1428 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1429 return
1430
1431 # Extract uploader and title from webpage
1432 self.report_extraction(video_id)
1433 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1434 if mobj is None:
1435 self._downloader.trouble(u'ERROR: unable to extract video title')
1436 return
1437 video_title = mobj.group(1).decode('utf-8')
1438 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1439
1440 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1441 if mobj is None:
1442 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1443 return
1444 video_uploader = mobj.group(1).decode('utf-8')
1445
1446 # Extract video thumbnail
1447 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1448 if mobj is None:
1449 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1450 return
1451 video_thumbnail = mobj.group(1).decode('utf-8')
1452
1453 # Extract video description
1454 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1455 if mobj is None:
1456 self._downloader.trouble(u'ERROR: unable to extract video description')
1457 return
1458 video_description = mobj.group(1).decode('utf-8')
1459 if not video_description: video_description = 'No description available.'
1460
1461 # Extract video height and width
1462 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1463 if mobj is None:
1464 self._downloader.trouble(u'ERROR: unable to extract video height')
1465 return
1466 yv_video_height = mobj.group(1)
1467
1468 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1469 if mobj is None:
1470 self._downloader.trouble(u'ERROR: unable to extract video width')
1471 return
1472 yv_video_width = mobj.group(1)
1473
1474 # Retrieve video playlist to extract media URL
1475 # I'm not completely sure what all these options are, but we
1476 # seem to need most of them, otherwise the server sends a 401.
1477 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1478 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1479 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1480 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1481 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1482 try:
1483 self.report_download_webpage(video_id)
1484 webpage = urllib2.urlopen(request).read()
1485 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1486 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1487 return
1488
1489 # Extract media URL from playlist XML
1490 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1491 if mobj is None:
1492 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1493 return
1494 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1495 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1496
1497 try:
1498 # Process video information
1499 self._downloader.process_info({
1500 'id': video_id.decode('utf-8'),
1501 'url': video_url,
1502 'uploader': video_uploader,
1503 'title': video_title,
1504 'stitle': simple_title,
1505 'ext': video_extension.decode('utf-8'),
1506 'thumbnail': video_thumbnail.decode('utf-8'),
1507 'description': video_description,
1508 'thumbnail': video_thumbnail,
1509 'description': video_description,
1510 'player_url': None,
1511 })
1512 except UnavailableVideoError:
1513 self._downloader.trouble(u'ERROR: unable to download video')
1514
1515
1516 class GenericIE(InfoExtractor):
1517 """Generic last-resort information extractor."""
1518
1519 def __init__(self, downloader=None):
1520 InfoExtractor.__init__(self, downloader)
1521
1522 @staticmethod
1523 def suitable(url):
1524 return True
1525
1526 def report_download_webpage(self, video_id):
1527 """Report webpage download."""
1528 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1529 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1530
1531 def report_extraction(self, video_id):
1532 """Report information extraction."""
1533 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1534
1535 def _real_initialize(self):
1536 return
1537
1538 def _real_extract(self, url):
1539 # At this point we have a new video
1540 self._downloader.increment_downloads()
1541
1542 video_id = url.split('/')[-1]
1543 request = urllib2.Request(url)
1544 try:
1545 self.report_download_webpage(video_id)
1546 webpage = urllib2.urlopen(request).read()
1547 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1548 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1549 return
1550 except ValueError, err:
1551 # since this is the last-resort InfoExtractor, if
1552 # this error is thrown, it'll be thrown here
1553 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1554 return
1555
1556 # Start with something easy: JW Player in SWFObject
1557 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1558 if mobj is None:
1559 # Broaden the search a little bit
1560 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1561 if mobj is None:
1562 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1563 return
1564
1565 # It's possible that one of the regexes
1566 # matched, but returned an empty group:
1567 if mobj.group(1) is None:
1568 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1569 return
1570
1571 video_url = urllib.unquote(mobj.group(1))
1572 video_id = os.path.basename(video_url)
1573
1574 # here's a fun little line of code for you:
1575 video_extension = os.path.splitext(video_id)[1][1:]
1576 video_id = os.path.splitext(video_id)[0]
1577
1578 # it's tempting to parse this further, but you would
1579 # have to take into account all the variations like
1580 # Video Title - Site Name
1581 # Site Name | Video Title
1582 # Video Title - Tagline | Site Name
1583 # and so on and so forth; it's just not practical
1584 mobj = re.search(r'<title>(.*)</title>', webpage)
1585 if mobj is None:
1586 self._downloader.trouble(u'ERROR: unable to extract title')
1587 return
1588 video_title = mobj.group(1).decode('utf-8')
1589 video_title = sanitize_title(video_title)
1590 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1591
1592 # video uploader is domain name
1593 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1594 if mobj is None:
1595 self._downloader.trouble(u'ERROR: unable to extract title')
1596 return
1597 video_uploader = mobj.group(1).decode('utf-8')
1598
1599 try:
1600 # Process video information
1601 self._downloader.process_info({
1602 'id': video_id.decode('utf-8'),
1603 'url': video_url.decode('utf-8'),
1604 'uploader': video_uploader,
1605 'title': video_title,
1606 'stitle': simple_title,
1607 'ext': video_extension.decode('utf-8'),
1608 'format': u'NA',
1609 'player_url': None,
1610 })
1611 except UnavailableVideoError, err:
1612 self._downloader.trouble(u'ERROR: unable to download video')
1613
1614
1615 class YoutubeSearchIE(InfoExtractor):
1616 """Information Extractor for YouTube search queries."""
1617 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1618 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1619 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1620 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1621 _youtube_ie = None
1622 _max_youtube_results = 1000
1623
1624 def __init__(self, youtube_ie, downloader=None):
1625 InfoExtractor.__init__(self, downloader)
1626 self._youtube_ie = youtube_ie
1627
1628 @staticmethod
1629 def suitable(url):
1630 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1631
1632 def report_download_page(self, query, pagenum):
1633 """Report attempt to download playlist page with given number."""
1634 query = query.decode(preferredencoding())
1635 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1636
1637 def _real_initialize(self):
1638 self._youtube_ie.initialize()
1639
1640 def _real_extract(self, query):
1641 mobj = re.match(self._VALID_QUERY, query)
1642 if mobj is None:
1643 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1644 return
1645
1646 prefix, query = query.split(':')
1647 prefix = prefix[8:]
1648 query = query.encode('utf-8')
1649 if prefix == '':
1650 self._download_n_results(query, 1)
1651 return
1652 elif prefix == 'all':
1653 self._download_n_results(query, self._max_youtube_results)
1654 return
1655 else:
1656 try:
1657 n = long(prefix)
1658 if n <= 0:
1659 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1660 return
1661 elif n > self._max_youtube_results:
1662 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1663 n = self._max_youtube_results
1664 self._download_n_results(query, n)
1665 return
1666 except ValueError: # parsing prefix as integer fails
1667 self._download_n_results(query, 1)
1668 return
1669
1670 def _download_n_results(self, query, n):
1671 """Downloads a specified number of results for a query"""
1672
1673 video_ids = []
1674 already_seen = set()
1675 pagenum = 1
1676
1677 while True:
1678 self.report_download_page(query, pagenum)
1679 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1680 request = urllib2.Request(result_url, None, std_headers)
1681 try:
1682 page = urllib2.urlopen(request).read()
1683 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1684 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1685 return
1686
1687 # Extract video identifiers
1688 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1689 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1690 if video_id not in already_seen:
1691 video_ids.append(video_id)
1692 already_seen.add(video_id)
1693 if len(video_ids) == n:
1694 # Specified n videos reached
1695 for id in video_ids:
1696 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1697 return
1698
1699 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1700 for id in video_ids:
1701 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1702 return
1703
1704 pagenum = pagenum + 1
1705
1706 class GoogleSearchIE(InfoExtractor):
1707 """Information Extractor for Google Video search queries."""
1708 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1709 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1710 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1711 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1712 _google_ie = None
1713 _max_google_results = 1000
1714
1715 def __init__(self, google_ie, downloader=None):
1716 InfoExtractor.__init__(self, downloader)
1717 self._google_ie = google_ie
1718
1719 @staticmethod
1720 def suitable(url):
1721 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1722
1723 def report_download_page(self, query, pagenum):
1724 """Report attempt to download playlist page with given number."""
1725 query = query.decode(preferredencoding())
1726 self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1727
1728 def _real_initialize(self):
1729 self._google_ie.initialize()
1730
1731 def _real_extract(self, query):
1732 mobj = re.match(self._VALID_QUERY, query)
1733 if mobj is None:
1734 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1735 return
1736
1737 prefix, query = query.split(':')
1738 prefix = prefix[8:]
1739 query = query.encode('utf-8')
1740 if prefix == '':
1741 self._download_n_results(query, 1)
1742 return
1743 elif prefix == 'all':
1744 self._download_n_results(query, self._max_google_results)
1745 return
1746 else:
1747 try:
1748 n = long(prefix)
1749 if n <= 0:
1750 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1751 return
1752 elif n > self._max_google_results:
1753 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1754 n = self._max_google_results
1755 self._download_n_results(query, n)
1756 return
1757 except ValueError: # parsing prefix as integer fails
1758 self._download_n_results(query, 1)
1759 return
1760
1761 def _download_n_results(self, query, n):
1762 """Downloads a specified number of results for a query"""
1763
1764 video_ids = []
1765 already_seen = set()
1766 pagenum = 1
1767
1768 while True:
1769 self.report_download_page(query, pagenum)
1770 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1771 request = urllib2.Request(result_url, None, std_headers)
1772 try:
1773 page = urllib2.urlopen(request).read()
1774 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1775 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1776 return
1777
1778 # Extract video identifiers
1779 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1780 video_id = mobj.group(1)
1781 if video_id not in already_seen:
1782 video_ids.append(video_id)
1783 already_seen.add(video_id)
1784 if len(video_ids) == n:
1785 # Specified n videos reached
1786 for id in video_ids:
1787 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1788 return
1789
1790 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1791 for id in video_ids:
1792 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1793 return
1794
1795 pagenum = pagenum + 1
1796
1797 class YahooSearchIE(InfoExtractor):
1798 """Information Extractor for Yahoo! Video search queries."""
1799 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1800 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1801 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1802 _MORE_PAGES_INDICATOR = r'\s*Next'
1803 _yahoo_ie = None
1804 _max_yahoo_results = 1000
1805
1806 def __init__(self, yahoo_ie, downloader=None):
1807 InfoExtractor.__init__(self, downloader)
1808 self._yahoo_ie = yahoo_ie
1809
1810 @staticmethod
1811 def suitable(url):
1812 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1813
1814 def report_download_page(self, query, pagenum):
1815 """Report attempt to download playlist page with given number."""
1816 query = query.decode(preferredencoding())
1817 self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1818
1819 def _real_initialize(self):
1820 self._yahoo_ie.initialize()
1821
1822 def _real_extract(self, query):
1823 mobj = re.match(self._VALID_QUERY, query)
1824 if mobj is None:
1825 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1826 return
1827
1828 prefix, query = query.split(':')
1829 prefix = prefix[8:]
1830 query = query.encode('utf-8')
1831 if prefix == '':
1832 self._download_n_results(query, 1)
1833 return
1834 elif prefix == 'all':
1835 self._download_n_results(query, self._max_yahoo_results)
1836 return
1837 else:
1838 try:
1839 n = long(prefix)
1840 if n <= 0:
1841 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1842 return
1843 elif n > self._max_yahoo_results:
1844 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1845 n = self._max_yahoo_results
1846 self._download_n_results(query, n)
1847 return
1848 except ValueError: # parsing prefix as integer fails
1849 self._download_n_results(query, 1)
1850 return
1851
1852 def _download_n_results(self, query, n):
1853 """Downloads a specified number of results for a query"""
1854
1855 video_ids = []
1856 already_seen = set()
1857 pagenum = 1
1858
1859 while True:
1860 self.report_download_page(query, pagenum)
1861 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1862 request = urllib2.Request(result_url, None, std_headers)
1863 try:
1864 page = urllib2.urlopen(request).read()
1865 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1866 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1867 return
1868
1869 # Extract video identifiers
1870 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1871 video_id = mobj.group(1)
1872 if video_id not in already_seen:
1873 video_ids.append(video_id)
1874 already_seen.add(video_id)
1875 if len(video_ids) == n:
1876 # Specified n videos reached
1877 for id in video_ids:
1878 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1879 return
1880
1881 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1882 for id in video_ids:
1883 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1884 return
1885
1886 pagenum = pagenum + 1
1887
1888 class YoutubePlaylistIE(InfoExtractor):
1889 """Information Extractor for YouTube playlists."""
1890
1891 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1892 _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1893 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1894 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1895 _youtube_ie = None
1896
1897 def __init__(self, youtube_ie, downloader=None):
1898 InfoExtractor.__init__(self, downloader)
1899 self._youtube_ie = youtube_ie
1900
1901 @staticmethod
1902 def suitable(url):
1903 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1904
1905 def report_download_page(self, playlist_id, pagenum):
1906 """Report attempt to download playlist page with given number."""
1907 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1908
1909 def _real_initialize(self):
1910 self._youtube_ie.initialize()
1911
1912 def _real_extract(self, url):
1913 # Extract playlist id
1914 mobj = re.match(self._VALID_URL, url)
1915 if mobj is None:
1916 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1917 return
1918
1919 # Download playlist pages
1920 playlist_id = mobj.group(1)
1921 video_ids = []
1922 pagenum = 1
1923
1924 while True:
1925 self.report_download_page(playlist_id, pagenum)
1926 request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1927 try:
1928 page = urllib2.urlopen(request).read()
1929 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1930 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1931 return
1932
1933 # Extract video identifiers
1934 ids_in_page = []
1935 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1936 if mobj.group(1) not in ids_in_page:
1937 ids_in_page.append(mobj.group(1))
1938 video_ids.extend(ids_in_page)
1939
1940 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1941 break
1942 pagenum = pagenum + 1
1943
1944 playliststart = self._downloader.params.get('playliststart', 1)
1945 playliststart -= 1 #our arrays are zero-based but the playlist is 1-based
1946 if playliststart > 0:
1947 video_ids = video_ids[playliststart:]
1948
1949 for id in video_ids:
1950 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1951 return
1952
1953 class YoutubeUserIE(InfoExtractor):
1954 """Information Extractor for YouTube users."""
1955
1956 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1957 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1958 _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1959 _youtube_ie = None
1960
1961 def __init__(self, youtube_ie, downloader=None):
1962 InfoExtractor.__init__(self, downloader)
1963 self._youtube_ie = youtube_ie
1964
1965 @staticmethod
1966 def suitable(url):
1967 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1968
1969 def report_download_page(self, username):
1970 """Report attempt to download user page."""
1971 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1972
1973 def _real_initialize(self):
1974 self._youtube_ie.initialize()
1975
1976 def _real_extract(self, url):
1977 # Extract username
1978 mobj = re.match(self._VALID_URL, url)
1979 if mobj is None:
1980 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1981 return
1982
1983 # Download user page
1984 username = mobj.group(1)
1985 video_ids = []
1986 pagenum = 1
1987
1988 self.report_download_page(username)
1989 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1990 try:
1991 page = urllib2.urlopen(request).read()
1992 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1993 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1994 return
1995
1996 # Extract video identifiers
1997 ids_in_page = []
1998
1999 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2000 if mobj.group(1) not in ids_in_page:
2001 ids_in_page.append(mobj.group(1))
2002 video_ids.extend(ids_in_page)
2003
2004 playliststart = self._downloader.params.get('playliststart', 1)
2005 playliststart = playliststart-1 #our arrays are zero-based but the playlist is 1-based
2006 if playliststart > 0:
2007 video_ids = video_ids[playliststart:]
2008
2009 for id in video_ids:
2010 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2011 return
2012
class PostProcessor(object):
    """Base class for post processors.

    A downloader accepts PostProcessor objects through its
    add_post_processor() method. After a successful download it walks
    its internal chain of PostProcessors, calling run() on each one in
    turn: the first receives an initial argument and every later one
    receives whatever the previous run() returned.

    The walk ends when the chain is exhausted or as soon as one run()
    returns None.

    Like InfoExtractor objects, PostProcessor objects take part in a
    "mutual registration" process with the downloader.
    """

    _downloader = None

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Attach this PP to the given downloader."""
        self._downloader = downloader

    def run(self, information):
        """Execute this post processing step.

        "information" is a dictionary like the ones InfoExtractors
        compose, with one additional field named "filepath" pointing to
        the downloaded file.

        Returning None stops the postprocessing chain. Otherwise the
        returned information dictionary is handed to the next object in
        the chain; it may be the received one with some fields changed.

        A PostProcessingError exception may also be raised; the
        downloader this method was called from takes it into account.
        """
        # The base implementation is a no-op that keeps the chain going
        return information
2058
### MAIN PROGRAM ###
if __name__ == '__main__':
    try:
        # Modules needed only when running the main program
        import getpass
        import optparse

        # Function to update the program file with the latest version from bitbucket.org
        def update_self(downloader, filename):
            """Replace the file at filename with the latest released program.

            The downloader is only used to print progress messages. The
            program exits with an error when filename is not writable.
            """
            if not os.access(filename, os.W_OK):
                sys.exit('ERROR: no write permissions on %s' % filename)

            downloader.to_stdout('Updating to latest stable version...')
            latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
            latest_version = urllib.urlopen(latest_url).read().strip()
            prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
            newcontent = urllib.urlopen(prog_url).read()
            # Binary mode stores the downloaded bytes untouched (text mode
            # would translate line endings on some platforms), and the
            # handle is closed even when the write fails.
            stream = open(filename, 'wb')
            try:
                stream.write(newcontent)
            finally:
                stream.close()
            downloader.to_stdout('Updated to version %s' % latest_version)

        # General configuration
        # Only one opener can be installed at a time, so proxy and cookie
        # support are combined into a single opener; installing them in
        # two separate calls would let the second replace the first.
        urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))
        socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

        # Parse command line
        parser = optparse.OptionParser(
            usage='Usage: %prog [options] url...',
            version='2010.08.04',
            conflict_handler='resolve',
        )

        parser.add_option('-h', '--help',
                action='help', help='print this help text and exit')
        parser.add_option('-v', '--version',
                action='version', help='print program version and exit')
        parser.add_option('-U', '--update',
                action='store_true', dest='update_self', help='update this program to latest stable version')
        parser.add_option('-i', '--ignore-errors',
                action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
        parser.add_option('-r', '--rate-limit',
                dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
        parser.add_option('-R', '--retries',
                dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
        parser.add_option('--playlist-start',
                dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)

        authentication = optparse.OptionGroup(parser, 'Authentication Options')
        authentication.add_option('-u', '--username',
                dest='username', metavar='USERNAME', help='account username')
        authentication.add_option('-p', '--password',
                dest='password', metavar='PASSWORD', help='account password')
        authentication.add_option('-n', '--netrc',
                action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
        parser.add_option_group(authentication)

        video_format = optparse.OptionGroup(parser, 'Video Format Options')
        video_format.add_option('-f', '--format',
                action='store', dest='format', metavar='FORMAT', help='video format code')
        video_format.add_option('-m', '--mobile-version',
                action='store_const', dest='format', help='alias for -f 17', const='17')
        video_format.add_option('--all-formats',
                action='store_const', dest='format', help='download all available video formats', const='-1')
        video_format.add_option('--max-quality',
                action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
        video_format.add_option('-b', '--best-quality',
                action='store_true', dest='bestquality', help='download the best video quality (DEPRECATED)')
        parser.add_option_group(video_format)

        verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
        verbosity.add_option('-q', '--quiet',
                action='store_true', dest='quiet', help='activates quiet mode', default=False)
        verbosity.add_option('-s', '--simulate',
                action='store_true', dest='simulate', help='do not download video', default=False)
        verbosity.add_option('-g', '--get-url',
                action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
        verbosity.add_option('-e', '--get-title',
                action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
        verbosity.add_option('--get-thumbnail',
                action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
        verbosity.add_option('--get-description',
                action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
        verbosity.add_option('--no-progress',
                action='store_true', dest='noprogress', help='do not print progress bar', default=False)
        parser.add_option_group(verbosity)

        filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
        filesystem.add_option('-t', '--title',
                action='store_true', dest='usetitle', help='use title in file name', default=False)
        filesystem.add_option('-l', '--literal',
                action='store_true', dest='useliteral', help='use literal title in file name', default=False)
        filesystem.add_option('-o', '--output',
                dest='outtmpl', metavar='TEMPLATE', help='output filename template')
        filesystem.add_option('-a', '--batch-file',
                dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
        filesystem.add_option('-w', '--no-overwrites',
                action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
        filesystem.add_option('-c', '--continue',
                action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
        parser.add_option_group(filesystem)

        (opts, args) = parser.parse_args()

        # Batch file verification
        batchurls = []
        if opts.batchfile is not None:
            try:
                if opts.batchfile == '-':
                    batchfd = sys.stdin
                else:
                    batchfd = open(opts.batchfile, 'r')
                try:
                    batchurls = batchfd.readlines()
                finally:
                    # Release the handle opened above; stdin is not ours to close
                    if batchfd is not sys.stdin:
                        batchfd.close()
                # Ignore surrounding whitespace and blank lines
                batchurls = [x.strip() for x in batchurls]
                batchurls = [x for x in batchurls if len(x) > 0]
            except IOError:
                sys.exit(u'ERROR: batch file could not be read')
        all_urls = batchurls + args

        # Conflicting, missing and erroneous options
        if opts.bestquality:
            print >>sys.stderr, u'\nWARNING: -b/--best-quality IS DEPRECATED AS IT IS THE DEFAULT BEHAVIOR NOW\n'
        if opts.usenetrc and (opts.username is not None or opts.password is not None):
            parser.error(u'using .netrc conflicts with giving username/password')
        if opts.password is not None and opts.username is None:
            parser.error(u'account username missing')
        if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
            parser.error(u'using output template conflicts with using title or literal title')
        if opts.usetitle and opts.useliteral:
            parser.error(u'using title conflicts with using literal title')
        if opts.username is not None and opts.password is None:
            opts.password = getpass.getpass(u'Type account password and press return:')
        if opts.ratelimit is not None:
            numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
            if numeric_limit is None:
                parser.error(u'invalid rate limit specified')
            opts.ratelimit = numeric_limit
        if opts.retries is not None:
            try:
                opts.retries = long(opts.retries)
            except (TypeError, ValueError):
                parser.error(u'invalid retry count specified')
        if opts.playliststart is not None:
            try:
                opts.playliststart = long(opts.playliststart)
            except (TypeError, ValueError):
                parser.error(u'invalid playlist page specified')

        # Information extractors
        youtube_ie = YoutubeIE()
        metacafe_ie = MetacafeIE(youtube_ie)
        dailymotion_ie = DailymotionIE()
        youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
        youtube_user_ie = YoutubeUserIE(youtube_ie)
        youtube_search_ie = YoutubeSearchIE(youtube_ie)
        google_ie = GoogleIE()
        google_search_ie = GoogleSearchIE(google_ie)
        photobucket_ie = PhotobucketIE()
        yahoo_ie = YahooIE()
        yahoo_search_ie = YahooSearchIE(yahoo_ie)
        generic_ie = GenericIE()

        # File downloader
        fd = FileDownloader({
            'usenetrc': opts.usenetrc,
            'username': opts.username,
            'password': opts.password,
            # Any of the "get" options implies both quiet and simulate
            'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
            'forceurl': opts.geturl,
            'forcetitle': opts.gettitle,
            'forcethumbnail': opts.getthumbnail,
            'forcedescription': opts.getdescription,
            'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
            'format': opts.format,
            'format_limit': opts.format_limit,
            # The first alternative that yields a non-empty template wins
            'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
                or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
                or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
                or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
                or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
                or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
                or u'%(id)s.%(ext)s'),
            'ignoreerrors': opts.ignoreerrors,
            'ratelimit': opts.ratelimit,
            'nooverwrites': opts.nooverwrites,
            'retries': opts.retries,
            'continuedl': opts.continue_dl,
            'noprogress': opts.noprogress,
            'playliststart': opts.playliststart,
            })
        fd.add_info_extractor(youtube_search_ie)
        fd.add_info_extractor(youtube_pl_ie)
        fd.add_info_extractor(youtube_user_ie)
        fd.add_info_extractor(metacafe_ie)
        fd.add_info_extractor(dailymotion_ie)
        fd.add_info_extractor(youtube_ie)
        fd.add_info_extractor(google_ie)
        fd.add_info_extractor(google_search_ie)
        fd.add_info_extractor(photobucket_ie)
        fd.add_info_extractor(yahoo_ie)
        fd.add_info_extractor(yahoo_search_ie)

        # This must come last since it's the
        # fallback if none of the others work
        fd.add_info_extractor(generic_ie)

        # Update version
        if opts.update_self:
            update_self(fd, sys.argv[0])

        # Maybe do nothing
        if len(all_urls) < 1:
            if not opts.update_self:
                parser.error(u'you must provide at least one URL')
            else:
                sys.exit()
        retcode = fd.download(all_urls)
        sys.exit(retcode)

    except DownloadError:
        sys.exit(1)
    except SameFileError:
        sys.exit(u'ERROR: fixed output name but more than one file to download')
    except KeyboardInterrupt:
        sys.exit(u'\nERROR: Interrupted by user')