#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Ricardo Garcia Gonzalez
# Author: Danny Colligan
# Author: Benjamin Johnson
# License: Public domain code
import htmlentitydefs
import httplib
import locale
import math
import netrc
import os
import os.path
import re
import socket
import string
import subprocess
import sys
import time
import urllib
import urllib2

# parse_qs was moved from the cgi module to the urlparse module recently.
try:
    from urlparse import parse_qs
except ImportError:
    from cgi import parse_qs

std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.8) Gecko/20100723 Firefox/3.6.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
}

simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')

def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    def yield_preferredencoding():
        try:
            pref = locale.getpreferredencoding()
            u'TEST'.encode(pref)
        except:
            pref = 'UTF-8'
        while True:
            yield pref
    return yield_preferredencoding().next()
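
# Illustrative example (the result depends on the host locale):
#   preferredencoding() -> 'UTF-8' on a typical en_US.UTF-8 system.
# The generator wrapper is functionally equivalent to computing `pref`
# in a try/except and returning it directly.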

def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Numeric character reference, decimal (&#NNN;) or hexadecimal (&#xNNN;)
    mobj = re.match(ur'(?u)#(x[0-9a-fA-F]+|\d+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        return unichr(long(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
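
# Illustrative example of the transform in use:
#   re.sub(ur'(?u)&(.+?);', htmlentity_transform, u'Tom &amp; Jerry &#33;')
#   -> u'Tom & Jerry !'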

def sanitize_title(utitle):
    """Sanitizes a video title so it could be used as part of a filename."""
    utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
    return utitle.replace(unicode(os.sep), u'%')
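
# Illustrative example (on a Unix system, where os.sep is '/'):
#   sanitize_title(u'AC/DC &amp; friends') -> u'AC%DC & friends'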

def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            return (sys.stdout, filename)
        stream = open(filename, open_mode)
        return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(filename, open_mode)
        return (stream, filename)
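
# Illustrative example: if u'clip: part 2?.flv' cannot be opened (e.g. on
# Windows, where ':' and '?' are forbidden in filenames), the second
# attempt opens u'clip# part 2#.flv' instead.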


class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    pass

class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass

class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    pass

class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass

class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected

class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible for downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. Given a video URL, the downloader doesn't know how to
    extract all the needed information (that is the task of the
    InfoExtractors), so it has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader hands it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:      Username for authentication purposes.
    password:      Password for authentication purposes.
    usenetrc:      Use netrc for authentication instead.
    quiet:         Do not print messages to stdout.
    forceurl:      Force printing final URL.
    forcetitle:    Force printing title.
    simulate:      Do not download the video files.
    format:        Video format code.
    format_limit:  Highest quality format to try.
    outtmpl:       Template for output names.
    ignoreerrors:  Do not stop on download errors.
    ratelimit:     Download speed limit, in bytes/sec.
    nooverwrites:  Prevent overwriting files.
    retries:       Number of times to retry for HTTP error 5xx.
    continuedl:    Try to continue downloads if possible.
    noprogress:    Do not print the progress bar.
    """

    params = None
    _ies = []
    _pps = []
    _download_retcode = None
    _num_downloads = None

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        self._ies = []
        self._pps = []
        self._download_retcode = 0
        self._num_downloads = 0
        self.params = params

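    # A minimal usage sketch (illustrative only; the option and class names
    # follow the definitions in this file, the URL is a dummy):
    #   fd = FileDownloader({'outtmpl': u'%(stitle)s-%(id)s.%(ext)s'})
    #   fd.add_info_extractor(YoutubeIE())
    #   retcode = fd.download([u'http://www.youtube.com/watch?v=abcdefghijk'])
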
    @staticmethod
    def pmkdir(filename):
        """Create directory components in filename. Similar to Unix "mkdir -p"."""
        components = filename.split(os.sep)
        aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
        aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
        for dir in aggregate:
            if not os.path.exists(dir):
                os.mkdir(dir)

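    # Illustrative example: pmkdir('a/b/c.flv') creates the directories
    # 'a/' and 'a/b/' (every component except the final filename).
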
    @staticmethod
    def format_bytes(bytes):
        if bytes is None:
            return 'N/A'
        if type(bytes) is str:
            bytes = float(bytes)
        if bytes == 0.0:
            exponent = 0
        else:
            exponent = long(math.log(bytes, 1024.0))
        suffix = 'bkMGTPEZY'[exponent]
        converted = float(bytes) / float(1024**exponent)
        return '%.2f%s' % (converted, suffix)

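    # Illustrative examples:
    #   format_bytes(1024)    -> '1.00k'
    #   format_bytes(1048576) -> '1.00M'
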
    @staticmethod
    def calc_percent(byte_counter, data_len):
        if data_len is None:
            return '---.-%'
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

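    # Illustrative example: calc_percent(512, 1024) -> ' 50.0%'
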
    @staticmethod
    def calc_eta(start, now, total, current):
        if total is None:
            return '--:--'
        dif = now - start
        if current == 0 or dif < 0.001: # One millisecond
            return '--:--'
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        if eta_mins > 99:
            return '--:--'
        return '%02d:%02d' % (eta_mins, eta_secs)

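    # Illustrative example: 500 of 1000 bytes in 10 seconds means 50 bytes/s,
    # so the remaining 500 bytes take another 10 seconds:
    #   calc_eta(0, 10, 1000, 500) -> '00:10'
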
    @staticmethod
    def calc_speed(start, now, bytes):
        dif = now - start
        if bytes == 0 or dif < 0.001: # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    @staticmethod
    def best_block_size(elapsed_time, bytes):
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
            return long(new_max)
        rate = bytes / elapsed_time
        if rate > new_max:
            return long(new_max)
        if rate < new_min:
            return long(new_min)
        return long(rate)

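    # best_block_size() adapts the read size to the observed throughput: the
    # next block is roughly one second's worth of data, clamped to between
    # half and twice the previous block, and to at most 4 MiB. Illustrative
    # example: best_block_size(0.5, 1024) -> 2048L.
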
    @staticmethod
    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer."""
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        if matchobj is None:
            return None
        number = float(matchobj.group(1))
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return long(round(number * multiplier))

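    # Illustrative examples:
    #   parse_bytes('50k')  -> 51200L
    #   parse_bytes('1.5M') -> 1572864L  (1.5 * 1024**2)
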
    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        ie.set_downloader(self)

    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        self._pps.append(pp)
        pp.set_downloader(self)

    def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
        """Print message to stdout if not in quiet mode."""
        try:
            if not self.params.get('quiet', False):
                print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
                sys.stdout.flush()
        except (UnicodeEncodeError), err:
            if not ignore_encoding_errors:
                raise

    def to_stderr(self, message):
        """Print message to stderr."""
        print >>sys.stderr, message.encode(preferredencoding())

    def fixed_template(self):
        """Checks if the output template is fixed (contains no substitutions)."""
        return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on whether the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        self._download_retcode = 1

    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
            return
        now = time.time()
        elapsed = now - start_time
        if elapsed <= 0.0:
            return
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

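    # Rate-limiting sketch: if byte_counter bytes arrived in `elapsed` seconds
    # but the limit allows only rate_limit * elapsed bytes in that time, sleep
    # for the time the surplus bytes should have taken, i.e.
    #   (byte_counter - rate_limit * elapsed) / rate_limit  seconds.
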
    def report_destination(self, filename):
        """Report destination filename."""
        self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        if self.params.get('noprogress', False):
            return
        self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

    def report_resuming_byte(self, resume_len):
        """Report attempt to resume at given byte."""
        self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)

    def report_retry(self, count, retries):
        """Report retry in case of HTTP error 5xx."""
        self.to_stdout(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        try:
            self.to_stdout(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            self.to_stdout(u'[download] The file has already been downloaded')

    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_stdout(u'[download] Unable to resume')

    def report_finish(self):
        """Report download finished."""
        if self.params.get('noprogress', False):
            self.to_stdout(u'[download] Download completed')
        else:
            self.to_stdout(u'')

    def increment_downloads(self):
        """Increment the ordinal that assigns a number to each file."""
        self._num_downloads += 1

    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""
        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            # Forced printings
            if self.params.get('forcetitle', False):
                print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forceurl', False):
                print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcedescription', False) and 'description' in info_dict:
                print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')

            return

        try:
            template_dict = dict(info_dict)
            template_dict['epoch'] = unicode(long(time.time()))
            template_dict['ord'] = unicode('%05d' % self._num_downloads)
            filename = self.params['outtmpl'] % template_dict
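            # Illustrative template expansion: with
            # outtmpl = u'%(stitle)s-%(id)s.%(ext)s' the line above yields
            # something like u'my_video-abcdefghijk.mp4'.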
        except (ValueError, KeyError), err:
            self.trouble(u'ERROR: invalid system charset or erroneous output template')
            return
        if self.params.get('nooverwrites', False) and os.path.exists(filename):
            self.to_stderr(u'WARNING: file exists and will be skipped')
            return

        try:
            self.pmkdir(filename)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to create directories: %s' % str(err))
            return

        try:
            success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
        except (OSError, IOError), err:
            raise UnavailableVideoError
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.trouble(u'ERROR: unable to download video data: %s' % str(err))
            return
        except (ContentTooShortError, ), err:
            self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
            return

        if success:
            try:
                self.post_process(filename, info_dict)
            except (PostProcessingError), err:
                self.trouble(u'ERROR: postprocessing: %s' % str(err))
                return

    def download(self, url_list):
        """Download a given list of URLs."""
        if len(url_list) > 1 and self.fixed_template():
            raise SameFileError(self.params['outtmpl'])

        for url in url_list:
            suitable_found = False
            for ie in self._ies:
                # Go to next InfoExtractor if not suitable
                if not ie.suitable(url):
                    continue

                # Suitable InfoExtractor found
                suitable_found = True

                # Extract information from URL and process it
                ie.extract(url)

                # Suitable InfoExtractor had been found; go to next URL
                break

            if not suitable_found:
                self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

        return self._download_retcode

    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file."""
        info = dict(ie_info)
        info['filepath'] = filename
        for pp in self._pps:
            info = pp.run(info)
            if info is None:
                break

    def _download_with_rtmpdump(self, filename, url, player_url):
        self.report_destination(filename)

        # Check for rtmpdump first
        try:
            subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
            return False

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrupted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', filename]
        retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(filename)
            self.to_stdout(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(filename)
            if prevsize == cursize and retval == 1:
                break
        if retval == 0:
            self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
            return True
        else:
            self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
            return False

    def _do_download(self, filename, url, player_url):
        # Attempt to download using rtmpdump
        if url.startswith('rtmp'):
            return self._download_with_rtmpdump(filename, url, player_url)

        stream = None
        open_mode = 'wb'
        basic_request = urllib2.Request(url, None, std_headers)
        request = urllib2.Request(url, None, std_headers)

        # Establish possible resume length
        if os.path.isfile(filename):
            resume_len = os.path.getsize(filename)
        else:
            resume_len = 0

        # Request parameters in case of being able to resume
        if self.params.get('continuedl', False) and resume_len != 0:
            self.report_resuming_byte(resume_len)
            request.add_header('Range', 'bytes=%d-' % resume_len)
            open_mode = 'ab'
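            # With a Range header of the form 'bytes=N-' the server should
            # respond with the remainder of the file, which is then appended
            # ('ab') to the partially downloaded data already on disk.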

        count = 0
        retries = self.params.get('retries', 0)
        while count <= retries:
            # Establish connection
            try:
                data = urllib2.urlopen(request)
                break
            except (urllib2.HTTPError, ), err:
                if (err.code < 500 or err.code >= 600) and err.code != 416:
                    # Unexpected HTTP error
                    raise
                elif err.code == 416:
                    # Unable to resume (requested range not satisfiable)
                    try:
                        # Open the connection again without the range header
                        data = urllib2.urlopen(basic_request)
                        content_length = data.info()['Content-Length']
                    except (urllib2.HTTPError, ), err:
                        if err.code < 500 or err.code >= 600:
                            raise
                    else:
                        # Examine the reported length
                        if (content_length is not None and
                                (resume_len - 100 < long(content_length) < resume_len + 100)):
                            # The file had already been fully downloaded.
                            # Explanation of the above condition: in issue #175 it was revealed that
                            # YouTube sometimes adds or removes a few bytes from the end of the file,
                            # changing the file size slightly and causing problems for some users. So
                            # I decided to implement a suggested change and consider the file
                            # completely downloaded if the file size differs less than 100 bytes from
                            # the one in the hard drive.
                            self.report_file_already_downloaded(filename)
                            return True
                        else:
                            # The length does not match, we start the download over
                            self.report_unable_to_resume()
                            open_mode = 'wb'
                            break
            # Retry
            count += 1
            if count <= retries:
                self.report_retry(count, retries)

        if count > retries:
            self.trouble(u'ERROR: giving up after %s retries' % retries)
            return False

        data_len = data.info().get('Content-length', None)
        data_len_str = self.format_bytes(data_len)
        byte_counter = 0
        block_size = 1024
        start = time.time()
        while True:
            # Download and write
            before = time.time()
            data_block = data.read(block_size)
            after = time.time()
            data_block_len = len(data_block)
            if data_block_len == 0:
                break
            byte_counter += data_block_len

            # Open file just in time
            if stream is None:
                try:
                    (stream, filename) = sanitize_open(filename, open_mode)
                    self.report_destination(filename)
                except (OSError, IOError), err:
                    self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
                    return False
            try:
                stream.write(data_block)
            except (IOError, OSError), err:
                self.trouble(u'\nERROR: unable to write data: %s' % str(err))
                return False
            block_size = self.best_block_size(after - before, data_block_len)

            # Progress message
            percent_str = self.calc_percent(byte_counter, data_len)
            eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
            speed_str = self.calc_speed(start, time.time(), byte_counter)
            self.report_progress(percent_str, data_len_str, speed_str, eta_str)

            # Apply rate limit
            self.slow_down(start, byte_counter)

        self.report_finish()
        if data_len is not None and str(byte_counter) != data_len:
            raise ContentTooShortError(byte_counter, long(data_len))
        return True

class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information from the video (or videos) the URL refers to. This
    information includes the real video URL, the video title and simplified
    title, author and others. The information is stored in a dictionary
    which is then passed to the FileDownloader. The FileDownloader
    processes this information, possibly downloading the video to the file
    system, among other possible outcomes. The dictionaries must include
    the following fields:

    id:         Video identifier.
    url:        Final video URL.
    uploader:   Nickname of the video uploader.
    title:      Literal title.
    stitle:     Simplified title.
    ext:        Video filename extension.
    format:     Video format.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    downloader.
    """

    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @staticmethod
    def suitable(url):
        """Receives a URL and returns True if suitable for this IE."""
        return False

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r'^((?:http://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
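    # In this pattern, group 1 matches the scheme/host part of a recognized
    # YouTube URL form (watch pages, /v/ embeds, youtu.be short links) and
    # group 2 captures the video id; the conditional '(?(1).+)?' only allows
    # trailing characters when group 1 actually matched, so a bare video id
    # is accepted as well.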
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '45': 'webm',
    }

    @staticmethod
    def suitable(url):
        return (re.match(YoutubeIE._VALID_URL, url) is not None)

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_stdout(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_stdout(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_stdout(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested video format is not available."""
        self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_stdout(u'[youtube] RTMP download detected')

    def _real_initialize(self):
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError), err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        # Set language
        request = urllib2.Request(self._LANG_URL, None, std_headers)
        try:
            self.report_lang()
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
            'current_form': 'loginForm',
            'next': '/',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
        try:
            self.report_age_confirmation()
            age_results = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id, None, std_headers)
        try:
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*"(http://.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = mobj.group(1)
        else:
            player_url = None

        # Get video info
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url, None, std_headers)
            try:
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')
        video_title = sanitize_title(video_title)

        # simplified title
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
        simple_title = simple_title.strip(ur'_')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else: # don't panic if we can't find it
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # description
        video_description = 'No description available.'
        if self._downloader.params.get('forcedescription', False):
            mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
            if mobj is not None:
                video_description = mobj.group(1)

        # token
        video_token = urllib.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        requested_format = self._downloader.params.get('format', None)
        get_video_template = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=&ps=&asv=&fmt=%%s' % (video_id, video_token)

        if 'fmt_url_map' in video_info:
            url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
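            # fmt_url_map is a comma-separated list of 'format|url' pairs,
            # e.g. '22|http://...,35|http://...' (URLs illustrative), which
            # the line above turns into a {format_code: url} dictionary.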
            format_limit = self._downloader.params.get('format_limit', None)
            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if requested_format is None:
                video_url_list = [(existing_formats[0], get_video_template % existing_formats[0])] # Best quality
            elif requested_format == '-1':
                video_url_list = [(f, get_video_template % f) for f in existing_formats] # All formats
            else:
                video_url_list = [(requested_format, get_video_template % requested_format)] # Specific format

        elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]

        else:
            self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
            return

        for format_param, video_real_url in video_url_list:
            # At this point we have a new video
            self._downloader.increment_downloads()

            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            # Find the video URL in the fmt_url_map or conn parameters
            try:
                # Process video information
                self._downloader.process_info({
                    'id': video_id.decode('utf-8'),
                    'url': video_real_url.decode('utf-8'),
                    'uploader': video_uploader.decode('utf-8'),
                    'title': video_title,
                    'stitle': simple_title,
                    'ext': video_extension.decode('utf-8'),
                    'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                    'thumbnail': video_thumbnail.decode('utf-8'),
                    'description': video_description.decode('utf-8'),
                    'player_url': player_url,
                })
            except UnavailableVideoError, err:
                self._downloader.trouble(u'ERROR: unable to download video (format may not be available)')


class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    _youtube_ie = None

    def __init__(self, youtube_ie, downloader=None):
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    @staticmethod
    def suitable(url):
        return (re.match(MetacafeIE._VALID_URL, url) is not None)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_stdout(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Retrieve disclaimer
        request = urllib2.Request(self._DISCLAIMER, None, std_headers)
        try:
            self.report_disclaimer()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
        try:
            self.report_age_confirmation()
            disclaimer = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
            return

        # At this point we have a new video
        self._downloader.increment_downloads()

        simple_title = mobj.group(2).decode('utf-8')

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = urllib.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            vardict = parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': u'NA',
                'player_url': None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'ERROR: unable to download video')


class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @staticmethod
    def suitable(url):
        return (re.match(DailymotionIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_stdout(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_stdout(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_initialize(self):
        return

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        simple_title = mobj.group(2).decode('utf-8')
        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))

        # TODO: if needed, prepend http://www.dailymotion.com/ to relative URLs

        video_url = mediaURL

        # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
        mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)

        mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': u'NA',
                'player_url': None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'ERROR: unable to download video')

class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @staticmethod
    def suitable(url):
        return (re.match(GoogleIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)

    def _real_initialize(self):
        return

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        if mobj is None:
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            try:
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
                return
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
                return
            video_thumbnail = mobj.group(1)
        else: # we need something to pass to process_info
            video_thumbnail = ''

        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': u'NA',
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': u'NA',
                'player_url': None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'ERROR: unable to download video')


class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @staticmethod
    def suitable(url):
        return (re.match(PhotobucketIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)

    def _real_initialize(self):
        return

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')
        video_title = sanitize_title(video_title)
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        video_uploader = mobj.group(2).decode('utf-8')

        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url.decode('utf-8'),
                'uploader': video_uploader,
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'format': u'NA',
                'player_url': None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'ERROR: unable to download video')


class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @staticmethod
    def suitable(url):
        return (re.match(YahooIE._VALID_URL, url) is not None)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_initialize(self):
        return

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # At this point we have a new video
        self._downloader.increment_downloads()
        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = urllib2.Request(url)
            try:
                webpage = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')
        simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        video_uploader = mobj.group(2).decode('utf-8') # group(1) is only 'people' or 'profile'

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)

        try:
            # Process video information
            self._downloader.process_info({
                'id': video_id.decode('utf-8'),
                'url': video_url,
                'uploader': video_uploader,
                'title': video_title,
                'stitle': simple_title,
                'ext': video_extension.decode('utf-8'),
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': None,
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'ERROR: unable to download video')


1530 class GenericIE(InfoExtractor):
1531 """Generic last-resort information extractor."""
1532
1533 def __init__(self, downloader=None):
1534 InfoExtractor.__init__(self, downloader)
1535
1536 @staticmethod
1537 def suitable(url):
1538 return True
1539
1540 def report_download_webpage(self, video_id):
1541 """Report webpage download."""
1542 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1543 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1544
1545 def report_extraction(self, video_id):
1546 """Report information extraction."""
1547 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1548
1549 def _real_initialize(self):
1550 return
1551
1552 def _real_extract(self, url):
1553 # At this point we have a new video
1554 self._downloader.increment_downloads()
1555
1556 video_id = url.split('/')[-1]
1557 request = urllib2.Request(url)
1558 try:
1559 self.report_download_webpage(video_id)
1560 webpage = urllib2.urlopen(request).read()
1561 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1562 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1563 return
1564 except ValueError, err:
			# urlopen() raises ValueError for URLs it cannot parse; since this
			# is the last-resort InfoExtractor, report the invalid URL here
1567 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1568 return
1569
1570 # Start with something easy: JW Player in SWFObject
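		# matches pages embedding e.g. (hypothetical snippet):
		#   flashvars: 'autostart=false&file=http%3A//example.com/video.flv'
		# the captured URL is percent-decoded below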
1571 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1572 if mobj is None:
1573 # Broaden the search a little bit
1574 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1575 if mobj is None:
1576 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1577 return
1578
1579 # It's possible that one of the regexes
1580 # matched, but returned an empty group:
1581 if mobj.group(1) is None:
1582 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1583 return
1584
1585 video_url = urllib.unquote(mobj.group(1))
1586 video_id = os.path.basename(video_url)
1587
		# here's a fun little line of code for you: split the basename into
		# its extension and a bare id
1589 video_extension = os.path.splitext(video_id)[1][1:]
1590 video_id = os.path.splitext(video_id)[0]
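		# e.g. a basename of 'clip.flv' (hypothetical) gives video_extension
		# 'flv' and video_id 'clip'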
1591
1592 # it's tempting to parse this further, but you would
1593 # have to take into account all the variations like
1594 # Video Title - Site Name
1595 # Site Name | Video Title
1596 # Video Title - Tagline | Site Name
1597 # and so on and so forth; it's just not practical
1598 mobj = re.search(r'<title>(.*)</title>', webpage)
1599 if mobj is None:
1600 self._downloader.trouble(u'ERROR: unable to extract title')
1601 return
1602 video_title = mobj.group(1).decode('utf-8')
1603 video_title = sanitize_title(video_title)
1604 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1605
1606 # video uploader is domain name
1607 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1608 if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video uploader')
1610 return
1611 video_uploader = mobj.group(1).decode('utf-8')
1612
1613 try:
1614 # Process video information
1615 self._downloader.process_info({
1616 'id': video_id.decode('utf-8'),
1617 'url': video_url.decode('utf-8'),
1618 'uploader': video_uploader,
1619 'title': video_title,
1620 'stitle': simple_title,
1621 'ext': video_extension.decode('utf-8'),
1622 'format': u'NA',
1623 'player_url': None,
1624 })
		except UnavailableVideoError:
1626 self._downloader.trouble(u'ERROR: unable to download video')
1627
1628
1629 class YoutubeSearchIE(InfoExtractor):
1630 """Information Extractor for YouTube search queries."""
1631 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
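	# Query forms accepted by _VALID_QUERY above (searches are illustrative):
	#   ytsearch:cute cats      -> download the first result
	#   ytsearch5:cute cats     -> download the first five results
	#   ytsearchall:cute cats   -> download up to _max_youtube_results results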
1632 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1633 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1634 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1635 _youtube_ie = None
1636 _max_youtube_results = 1000
1637
1638 def __init__(self, youtube_ie, downloader=None):
1639 InfoExtractor.__init__(self, downloader)
1640 self._youtube_ie = youtube_ie
1641
1642 @staticmethod
1643 def suitable(url):
1644 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1645
1646 def report_download_page(self, query, pagenum):
1647 """Report attempt to download playlist page with given number."""
1648 query = query.decode(preferredencoding())
1649 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1650
1651 def _real_initialize(self):
1652 self._youtube_ie.initialize()
1653
1654 def _real_extract(self, query):
1655 mobj = re.match(self._VALID_QUERY, query)
1656 if mobj is None:
1657 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1658 return
1659
		prefix, query = query.split(':', 1) # split on the first colon only; the query itself may contain colons
		prefix = prefix[8:] # strip the literal 'ytsearch', leaving '', 'all' or a number
1662 query = query.encode('utf-8')
1663 if prefix == '':
1664 self._download_n_results(query, 1)
1665 return
1666 elif prefix == 'all':
1667 self._download_n_results(query, self._max_youtube_results)
1668 return
1669 else:
1670 try:
1671 n = long(prefix)
1672 if n <= 0:
1673 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1674 return
1675 elif n > self._max_youtube_results:
1676 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1677 n = self._max_youtube_results
1678 self._download_n_results(query, n)
1679 return
1680 except ValueError: # parsing prefix as integer fails
1681 self._download_n_results(query, 1)
1682 return
1683
1684 def _download_n_results(self, query, n):
1685 """Downloads a specified number of results for a query"""
1686
1687 video_ids = []
1688 already_seen = set()
1689 pagenum = 1
1690
1691 while True:
1692 self.report_download_page(query, pagenum)
1693 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1694 request = urllib2.Request(result_url, None, std_headers)
1695 try:
1696 page = urllib2.urlopen(request).read()
1697 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1698 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1699 return
1700
1701 # Extract video identifiers
1702 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				# the match looks like href="/watch?v=ID"; take the piece
				# after the second '=' and drop the trailing quote
				video_id = mobj.group().split('=')[2][:-1]
1704 if video_id not in already_seen:
1705 video_ids.append(video_id)
1706 already_seen.add(video_id)
1707 if len(video_ids) == n:
1708 # Specified n videos reached
1709 for id in video_ids:
1710 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1711 return
1712
1713 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1714 for id in video_ids:
1715 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1716 return
1717
1718 pagenum = pagenum + 1
1719
1720 class GoogleSearchIE(InfoExtractor):
1721 """Information Extractor for Google Video search queries."""
1722 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
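	# Same query forms as YoutubeSearchIE, with the 'gvsearch' prefix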
1723 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1724 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1725 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1726 _google_ie = None
1727 _max_google_results = 1000
1728
1729 def __init__(self, google_ie, downloader=None):
1730 InfoExtractor.__init__(self, downloader)
1731 self._google_ie = google_ie
1732
1733 @staticmethod
1734 def suitable(url):
1735 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1736
1737 def report_download_page(self, query, pagenum):
1738 """Report attempt to download playlist page with given number."""
1739 query = query.decode(preferredencoding())
1740 self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1741
1742 def _real_initialize(self):
1743 self._google_ie.initialize()
1744
1745 def _real_extract(self, query):
1746 mobj = re.match(self._VALID_QUERY, query)
1747 if mobj is None:
1748 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1749 return
1750
		prefix, query = query.split(':', 1) # split on the first colon only; the query itself may contain colons
		prefix = prefix[8:] # strip the literal 'gvsearch'
1753 query = query.encode('utf-8')
1754 if prefix == '':
1755 self._download_n_results(query, 1)
1756 return
1757 elif prefix == 'all':
1758 self._download_n_results(query, self._max_google_results)
1759 return
1760 else:
1761 try:
1762 n = long(prefix)
1763 if n <= 0:
1764 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1765 return
1766 elif n > self._max_google_results:
1767 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1768 n = self._max_google_results
1769 self._download_n_results(query, n)
1770 return
1771 except ValueError: # parsing prefix as integer fails
1772 self._download_n_results(query, 1)
1773 return
1774
1775 def _download_n_results(self, query, n):
1776 """Downloads a specified number of results for a query"""
1777
1778 video_ids = []
1779 already_seen = set()
1780 pagenum = 1
1781
1782 while True:
1783 self.report_download_page(query, pagenum)
1784 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1785 request = urllib2.Request(result_url, None, std_headers)
1786 try:
1787 page = urllib2.urlopen(request).read()
1788 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1789 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1790 return
1791
1792 # Extract video identifiers
1793 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1794 video_id = mobj.group(1)
1795 if video_id not in already_seen:
1796 video_ids.append(video_id)
1797 already_seen.add(video_id)
1798 if len(video_ids) == n:
1799 # Specified n videos reached
1800 for id in video_ids:
1801 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1802 return
1803
1804 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1805 for id in video_ids:
1806 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1807 return
1808
1809 pagenum = pagenum + 1
1810
1811 class YahooSearchIE(InfoExtractor):
1812 """Information Extractor for Yahoo! Video search queries."""
1813 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
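	# Same query forms as YoutubeSearchIE, with the 'yvsearch' prefix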
1814 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1815 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1816 _MORE_PAGES_INDICATOR = r'\s*Next'
1817 _yahoo_ie = None
1818 _max_yahoo_results = 1000
1819
1820 def __init__(self, yahoo_ie, downloader=None):
1821 InfoExtractor.__init__(self, downloader)
1822 self._yahoo_ie = yahoo_ie
1823
1824 @staticmethod
1825 def suitable(url):
1826 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1827
1828 def report_download_page(self, query, pagenum):
1829 """Report attempt to download playlist page with given number."""
1830 query = query.decode(preferredencoding())
1831 self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1832
1833 def _real_initialize(self):
1834 self._yahoo_ie.initialize()
1835
1836 def _real_extract(self, query):
1837 mobj = re.match(self._VALID_QUERY, query)
1838 if mobj is None:
1839 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1840 return
1841
		prefix, query = query.split(':', 1) # split on the first colon only; the query itself may contain colons
		prefix = prefix[8:] # strip the literal 'yvsearch'
1844 query = query.encode('utf-8')
1845 if prefix == '':
1846 self._download_n_results(query, 1)
1847 return
1848 elif prefix == 'all':
1849 self._download_n_results(query, self._max_yahoo_results)
1850 return
1851 else:
1852 try:
1853 n = long(prefix)
1854 if n <= 0:
1855 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1856 return
1857 elif n > self._max_yahoo_results:
1858 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1859 n = self._max_yahoo_results
1860 self._download_n_results(query, n)
1861 return
1862 except ValueError: # parsing prefix as integer fails
1863 self._download_n_results(query, 1)
1864 return
1865
1866 def _download_n_results(self, query, n):
1867 """Downloads a specified number of results for a query"""
1868
1869 video_ids = []
1870 already_seen = set()
1871 pagenum = 1
1872
1873 while True:
1874 self.report_download_page(query, pagenum)
1875 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1876 request = urllib2.Request(result_url, None, std_headers)
1877 try:
1878 page = urllib2.urlopen(request).read()
1879 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1880 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1881 return
1882
1883 # Extract video identifiers
1884 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1885 video_id = mobj.group(1)
1886 if video_id not in already_seen:
1887 video_ids.append(video_id)
1888 already_seen.add(video_id)
1889 if len(video_ids) == n:
1890 # Specified n videos reached
1891 for id in video_ids:
1892 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1893 return
1894
1895 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1896 for id in video_ids:
1897 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1898 return
1899
1900 pagenum = pagenum + 1
1901
1902 class YoutubePlaylistIE(InfoExtractor):
1903 """Information Extractor for YouTube playlists."""
1904
	_VALID_URL = r'(?:http://)?(?:\w+\.)?youtube\.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
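	# Accepts e.g. 'http://www.youtube.com/view_play_list?p=<playlist id>'
	# (placeholder id; the group captures everything up to the next '&')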
1906 _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1907 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1908 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1909 _youtube_ie = None
1910
1911 def __init__(self, youtube_ie, downloader=None):
1912 InfoExtractor.__init__(self, downloader)
1913 self._youtube_ie = youtube_ie
1914
1915 @staticmethod
1916 def suitable(url):
1917 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1918
1919 def report_download_page(self, playlist_id, pagenum):
1920 """Report attempt to download playlist page with given number."""
1921 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1922
1923 def _real_initialize(self):
1924 self._youtube_ie.initialize()
1925
1926 def _real_extract(self, url):
1927 # Extract playlist id
1928 mobj = re.match(self._VALID_URL, url)
1929 if mobj is None:
1930 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1931 return
1932
1933 # Download playlist pages
1934 playlist_id = mobj.group(1)
1935 video_ids = []
1936 pagenum = 1
1937
1938 while True:
1939 self.report_download_page(playlist_id, pagenum)
1940 request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1941 try:
1942 page = urllib2.urlopen(request).read()
1943 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1944 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1945 return
1946
1947 # Extract video identifiers
1948 ids_in_page = []
1949 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1950 if mobj.group(1) not in ids_in_page:
1951 ids_in_page.append(mobj.group(1))
1952 video_ids.extend(ids_in_page)
1953
1954 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1955 break
1956 pagenum = pagenum + 1
1957
1958 playliststart = self._downloader.params.get('playliststart', 1)
		playliststart -= 1 # our arrays are zero-based but the playlist is 1-based
1960 if playliststart > 0:
1961 video_ids = video_ids[playliststart:]
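		# e.g. --playlist-start 3 becomes index 2 here, so the first two
		# playlist entries are skipped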
1962
1963 for id in video_ids:
1964 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1965 return
1966
1967 class YoutubeUserIE(InfoExtractor):
1968 """Information Extractor for YouTube users."""
1969
	_VALID_URL = r'(?:http://)?(?:\w+\.)?youtube\.com/user/(.*)'
1971 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1972 _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1973 _youtube_ie = None
1974
1975 def __init__(self, youtube_ie, downloader=None):
1976 InfoExtractor.__init__(self, downloader)
1977 self._youtube_ie = youtube_ie
1978
1979 @staticmethod
1980 def suitable(url):
1981 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1982
1983 def report_download_page(self, username):
1984 """Report attempt to download user page."""
		self._downloader.to_stdout(u'[youtube] user %s: Downloading user page' % username)
1986
1987 def _real_initialize(self):
1988 self._youtube_ie.initialize()
1989
1990 def _real_extract(self, url):
1991 # Extract username
1992 mobj = re.match(self._VALID_URL, url)
1993 if mobj is None:
1994 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1995 return
1996
1997 # Download user page
1998 username = mobj.group(1)
1999 video_ids = []
2001
2002 self.report_download_page(username)
2003 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
2004 try:
2005 page = urllib2.urlopen(request).read()
2006 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2007 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2008 return
2009
2010 # Extract video identifiers
2011 ids_in_page = []
2012
2013 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2014 if mobj.group(1) not in ids_in_page:
2015 ids_in_page.append(mobj.group(1))
2016 video_ids.extend(ids_in_page)
2017
2018 playliststart = self._downloader.params.get('playliststart', 1)
		playliststart -= 1 # our arrays are zero-based but the playlist is 1-based
2020 if playliststart > 0:
2021 video_ids = video_ids[playliststart:]
2022
2023 for id in video_ids:
2024 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2025 return
2026
2027 class PostProcessor(object):
2028 """Post Processor class.
2029
2030 PostProcessor objects can be added to downloaders with their
2031 add_post_processor() method. When the downloader has finished a
2032 successful download, it will take its internal chain of PostProcessors
2033 and start calling the run() method on each one of them, first with
2034 an initial argument and then with the returned value of the previous
2035 PostProcessor.
2036
	The chain stops as soon as one of them returns None, or when the end
	of the chain is reached.
2039
2040 PostProcessor objects follow a "mutual registration" process similar
2041 to InfoExtractor objects.
2042 """
2043
2044 _downloader = None
2045
2046 def __init__(self, downloader=None):
2047 self._downloader = downloader
2048
2049 def set_downloader(self, downloader):
2050 """Sets the downloader for this PP."""
2051 self._downloader = downloader
2052
2053 def run(self, information):
2054 """Run the PostProcessor.
2055
2056 The "information" argument is a dictionary like the ones
2057 composed by InfoExtractors. The only difference is that this
2058 one has an extra field called "filepath" that points to the
2059 downloaded file.
2060
		When this method returns None, the postprocessing chain is
		stopped. Otherwise it should return an information dictionary,
		which is passed on to the next postprocessing object in the
		chain; this may be the same dictionary it received, possibly
		with some fields changed.
2066
2067 In addition, this method may raise a PostProcessingError
2068 exception that will be taken into account by the downloader
2069 it was called from.
2070 """
2071 return information # by default, do nothing
2072
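# A minimal sketch of a concrete PostProcessor (illustrative only; the class
# name is hypothetical and it is not registered anywhere in this program).
# It prints the path of each finished download and hands the information
# dictionary on unchanged, so the chain continues:
#
#	class PrintFilepathPP(PostProcessor):
#		def run(self, information):
#			self._downloader.to_stdout(u'Finished %s' % information['filepath'])
#			return information
#
# Registering it with a downloader's add_post_processor() method would make
# the downloader run it after every successful download.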
2073 ### MAIN PROGRAM ###
2074 if __name__ == '__main__':
2075 try:
2076 # Modules needed only when running the main program
2077 import getpass
2078 import optparse
2079
2080 # Function to update the program file with the latest version from bitbucket.org
2081 def update_self(downloader, filename):
2082 # Note: downloader only used for options
			if not os.access(filename, os.W_OK):
2084 sys.exit('ERROR: no write permissions on %s' % filename)
2085
2086 downloader.to_stdout('Updating to latest stable version...')
2087 latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
2088 latest_version = urllib.urlopen(latest_url).read().strip()
2089 prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
2090 newcontent = urllib.urlopen(prog_url).read()
2091 stream = open(filename, 'w')
2092 stream.write(newcontent)
2093 stream.close()
2094 downloader.to_stdout('Updated to version %s' % latest_version)
2095
2096 # General configuration
		# installing a second opener would replace the first, so both
		# handlers go into a single opener
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))
2099 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2100
2101 # Parse command line
2102 parser = optparse.OptionParser(
2103 usage='Usage: %prog [options] url...',
2104 version='2010.10.03',
2105 conflict_handler='resolve',
2106 )
2107
2108 parser.add_option('-h', '--help',
2109 action='help', help='print this help text and exit')
2110 parser.add_option('-v', '--version',
2111 action='version', help='print program version and exit')
2112 parser.add_option('-U', '--update',
2113 action='store_true', dest='update_self', help='update this program to latest stable version')
2114 parser.add_option('-i', '--ignore-errors',
2115 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2116 parser.add_option('-r', '--rate-limit',
2117 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2118 parser.add_option('-R', '--retries',
2119 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2120 parser.add_option('--playlist-start',
2121 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2122
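		# Typical invocations (illustrative; the video id and batch file
		# are placeholders):
		#   youtube-dl -t 'http://www.youtube.com/watch?v=<id>'
		#   youtube-dl -a batch.txt --playlist-start 5
		#   youtube-dl 'ytsearch3:some query'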
2123 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2124 authentication.add_option('-u', '--username',
2125 dest='username', metavar='USERNAME', help='account username')
2126 authentication.add_option('-p', '--password',
2127 dest='password', metavar='PASSWORD', help='account password')
2128 authentication.add_option('-n', '--netrc',
2129 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2130 parser.add_option_group(authentication)
2131
2132 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2133 video_format.add_option('-f', '--format',
2134 action='store', dest='format', metavar='FORMAT', help='video format code')
2135 video_format.add_option('-m', '--mobile-version',
2136 action='store_const', dest='format', help='alias for -f 17', const='17')
2137 video_format.add_option('--all-formats',
2138 action='store_const', dest='format', help='download all available video formats', const='-1')
2139 video_format.add_option('--max-quality',
2140 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2141 video_format.add_option('-b', '--best-quality',
2142 action='store_true', dest='bestquality', help='download the best video quality (DEPRECATED)')
2143 parser.add_option_group(video_format)
2144
2145 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2146 verbosity.add_option('-q', '--quiet',
2147 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2148 verbosity.add_option('-s', '--simulate',
2149 action='store_true', dest='simulate', help='do not download video', default=False)
2150 verbosity.add_option('-g', '--get-url',
2151 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2152 verbosity.add_option('-e', '--get-title',
2153 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2154 verbosity.add_option('--get-thumbnail',
2155 action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
2156 verbosity.add_option('--get-description',
2157 action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
2158 verbosity.add_option('--no-progress',
2159 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2160 parser.add_option_group(verbosity)
2161
2162 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2163 filesystem.add_option('-t', '--title',
2164 action='store_true', dest='usetitle', help='use title in file name', default=False)
2165 filesystem.add_option('-l', '--literal',
2166 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2167 filesystem.add_option('-o', '--output',
2168 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2169 filesystem.add_option('-a', '--batch-file',
2170 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2171 filesystem.add_option('-w', '--no-overwrites',
2172 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2173 filesystem.add_option('-c', '--continue',
2174 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2175 parser.add_option_group(filesystem)
2176
2177 (opts, args) = parser.parse_args()
2178
2179 # Batch file verification
2180 batchurls = []
2181 if opts.batchfile is not None:
2182 try:
2183 if opts.batchfile == '-':
2184 batchfd = sys.stdin
2185 else:
2186 batchfd = open(opts.batchfile, 'r')
2187 batchurls = batchfd.readlines()
2188 batchurls = [x.strip() for x in batchurls]
2189 batchurls = [x for x in batchurls if len(x) > 0]
2190 except IOError:
2191 sys.exit(u'ERROR: batch file could not be read')
2192 all_urls = batchurls + args
2193
2194 # Conflicting, missing and erroneous options
2195 if opts.bestquality:
2196 print >>sys.stderr, u'\nWARNING: -b/--best-quality IS DEPRECATED AS IT IS THE DEFAULT BEHAVIOR NOW\n'
2197 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2198 parser.error(u'using .netrc conflicts with giving username/password')
2199 if opts.password is not None and opts.username is None:
2200 parser.error(u'account username missing')
2201 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
2202 parser.error(u'using output template conflicts with using title or literal title')
2203 if opts.usetitle and opts.useliteral:
2204 parser.error(u'using title conflicts with using literal title')
2205 if opts.username is not None and opts.password is None:
2206 opts.password = getpass.getpass(u'Type account password and press return:')
2207 if opts.ratelimit is not None:
2208 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2209 if numeric_limit is None:
2210 parser.error(u'invalid rate limit specified')
2211 opts.ratelimit = numeric_limit
2212 if opts.retries is not None:
2213 try:
2214 opts.retries = long(opts.retries)
2215 except (TypeError, ValueError), err:
2216 parser.error(u'invalid retry count specified')
2217 if opts.playliststart is not None:
2218 try:
2219 opts.playliststart = long(opts.playliststart)
2220 except (TypeError, ValueError), err:
2221 parser.error(u'invalid playlist page specified')
2222
2223 # Information extractors
2224 youtube_ie = YoutubeIE()
2225 metacafe_ie = MetacafeIE(youtube_ie)
2226 dailymotion_ie = DailymotionIE()
2227 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
2228 youtube_user_ie = YoutubeUserIE(youtube_ie)
2229 youtube_search_ie = YoutubeSearchIE(youtube_ie)
2230 google_ie = GoogleIE()
2231 google_search_ie = GoogleSearchIE(google_ie)
2232 photobucket_ie = PhotobucketIE()
2233 yahoo_ie = YahooIE()
2234 yahoo_search_ie = YahooSearchIE(yahoo_ie)
2235 generic_ie = GenericIE()
2236
2237 # File downloader
2238 fd = FileDownloader({
2239 'usenetrc': opts.usenetrc,
2240 'username': opts.username,
2241 'password': opts.password,
2242 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2243 'forceurl': opts.geturl,
2244 'forcetitle': opts.gettitle,
2245 'forcethumbnail': opts.getthumbnail,
2246 'forcedescription': opts.getdescription,
2247 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2248 'format': opts.format,
2249 'format_limit': opts.format_limit,
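			# the and/or chain below yields the first applicable output
			# template, falling back to u'%(id)s.%(ext)s'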
2250 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
2251 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2252 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2253 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
2254 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2255 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
2256 or u'%(id)s.%(ext)s'),
2257 'ignoreerrors': opts.ignoreerrors,
2258 'ratelimit': opts.ratelimit,
2259 'nooverwrites': opts.nooverwrites,
2260 'retries': opts.retries,
2261 'continuedl': opts.continue_dl,
2262 'noprogress': opts.noprogress,
2263 'playliststart': opts.playliststart,
2264 })
2265 fd.add_info_extractor(youtube_search_ie)
2266 fd.add_info_extractor(youtube_pl_ie)
2267 fd.add_info_extractor(youtube_user_ie)
2268 fd.add_info_extractor(metacafe_ie)
2269 fd.add_info_extractor(dailymotion_ie)
2270 fd.add_info_extractor(youtube_ie)
2271 fd.add_info_extractor(google_ie)
2272 fd.add_info_extractor(google_search_ie)
2273 fd.add_info_extractor(photobucket_ie)
2274 fd.add_info_extractor(yahoo_ie)
2275 fd.add_info_extractor(yahoo_search_ie)
2276
2277 # This must come last since it's the
2278 # fallback if none of the others work
2279 fd.add_info_extractor(generic_ie)
2280
2281 # Update version
2282 if opts.update_self:
2283 update_self(fd, sys.argv[0])
2284
2285 # Maybe do nothing
2286 if len(all_urls) < 1:
2287 if not opts.update_self:
2288 parser.error(u'you must provide at least one URL')
2289 else:
2290 sys.exit()
2291 retcode = fd.download(all_urls)
2292 sys.exit(retcode)
2293
2294 except DownloadError:
2295 sys.exit(1)
2296 except SameFileError:
2297 sys.exit(u'ERROR: fixed output name but more than one file to download')
2298 except KeyboardInterrupt:
2299 sys.exit(u'\nERROR: Interrupted by user')