]> Raphaël G. Git Repositories - youtubedl/blob - youtube-dl
24722d292ecadfd4b16e2775ca9a53efc145628a
[youtubedl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
7 import cookielib
8 import htmlentitydefs
9 import httplib
10 import locale
11 import math
12 import netrc
13 import os
14 import os.path
15 import re
16 import socket
17 import string
18 import subprocess
19 import sys
20 import time
21 import urllib
22 import urllib2
23
# parse_qs was moved from the cgi module to the urlparse module recently.
try:
    from urlparse import parse_qs
except ImportError:
    # Older interpreters only provide it in cgi; the signature is the same.
    from cgi import parse_qs
29
# HTTP headers attached to every request, imitating a desktop Firefox
# browser (see the User-Agent string below).
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101028 Firefox/3.6.12',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
}
36
# Unicode string of the characters that "simplified" titles keep verbatim
# (ASCII letters and digits); any other run of characters is replaced.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
38
# English month name -> zero-padded month number, used when parsing the
# upload date scraped from a video page.
month_name_to_number = {
    'January': '01',
    'February': '02',
    'March': '03',
    'April': '04',
    'May': '05',
    'June': '06',
    'July': '07',
    'August': '08',
    'September': '09',
    'October': '10',
    'November': '11',
    'December': '12',
}
53
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    # The original routed this through a one-shot generator and called
    # .next() on it, which added nothing; a plain try/except is equivalent.
    try:
        pref = locale.getpreferredencoding()
        # Make sure the reported codec actually works before trusting it.
        u'TEST'.encode(pref)
    except Exception:
        # Broken or unknown locale: fall back to a safe default.  Narrowed
        # from a bare 'except:' so KeyboardInterrupt/SystemExit still escape.
        pref = 'UTF-8'
    return pref
69
70 def htmlentity_transform(matchobj):
71 """Transforms an HTML entity to a Unicode character.
72
73 This function receives a match object and is intended to be used with
74 the re.sub() function.
75 """
76 entity = matchobj.group(1)
77
78 # Known non-numeric HTML entity
79 if entity in htmlentitydefs.name2codepoint:
80 return unichr(htmlentitydefs.name2codepoint[entity])
81
82 # Unicode character
83 mobj = re.match(ur'(?u)#(x?\d+)', entity)
84 if mobj is not None:
85 numstr = mobj.group(1)
86 if numstr.startswith(u'x'):
87 base = 16
88 numstr = u'0%s' % numstr
89 else:
90 base = 10
91 return unichr(long(numstr, base))
92
93 # Unknown entity in name, return its literal representation
94 return (u'&%s;' % entity)
95
96 def sanitize_title(utitle):
97 """Sanitizes a video title so it could be used as part of a filename."""
98 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
99 return utitle.replace(unicode(os.sep), u'%')
100
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            # '-' means write to standard output; on Windows the fd must be
            # switched to binary mode so the byte stream is not mangled.
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout, filename)
        stream = open(filename, open_mode)
        return (stream, filename)
    except (IOError, OSError), err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(filename, open_mode)
        return (stream, filename)
126
127
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.

    Raised from FileDownloader.trouble() when 'ignoreerrors' is not set.
    """
    pass
136
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk,
    i.e. several URLs with a fixed (non-templated) output template.
    """
    pass
144
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.  It is caught by
    FileDownloader.process_info().
    """
    pass
152
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.  It is raised by
    FileDownloader.process_info() when the actual download fails with
    an OS-level error.
    """
    pass
160
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # downloaded: number of bytes actually received
        # expected: Content-Length announced by the server
        self.downloaded = downloaded
        self.expected = expected
175
class FileDownloader(object):
    """File Downloader class.

    File downloader objects are the ones responsible of downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, file downloader objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the file downloader handles it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    asks the FileDownloader to process the video information, possibly
    downloading the video.

    File downloaders accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The FileDownloader also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username: Username for authentication purposes.
    password: Password for authentication purposes.
    usenetrc: Use netrc for authentication instead.
    quiet: Do not print messages to stdout.
    forceurl: Force printing final URL.
    forcetitle: Force printing title.
    forcethumbnail: Force printing thumbnail URL.
    forcedescription: Force printing description.
    simulate: Do not download the video files.
    format: Video format code.
    format_limit: Highest quality format to try.
    outtmpl: Template for output names.
    ignoreerrors: Do not stop on download errors.
    ratelimit: Download speed limit, in bytes/sec.
    nooverwrites: Prevent overwriting files.
    retries: Number of times to retry for HTTP error 5xx
    continuedl: Try to continue downloads if possible.
    noprogress: Do not print the progress bar.
    playliststart: Playlist item to start at.
    playlistend: Playlist item to end at.
    logtostderr: Log messages to stderr instead of stdout.
    """

    # Class-level defaults; every field is (re)assigned per instance in __init__.
    params = None
    _ies = []
    _pps = []
    _download_retcode = None
    _num_downloads = None
    _screen_file = None

    def __init__(self, params):
        """Create a FileDownloader object with the given options."""
        self._ies = []
        self._pps = []
        self._download_retcode = 0
        self._num_downloads = 0
        # The 'logtostderr' bool indexes the pair: False -> stdout, True -> stderr.
        self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
        self.params = params

    @staticmethod
    def pmkdir(filename):
        """Create directory components in filename. Similar to Unix "mkdir -p"."""
        components = filename.split(os.sep)
        # Ancestor paths, shortest first; the final component (the file
        # itself) is deliberately excluded by the xrange upper bound.
        aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
        aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
        for dir in aggregate:
            if not os.path.exists(dir):
                os.mkdir(dir)

    @staticmethod
    def format_bytes(bytes):
        """Return a human-readable string (e.g. '1.23M') for a byte count."""
        if bytes is None:
            return 'N/A'
        if type(bytes) is str:
            bytes = float(bytes)
        if bytes == 0.0:
            exponent = 0
        else:
            # Largest power of 1024 that fits in the value.
            exponent = long(math.log(bytes, 1024.0))
        suffix = 'bkMGTPEZY'[exponent]
        converted = float(bytes) / float(1024**exponent)
        return '%.2f%s' % (converted, suffix)

    @staticmethod
    def calc_percent(byte_counter, data_len):
        """Format completion as a fixed-width percentage string ('---.-%' if unknown)."""
        if data_len is None:
            return '---.-%'
        return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

    @staticmethod
    def calc_eta(start, now, total, current):
        """Estimate remaining time as 'MM:SS', or '--:--' when it cannot be computed."""
        if total is None:
            return '--:--'
        dif = now - start
        if current == 0 or dif < 0.001: # One millisecond
            return '--:--'
        rate = float(current) / dif
        eta = long((float(total) - float(current)) / rate)
        (eta_mins, eta_secs) = divmod(eta, 60)
        if eta_mins > 99:
            return '--:--'
        return '%02d:%02d' % (eta_mins, eta_secs)

    @staticmethod
    def calc_speed(start, now, bytes):
        """Format the average transfer speed since `start` as a 10-char string."""
        dif = now - start
        if bytes == 0 or dif < 0.001: # One millisecond
            return '%10s' % '---b/s'
        return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

    @staticmethod
    def best_block_size(elapsed_time, bytes):
        """Choose the next read size, adapting to the last measured transfer rate."""
        # Allow the block size to at most halve or double per iteration.
        new_min = max(bytes / 2.0, 1.0)
        new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
        if elapsed_time < 0.001:
            return long(new_max)
        rate = bytes / elapsed_time
        if rate > new_max:
            return long(new_max)
        if rate < new_min:
            return long(new_min)
        return long(rate)

    @staticmethod
    def parse_bytes(bytestr):
        """Parse a string indicating a byte quantity into a long integer."""
        matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
        if matchobj is None:
            return None
        number = float(matchobj.group(1))
        # An empty suffix gives str.index('') == 0, i.e. a multiplier of 1.
        multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
        return long(round(number * multiplier))

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        ie.set_downloader(self)

    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        self._pps.append(pp)
        pp.set_downloader(self)

    def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
        """Print message to stdout if not in quiet mode."""
        try:
            if not self.params.get('quiet', False):
                terminator = [u'\n', u''][skip_eol]
                print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
                self._screen_file.flush()
        except (UnicodeEncodeError), err:
            if not ignore_encoding_errors:
                raise

    def to_stderr(self, message):
        """Print message to stderr."""
        print >>sys.stderr, message.encode(preferredencoding())

    def fixed_template(self):
        """Checks if the output template is fixed (contains no %(...)s fields)."""
        return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

    def trouble(self, message=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.
        """
        if message is not None:
            self.to_stderr(message)
        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message)
        self._download_retcode = 1

    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit."""
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
            return
        now = time.time()
        elapsed = now - start_time
        if elapsed <= 0.0:
            return
        speed = float(byte_counter) / elapsed
        if speed > rate_limit:
            # Sleep just long enough to bring the average speed back to the limit.
            time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

    def report_destination(self, filename):
        """Report destination filename."""
        self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

    def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
        """Report download progress."""
        if self.params.get('noprogress', False):
            return
        self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

    def report_resuming_byte(self, resume_len):
        """Report attempt to resume at given byte."""
        self.to_screen(u'[download] Resuming download at byte %s' % resume_len)

    def report_retry(self, count, retries):
        """Report retry in case of HTTP error 5xx"""
        self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        try:
            self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except (UnicodeEncodeError), err:
            # Fall back to a message without the (unencodable) filename.
            self.to_screen(u'[download] The file has already been downloaded')

    def report_unable_to_resume(self):
        """Report it was impossible to resume download."""
        self.to_screen(u'[download] Unable to resume')

    def report_finish(self):
        """Report download finished."""
        if self.params.get('noprogress', False):
            self.to_screen(u'[download] Download completed')
        else:
            # Just terminate the progress line that was left without a newline.
            self.to_screen(u'')

    def increment_downloads(self):
        """Increment the ordinal that assigns a number to each file."""
        self._num_downloads += 1

    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor."""
        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            # Forced printings
            if self.params.get('forcetitle', False):
                print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forceurl', False):
                print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
                print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
            if self.params.get('forcedescription', False) and 'description' in info_dict:
                print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')

            return

        # Expand the output template into the destination filename.
        try:
            template_dict = dict(info_dict)
            template_dict['epoch'] = unicode(long(time.time()))
            template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
            filename = self.params['outtmpl'] % template_dict
        except (ValueError, KeyError), err:
            self.trouble(u'ERROR: invalid system charset or erroneous output template')
            return
        if self.params.get('nooverwrites', False) and os.path.exists(filename):
            self.to_stderr(u'WARNING: file exists and will be skipped')
            return

        try:
            self.pmkdir(filename)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to create directories: %s' % str(err))
            return

        try:
            success = self._do_download(filename, info_dict['url'].encode('utf-8'), info_dict.get('player_url', None))
        except (OSError, IOError), err:
            raise UnavailableVideoError
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.trouble(u'ERROR: unable to download video data: %s' % str(err))
            return
        except (ContentTooShortError, ), err:
            self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
            return

        if success:
            try:
                self.post_process(filename, info_dict)
            except (PostProcessingError), err:
                self.trouble(u'ERROR: postprocessing: %s' % str(err))
                return

    def download(self, url_list):
        """Download a given list of URLs."""
        if len(url_list) > 1 and self.fixed_template():
            # Several URLs would all be written to the same fixed filename.
            raise SameFileError(self.params['outtmpl'])

        for url in url_list:
            suitable_found = False
            for ie in self._ies:
                # Go to next InfoExtractor if not suitable
                if not ie.suitable(url):
                    continue

                # Suitable InfoExtractor found
                suitable_found = True

                # Extract information from URL and process it
                ie.extract(url)

                # Suitable InfoExtractor had been found; go to next URL
                break

            if not suitable_found:
                self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

        return self._download_retcode

    def post_process(self, filename, ie_info):
        """Run the postprocessing chain on the given file."""
        info = dict(ie_info)
        info['filepath'] = filename
        for pp in self._pps:
            info = pp.run(info)
            if info is None:
                # A postprocessor returning None stops the chain.
                break

    def _download_with_rtmpdump(self, filename, url, player_url):
        """Download an rtmp URL by driving the external rtmpdump tool."""
        self.report_destination(filename)

        # Check for rtmpdump first
        try:
            subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
        except (OSError, IOError):
            self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
            return False

        # Download using rtmpdump. rtmpdump returns exit code 2 when
        # the connection was interrumpted and resuming appears to be
        # possible. This is part of rtmpdump's normal usage, AFAIK.
        basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', filename]
        retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
        while retval == 2 or retval == 1:
            prevsize = os.path.getsize(filename)
            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
            time.sleep(5.0) # This seems to be needed
            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
            cursize = os.path.getsize(filename)
            if prevsize == cursize and retval == 1:
                # No forward progress and rtmpdump still failing: stop retrying.
                break
        if retval == 0:
            self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
            return True
        else:
            self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
            return False

    def _do_download(self, filename, url, player_url):
        """Download url to filename; returns True on success, False otherwise."""
        # Attempt to download using rtmpdump
        if url.startswith('rtmp'):
            return self._download_with_rtmpdump(filename, url, player_url)

        stream = None
        open_mode = 'wb'
        # basic_request is kept without the Range header for the 416 fallback below.
        basic_request = urllib2.Request(url, None, std_headers)
        request = urllib2.Request(url, None, std_headers)

        # Establish possible resume length
        if os.path.isfile(filename):
            resume_len = os.path.getsize(filename)
        else:
            resume_len = 0

        # Request parameters in case of being able to resume
        if self.params.get('continuedl', False) and resume_len != 0:
            self.report_resuming_byte(resume_len)
            request.add_header('Range','bytes=%d-' % resume_len)
            open_mode = 'ab'

        count = 0
        retries = self.params.get('retries', 0)
        while count <= retries:
            # Establish connection
            try:
                data = urllib2.urlopen(request)
                break
            except (urllib2.HTTPError, ), err:
                if (err.code < 500 or err.code >= 600) and err.code != 416:
                    # Unexpected HTTP error
                    raise
                elif err.code == 416:
                    # Unable to resume (requested range not satisfiable)
                    try:
                        # Open the connection again without the range header
                        data = urllib2.urlopen(basic_request)
                        content_length = data.info()['Content-Length']
                    except (urllib2.HTTPError, ), err:
                        if err.code < 500 or err.code >= 600:
                            raise
                    else:
                        # Examine the reported length
                        if (content_length is not None and
                            (resume_len - 100 < long(content_length) < resume_len + 100)):
                            # The file had already been fully downloaded.
                            # Explanation to the above condition: in issue #175 it was revealed that
                            # YouTube sometimes adds or removes a few bytes from the end of the file,
                            # changing the file size slightly and causing problems for some users. So
                            # I decided to implement a suggested change and consider the file
                            # completely downloaded if the file size differs less than 100 bytes from
                            # the one in the hard drive.
                            self.report_file_already_downloaded(filename)
                            return True
                        else:
                            # The length does not match, we start the download over
                            self.report_unable_to_resume()
                            open_mode = 'wb'
                            break
            # Retry
            count += 1
            if count <= retries:
                self.report_retry(count, retries)

        if count > retries:
            self.trouble(u'ERROR: giving up after %s retries' % retries)
            return False

        data_len = data.info().get('Content-length', None)
        data_len_str = self.format_bytes(data_len)
        byte_counter = 0
        block_size = 1024
        start = time.time()
        while True:
            # Download and write
            before = time.time()
            data_block = data.read(block_size)
            after = time.time()
            data_block_len = len(data_block)
            if data_block_len == 0:
                break
            byte_counter += data_block_len

            # Open file just in time
            if stream is None:
                try:
                    (stream, filename) = sanitize_open(filename, open_mode)
                    self.report_destination(filename)
                except (OSError, IOError), err:
                    self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
                    return False
            try:
                stream.write(data_block)
            except (IOError, OSError), err:
                self.trouble(u'\nERROR: unable to write data: %s' % str(err))
                return False
            block_size = self.best_block_size(after - before, data_block_len)

            # Progress message
            percent_str = self.calc_percent(byte_counter, data_len)
            eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
            speed_str = self.calc_speed(start, time.time(), byte_counter)
            self.report_progress(percent_str, data_len_str, speed_str, eta_str)

            # Apply rate limit
            self.slow_down(start, byte_counter)

        self.report_finish()
        # data_len is the raw header string, so the byte counter is compared
        # after converting it to a string as well.
        if data_len is not None and str(byte_counter) != data_len:
            raise ContentTooShortError(byte_counter, long(data_len))
        return True
641
class InfoExtractor(object):
    """Information Extractor class.

    Given a URL, an information extractor produces a dictionary describing
    the video (or videos) that URL refers to: the real video URL, the
    literal and simplified titles, the uploader and others. The dictionary
    is handed to the FileDownloader, which processes it and possibly
    downloads the video to the file system, among other possible outcomes.
    The dictionaries must include the following fields:

    id: Video identifier.
    url: Final video URL.
    uploader: Nickname of the video uploader.
    title: Literal title.
    stitle: Simplified title.
    ext: Video filename extension.
    format: Video format.
    player_url: SWF Player URL (may be None).

    The following fields are optional. Their primary purpose is to allow
    youtube-dl to serve as the backend for a video search function, such
    as the one in youtube2mp3. They are only used when their respective
    forced printing functions are called:

    thumbnail: Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods, as well as the suitable() static method.
    Probably, they should also be instantiated and added to the main
    downloader.
    """

    # Flipped to True by initialize() once _real_initialize() has run.
    _ready = False
    # The FileDownloader this extractor reports to (None until registered).
    _downloader = None

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @staticmethod
    def suitable(url):
        """Receives a URL and returns True if suitable for this IE."""
        return False

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
712
713 class YoutubeIE(InfoExtractor):
714 """Information extractor for youtube.com."""
715
716 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
717 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
718 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
719 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
720 _NETRC_MACHINE = 'youtube'
721 # Listed in order of quality
722 _available_formats = ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
723 _video_extensions = {
724 '13': '3gp',
725 '17': 'mp4',
726 '18': 'mp4',
727 '22': 'mp4',
728 '37': 'mp4',
729 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
730 '43': 'webm',
731 '45': 'webm',
732 }
733
734 @staticmethod
735 def suitable(url):
736 return (re.match(YoutubeIE._VALID_URL, url) is not None)
737
738 def report_lang(self):
739 """Report attempt to set language."""
740 self._downloader.to_screen(u'[youtube] Setting language')
741
742 def report_login(self):
743 """Report attempt to log in."""
744 self._downloader.to_screen(u'[youtube] Logging in')
745
746 def report_age_confirmation(self):
747 """Report attempt to confirm age."""
748 self._downloader.to_screen(u'[youtube] Confirming age')
749
750 def report_video_webpage_download(self, video_id):
751 """Report attempt to download video webpage."""
752 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
753
754 def report_video_info_webpage_download(self, video_id):
755 """Report attempt to download video info webpage."""
756 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
757
758 def report_information_extraction(self, video_id):
759 """Report attempt to extract video information."""
760 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
761
762 def report_unavailable_format(self, video_id, format):
763 """Report extracted video URL."""
764 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
765
766 def report_rtmp_download(self):
767 """Indicate the download will use the RTMP protocol."""
768 self._downloader.to_screen(u'[youtube] RTMP download detected')
769
770 def _real_initialize(self):
771 if self._downloader is None:
772 return
773
774 username = None
775 password = None
776 downloader_params = self._downloader.params
777
778 # Attempt to use provided username and password or .netrc data
779 if downloader_params.get('username', None) is not None:
780 username = downloader_params['username']
781 password = downloader_params['password']
782 elif downloader_params.get('usenetrc', False):
783 try:
784 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
785 if info is not None:
786 username = info[0]
787 password = info[2]
788 else:
789 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
790 except (IOError, netrc.NetrcParseError), err:
791 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
792 return
793
794 # Set language
795 request = urllib2.Request(self._LANG_URL, None, std_headers)
796 try:
797 self.report_lang()
798 urllib2.urlopen(request).read()
799 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
800 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
801 return
802
803 # No authentication to be performed
804 if username is None:
805 return
806
807 # Log in
808 login_form = {
809 'current_form': 'loginForm',
810 'next': '/',
811 'action_login': 'Log In',
812 'username': username,
813 'password': password,
814 }
815 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
816 try:
817 self.report_login()
818 login_results = urllib2.urlopen(request).read()
819 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
820 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
821 return
822 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
823 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
824 return
825
826 # Confirm age
827 age_form = {
828 'next_url': '/',
829 'action_confirm': 'Confirm',
830 }
831 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
832 try:
833 self.report_age_confirmation()
834 age_results = urllib2.urlopen(request).read()
835 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
836 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
837 return
838
839 def _real_extract(self, url):
840 # Extract video id from URL
841 mobj = re.match(self._VALID_URL, url)
842 if mobj is None:
843 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
844 return
845 video_id = mobj.group(2)
846
847 # Get video webpage
848 self.report_video_webpage_download(video_id)
849 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id, None, std_headers)
850 try:
851 video_webpage = urllib2.urlopen(request).read()
852 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
853 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
854 return
855
856 # Attempt to extract SWF player URL
857 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
858 if mobj is not None:
859 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
860 else:
861 player_url = None
862
863 # Get video info
864 self.report_video_info_webpage_download(video_id)
865 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
866 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
867 % (video_id, el_type))
868 request = urllib2.Request(video_info_url, None, std_headers)
869 try:
870 video_info_webpage = urllib2.urlopen(request).read()
871 video_info = parse_qs(video_info_webpage)
872 if 'token' in video_info:
873 break
874 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
875 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
876 return
877 if 'token' not in video_info:
878 if 'reason' in video_info:
879 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
880 else:
881 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
882 return
883
884 # Start extracting information
885 self.report_information_extraction(video_id)
886
887 # uploader
888 if 'author' not in video_info:
889 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
890 return
891 video_uploader = urllib.unquote_plus(video_info['author'][0])
892
893 # title
894 if 'title' not in video_info:
895 self._downloader.trouble(u'ERROR: unable to extract video title')
896 return
897 video_title = urllib.unquote_plus(video_info['title'][0])
898 video_title = video_title.decode('utf-8')
899 video_title = sanitize_title(video_title)
900
901 # simplified title
902 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
903 simple_title = simple_title.strip(ur'_')
904
905 # thumbnail image
906 if 'thumbnail_url' not in video_info:
907 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
908 video_thumbnail = ''
909 else: # don't panic if we can't find it
910 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
911
912 # upload date
913 upload_date = u'NA'
914 mobj = re.search(r'id="eow-date".*?>(.*?)</span>', video_webpage, re.DOTALL)
915 if mobj is not None:
916 try:
917 if ',' in mobj.group(1):
918 # Month Day, Year
919 m, d, y = mobj.group(1).replace(',', '').split()
920 else:
921 # Day Month Year, we'll suppose
922 d, m, y = mobj.group(1).split()
923 m = month_name_to_number[m]
924 d = '%02d' % (long(d))
925 upload_date = '%s%s%s' % (y, m, d)
926 except:
927 upload_date = u'NA'
928
929 # description
930 video_description = 'No description available.'
931 if self._downloader.params.get('forcedescription', False):
932 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
933 if mobj is not None:
934 video_description = mobj.group(1)
935
936 # token
937 video_token = urllib.unquote_plus(video_info['token'][0])
938
939 # Decide which formats to download
940 requested_format = self._downloader.params.get('format', None)
941 get_video_template = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=&ps=&asv=&fmt=%%s' % (video_id, video_token)
942
943 if 'fmt_url_map' in video_info:
944 url_map = dict(tuple(pair.split('|')) for pair in video_info['fmt_url_map'][0].split(','))
945 format_limit = self._downloader.params.get('format_limit', None)
946 if format_limit is not None and format_limit in self._available_formats:
947 format_list = self._available_formats[self._available_formats.index(format_limit):]
948 else:
949 format_list = self._available_formats
950 existing_formats = [x for x in format_list if x in url_map]
951 if len(existing_formats) == 0:
952 self._downloader.trouble(u'ERROR: no known formats available for video')
953 return
954 if requested_format is None:
955 video_url_list = [(existing_formats[0], get_video_template % existing_formats[0])] # Best quality
956 elif requested_format == '-1':
957 video_url_list = [(f, get_video_template % f) for f in existing_formats] # All formats
958 else:
959 video_url_list = [(requested_format, get_video_template % requested_format)] # Specific format
960
961 elif 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
962 self.report_rtmp_download()
963 video_url_list = [(None, video_info['conn'][0])]
964
965 else:
966 self._downloader.trouble(u'ERROR: no fmt_url_map or conn information found in video info')
967 return
968
969 for format_param, video_real_url in video_url_list:
970 # At this point we have a new video
971 self._downloader.increment_downloads()
972
973 # Extension
974 video_extension = self._video_extensions.get(format_param, 'flv')
975
976 # Find the video URL in fmt_url_map or conn paramters
977 try:
978 # Process video information
979 self._downloader.process_info({
980 'id': video_id.decode('utf-8'),
981 'url': video_real_url.decode('utf-8'),
982 'uploader': video_uploader.decode('utf-8'),
983 'upload_date': upload_date,
984 'title': video_title,
985 'stitle': simple_title,
986 'ext': video_extension.decode('utf-8'),
987 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
988 'thumbnail': video_thumbnail.decode('utf-8'),
989 'description': video_description.decode('utf-8'),
990 'player_url': player_url,
991 })
992 except UnavailableVideoError, err:
993 self._downloader.trouble(u'ERROR: unable to download video (format may not be available)')
994
995
996 class MetacafeIE(InfoExtractor):
997 """Information Extractor for metacafe.com."""
998
999 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1000 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1001 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1002 _youtube_ie = None
1003
1004 def __init__(self, youtube_ie, downloader=None):
1005 InfoExtractor.__init__(self, downloader)
1006 self._youtube_ie = youtube_ie
1007
1008 @staticmethod
1009 def suitable(url):
1010 return (re.match(MetacafeIE._VALID_URL, url) is not None)
1011
1012 def report_disclaimer(self):
1013 """Report disclaimer retrieval."""
1014 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1015
1016 def report_age_confirmation(self):
1017 """Report attempt to confirm age."""
1018 self._downloader.to_screen(u'[metacafe] Confirming age')
1019
1020 def report_download_webpage(self, video_id):
1021 """Report webpage download."""
1022 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1023
1024 def report_extraction(self, video_id):
1025 """Report information extraction."""
1026 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1027
1028 def _real_initialize(self):
1029 # Retrieve disclaimer
1030 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
1031 try:
1032 self.report_disclaimer()
1033 disclaimer = urllib2.urlopen(request).read()
1034 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1035 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1036 return
1037
1038 # Confirm age
1039 disclaimer_form = {
1040 'filters': '0',
1041 'submit': "Continue - I'm over 18",
1042 }
1043 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
1044 try:
1045 self.report_age_confirmation()
1046 disclaimer = urllib2.urlopen(request).read()
1047 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1048 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1049 return
1050
1051 def _real_extract(self, url):
1052 # Extract id and simplified title from URL
1053 mobj = re.match(self._VALID_URL, url)
1054 if mobj is None:
1055 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1056 return
1057
1058 video_id = mobj.group(1)
1059
1060 # Check if video comes from YouTube
1061 mobj2 = re.match(r'^yt-(.*)$', video_id)
1062 if mobj2 is not None:
1063 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1064 return
1065
1066 # At this point we have a new video
1067 self._downloader.increment_downloads()
1068
1069 simple_title = mobj.group(2).decode('utf-8')
1070
1071 # Retrieve video webpage to extract further information
1072 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1073 try:
1074 self.report_download_webpage(video_id)
1075 webpage = urllib2.urlopen(request).read()
1076 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1077 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1078 return
1079
1080 # Extract URL, uploader and title from webpage
1081 self.report_extraction(video_id)
1082 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1083 if mobj is not None:
1084 mediaURL = urllib.unquote(mobj.group(1))
1085 video_extension = mediaURL[-3:]
1086
1087 # Extract gdaKey if available
1088 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1089 if mobj is None:
1090 video_url = mediaURL
1091 else:
1092 gdaKey = mobj.group(1)
1093 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1094 else:
1095 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1096 if mobj is None:
1097 self._downloader.trouble(u'ERROR: unable to extract media URL')
1098 return
1099 vardict = parse_qs(mobj.group(1))
1100 if 'mediaData' not in vardict:
1101 self._downloader.trouble(u'ERROR: unable to extract media URL')
1102 return
1103 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1104 if mobj is None:
1105 self._downloader.trouble(u'ERROR: unable to extract media URL')
1106 return
1107 mediaURL = mobj.group(1).replace('\\/', '/')
1108 video_extension = mediaURL[-3:]
1109 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1110
1111 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1112 if mobj is None:
1113 self._downloader.trouble(u'ERROR: unable to extract title')
1114 return
1115 video_title = mobj.group(1).decode('utf-8')
1116 video_title = sanitize_title(video_title)
1117
1118 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1119 if mobj is None:
1120 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1121 return
1122 video_uploader = mobj.group(1)
1123
1124 try:
1125 # Process video information
1126 self._downloader.process_info({
1127 'id': video_id.decode('utf-8'),
1128 'url': video_url.decode('utf-8'),
1129 'uploader': video_uploader.decode('utf-8'),
1130 'upload_date': u'NA',
1131 'title': video_title,
1132 'stitle': simple_title,
1133 'ext': video_extension.decode('utf-8'),
1134 'format': u'NA',
1135 'player_url': None,
1136 })
1137 except UnavailableVideoError:
1138 self._downloader.trouble(u'ERROR: unable to download video')
1139
1140
1141 class DailymotionIE(InfoExtractor):
1142 """Information Extractor for Dailymotion"""
1143
1144 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1145
1146 def __init__(self, downloader=None):
1147 InfoExtractor.__init__(self, downloader)
1148
1149 @staticmethod
1150 def suitable(url):
1151 return (re.match(DailymotionIE._VALID_URL, url) is not None)
1152
1153 def report_download_webpage(self, video_id):
1154 """Report webpage download."""
1155 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1156
1157 def report_extraction(self, video_id):
1158 """Report information extraction."""
1159 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1160
1161 def _real_initialize(self):
1162 return
1163
1164 def _real_extract(self, url):
1165 # Extract id and simplified title from URL
1166 mobj = re.match(self._VALID_URL, url)
1167 if mobj is None:
1168 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1169 return
1170
1171 # At this point we have a new video
1172 self._downloader.increment_downloads()
1173 video_id = mobj.group(1)
1174
1175 simple_title = mobj.group(2).decode('utf-8')
1176 video_extension = 'flv'
1177
1178 # Retrieve video webpage to extract further information
1179 request = urllib2.Request(url)
1180 try:
1181 self.report_download_webpage(video_id)
1182 webpage = urllib2.urlopen(request).read()
1183 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1184 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1185 return
1186
1187 # Extract URL, uploader and title from webpage
1188 self.report_extraction(video_id)
1189 mobj = re.search(r'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage)
1190 if mobj is None:
1191 self._downloader.trouble(u'ERROR: unable to extract media URL')
1192 return
1193 mediaURL = urllib.unquote(mobj.group(1))
1194
1195 # if needed add http://www.dailymotion.com/ if relative URL
1196
1197 video_url = mediaURL
1198
1199 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1200 mobj = re.search(r'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage)
1201 if mobj is None:
1202 self._downloader.trouble(u'ERROR: unable to extract title')
1203 return
1204 video_title = mobj.group(1).decode('utf-8')
1205 video_title = sanitize_title(video_title)
1206
1207 mobj = re.search(r'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a>', webpage)
1208 if mobj is None:
1209 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1210 return
1211 video_uploader = mobj.group(1)
1212
1213 try:
1214 # Process video information
1215 self._downloader.process_info({
1216 'id': video_id.decode('utf-8'),
1217 'url': video_url.decode('utf-8'),
1218 'uploader': video_uploader.decode('utf-8'),
1219 'upload_date': u'NA',
1220 'title': video_title,
1221 'stitle': simple_title,
1222 'ext': video_extension.decode('utf-8'),
1223 'format': u'NA',
1224 'player_url': None,
1225 })
1226 except UnavailableVideoError:
1227 self._downloader.trouble(u'ERROR: unable to download video')
1228
1229 class GoogleIE(InfoExtractor):
1230 """Information extractor for video.google.com."""
1231
1232 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1233
1234 def __init__(self, downloader=None):
1235 InfoExtractor.__init__(self, downloader)
1236
1237 @staticmethod
1238 def suitable(url):
1239 return (re.match(GoogleIE._VALID_URL, url) is not None)
1240
1241 def report_download_webpage(self, video_id):
1242 """Report webpage download."""
1243 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1244
1245 def report_extraction(self, video_id):
1246 """Report information extraction."""
1247 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1248
1249 def _real_initialize(self):
1250 return
1251
1252 def _real_extract(self, url):
1253 # Extract id from URL
1254 mobj = re.match(self._VALID_URL, url)
1255 if mobj is None:
1256 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1257 return
1258
1259 # At this point we have a new video
1260 self._downloader.increment_downloads()
1261 video_id = mobj.group(1)
1262
1263 video_extension = 'mp4'
1264
1265 # Retrieve video webpage to extract further information
1266 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1267 try:
1268 self.report_download_webpage(video_id)
1269 webpage = urllib2.urlopen(request).read()
1270 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1271 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1272 return
1273
1274 # Extract URL, uploader, and title from webpage
1275 self.report_extraction(video_id)
1276 mobj = re.search(r"download_url:'([^']+)'", webpage)
1277 if mobj is None:
1278 video_extension = 'flv'
1279 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1280 if mobj is None:
1281 self._downloader.trouble(u'ERROR: unable to extract media URL')
1282 return
1283 mediaURL = urllib.unquote(mobj.group(1))
1284 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1285 mediaURL = mediaURL.replace('\\x26', '\x26')
1286
1287 video_url = mediaURL
1288
1289 mobj = re.search(r'<title>(.*)</title>', webpage)
1290 if mobj is None:
1291 self._downloader.trouble(u'ERROR: unable to extract title')
1292 return
1293 video_title = mobj.group(1).decode('utf-8')
1294 video_title = sanitize_title(video_title)
1295 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1296
1297 # Extract video description
1298 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1299 if mobj is None:
1300 self._downloader.trouble(u'ERROR: unable to extract video description')
1301 return
1302 video_description = mobj.group(1).decode('utf-8')
1303 if not video_description:
1304 video_description = 'No description available.'
1305
1306 # Extract video thumbnail
1307 if self._downloader.params.get('forcethumbnail', False):
1308 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1309 try:
1310 webpage = urllib2.urlopen(request).read()
1311 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1312 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1313 return
1314 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1315 if mobj is None:
1316 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1317 return
1318 video_thumbnail = mobj.group(1)
1319 else: # we need something to pass to process_info
1320 video_thumbnail = ''
1321
1322
1323 try:
1324 # Process video information
1325 self._downloader.process_info({
1326 'id': video_id.decode('utf-8'),
1327 'url': video_url.decode('utf-8'),
1328 'uploader': u'NA',
1329 'upload_date': u'NA',
1330 'title': video_title,
1331 'stitle': simple_title,
1332 'ext': video_extension.decode('utf-8'),
1333 'format': u'NA',
1334 'player_url': None,
1335 })
1336 except UnavailableVideoError:
1337 self._downloader.trouble(u'ERROR: unable to download video')
1338
1339
1340 class PhotobucketIE(InfoExtractor):
1341 """Information extractor for photobucket.com."""
1342
1343 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1344
1345 def __init__(self, downloader=None):
1346 InfoExtractor.__init__(self, downloader)
1347
1348 @staticmethod
1349 def suitable(url):
1350 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1351
1352 def report_download_webpage(self, video_id):
1353 """Report webpage download."""
1354 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1355
1356 def report_extraction(self, video_id):
1357 """Report information extraction."""
1358 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1359
1360 def _real_initialize(self):
1361 return
1362
1363 def _real_extract(self, url):
1364 # Extract id from URL
1365 mobj = re.match(self._VALID_URL, url)
1366 if mobj is None:
1367 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1368 return
1369
1370 # At this point we have a new video
1371 self._downloader.increment_downloads()
1372 video_id = mobj.group(1)
1373
1374 video_extension = 'flv'
1375
1376 # Retrieve video webpage to extract further information
1377 request = urllib2.Request(url)
1378 try:
1379 self.report_download_webpage(video_id)
1380 webpage = urllib2.urlopen(request).read()
1381 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1382 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1383 return
1384
1385 # Extract URL, uploader, and title from webpage
1386 self.report_extraction(video_id)
1387 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1388 if mobj is None:
1389 self._downloader.trouble(u'ERROR: unable to extract media URL')
1390 return
1391 mediaURL = urllib.unquote(mobj.group(1))
1392
1393 video_url = mediaURL
1394
1395 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1396 if mobj is None:
1397 self._downloader.trouble(u'ERROR: unable to extract title')
1398 return
1399 video_title = mobj.group(1).decode('utf-8')
1400 video_title = sanitize_title(video_title)
1401 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1402
1403 video_uploader = mobj.group(2).decode('utf-8')
1404
1405 try:
1406 # Process video information
1407 self._downloader.process_info({
1408 'id': video_id.decode('utf-8'),
1409 'url': video_url.decode('utf-8'),
1410 'uploader': video_uploader,
1411 'upload_date': u'NA',
1412 'title': video_title,
1413 'stitle': simple_title,
1414 'ext': video_extension.decode('utf-8'),
1415 'format': u'NA',
1416 'player_url': None,
1417 })
1418 except UnavailableVideoError:
1419 self._downloader.trouble(u'ERROR: unable to download video')
1420
1421
1422 class YahooIE(InfoExtractor):
1423 """Information extractor for video.yahoo.com."""
1424
1425 # _VALID_URL matches all Yahoo! Video URLs
1426 # _VPAGE_URL matches only the extractable '/watch/' URLs
1427 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1428 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1429
1430 def __init__(self, downloader=None):
1431 InfoExtractor.__init__(self, downloader)
1432
1433 @staticmethod
1434 def suitable(url):
1435 return (re.match(YahooIE._VALID_URL, url) is not None)
1436
1437 def report_download_webpage(self, video_id):
1438 """Report webpage download."""
1439 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1440
1441 def report_extraction(self, video_id):
1442 """Report information extraction."""
1443 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1444
1445 def _real_initialize(self):
1446 return
1447
1448 def _real_extract(self, url, new_video=True):
1449 # Extract ID from URL
1450 mobj = re.match(self._VALID_URL, url)
1451 if mobj is None:
1452 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1453 return
1454
1455 # At this point we have a new video
1456 self._downloader.increment_downloads()
1457 video_id = mobj.group(2)
1458 video_extension = 'flv'
1459
1460 # Rewrite valid but non-extractable URLs as
1461 # extractable English language /watch/ URLs
1462 if re.match(self._VPAGE_URL, url) is None:
1463 request = urllib2.Request(url)
1464 try:
1465 webpage = urllib2.urlopen(request).read()
1466 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1467 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1468 return
1469
1470 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1471 if mobj is None:
1472 self._downloader.trouble(u'ERROR: Unable to extract id field')
1473 return
1474 yahoo_id = mobj.group(1)
1475
1476 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1477 if mobj is None:
1478 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1479 return
1480 yahoo_vid = mobj.group(1)
1481
1482 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1483 return self._real_extract(url, new_video=False)
1484
1485 # Retrieve video webpage to extract further information
1486 request = urllib2.Request(url)
1487 try:
1488 self.report_download_webpage(video_id)
1489 webpage = urllib2.urlopen(request).read()
1490 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1491 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1492 return
1493
1494 # Extract uploader and title from webpage
1495 self.report_extraction(video_id)
1496 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1497 if mobj is None:
1498 self._downloader.trouble(u'ERROR: unable to extract video title')
1499 return
1500 video_title = mobj.group(1).decode('utf-8')
1501 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1502
1503 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1504 if mobj is None:
1505 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1506 return
1507 video_uploader = mobj.group(1).decode('utf-8')
1508
1509 # Extract video thumbnail
1510 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1511 if mobj is None:
1512 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1513 return
1514 video_thumbnail = mobj.group(1).decode('utf-8')
1515
1516 # Extract video description
1517 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1518 if mobj is None:
1519 self._downloader.trouble(u'ERROR: unable to extract video description')
1520 return
1521 video_description = mobj.group(1).decode('utf-8')
1522 if not video_description: video_description = 'No description available.'
1523
1524 # Extract video height and width
1525 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1526 if mobj is None:
1527 self._downloader.trouble(u'ERROR: unable to extract video height')
1528 return
1529 yv_video_height = mobj.group(1)
1530
1531 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1532 if mobj is None:
1533 self._downloader.trouble(u'ERROR: unable to extract video width')
1534 return
1535 yv_video_width = mobj.group(1)
1536
1537 # Retrieve video playlist to extract media URL
1538 # I'm not completely sure what all these options are, but we
1539 # seem to need most of them, otherwise the server sends a 401.
1540 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1541 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1542 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1543 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1544 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1545 try:
1546 self.report_download_webpage(video_id)
1547 webpage = urllib2.urlopen(request).read()
1548 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1549 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1550 return
1551
1552 # Extract media URL from playlist XML
1553 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1554 if mobj is None:
1555 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1556 return
1557 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1558 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1559
1560 try:
1561 # Process video information
1562 self._downloader.process_info({
1563 'id': video_id.decode('utf-8'),
1564 'url': video_url,
1565 'uploader': video_uploader,
1566 'upload_date': u'NA',
1567 'title': video_title,
1568 'stitle': simple_title,
1569 'ext': video_extension.decode('utf-8'),
1570 'thumbnail': video_thumbnail.decode('utf-8'),
1571 'description': video_description,
1572 'thumbnail': video_thumbnail,
1573 'description': video_description,
1574 'player_url': None,
1575 })
1576 except UnavailableVideoError:
1577 self._downloader.trouble(u'ERROR: unable to download video')
1578
1579
1580 class GenericIE(InfoExtractor):
1581 """Generic last-resort information extractor."""
1582
1583 def __init__(self, downloader=None):
1584 InfoExtractor.__init__(self, downloader)
1585
1586 @staticmethod
1587 def suitable(url):
1588 return True
1589
1590 def report_download_webpage(self, video_id):
1591 """Report webpage download."""
1592 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1593 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1594
1595 def report_extraction(self, video_id):
1596 """Report information extraction."""
1597 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1598
1599 def _real_initialize(self):
1600 return
1601
1602 def _real_extract(self, url):
1603 # At this point we have a new video
1604 self._downloader.increment_downloads()
1605
1606 video_id = url.split('/')[-1]
1607 request = urllib2.Request(url)
1608 try:
1609 self.report_download_webpage(video_id)
1610 webpage = urllib2.urlopen(request).read()
1611 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1612 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1613 return
1614 except ValueError, err:
1615 # since this is the last-resort InfoExtractor, if
1616 # this error is thrown, it'll be thrown here
1617 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1618 return
1619
1620 # Start with something easy: JW Player in SWFObject
1621 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1622 if mobj is None:
1623 # Broaden the search a little bit
1624 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1625 if mobj is None:
1626 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1627 return
1628
1629 # It's possible that one of the regexes
1630 # matched, but returned an empty group:
1631 if mobj.group(1) is None:
1632 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1633 return
1634
1635 video_url = urllib.unquote(mobj.group(1))
1636 video_id = os.path.basename(video_url)
1637
1638 # here's a fun little line of code for you:
1639 video_extension = os.path.splitext(video_id)[1][1:]
1640 video_id = os.path.splitext(video_id)[0]
1641
1642 # it's tempting to parse this further, but you would
1643 # have to take into account all the variations like
1644 # Video Title - Site Name
1645 # Site Name | Video Title
1646 # Video Title - Tagline | Site Name
1647 # and so on and so forth; it's just not practical
1648 mobj = re.search(r'<title>(.*)</title>', webpage)
1649 if mobj is None:
1650 self._downloader.trouble(u'ERROR: unable to extract title')
1651 return
1652 video_title = mobj.group(1).decode('utf-8')
1653 video_title = sanitize_title(video_title)
1654 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1655
1656 # video uploader is domain name
1657 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1658 if mobj is None:
1659 self._downloader.trouble(u'ERROR: unable to extract title')
1660 return
1661 video_uploader = mobj.group(1).decode('utf-8')
1662
1663 try:
1664 # Process video information
1665 self._downloader.process_info({
1666 'id': video_id.decode('utf-8'),
1667 'url': video_url.decode('utf-8'),
1668 'uploader': video_uploader,
1669 'upload_date': u'NA',
1670 'title': video_title,
1671 'stitle': simple_title,
1672 'ext': video_extension.decode('utf-8'),
1673 'format': u'NA',
1674 'player_url': None,
1675 })
1676 except UnavailableVideoError, err:
1677 self._downloader.trouble(u'ERROR: unable to download video')
1678
1679
1680 class YoutubeSearchIE(InfoExtractor):
1681 """Information Extractor for YouTube search queries."""
1682 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1683 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1684 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1685 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1686 _youtube_ie = None
1687 _max_youtube_results = 1000
1688
1689 def __init__(self, youtube_ie, downloader=None):
1690 InfoExtractor.__init__(self, downloader)
1691 self._youtube_ie = youtube_ie
1692
1693 @staticmethod
1694 def suitable(url):
1695 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1696
1697 def report_download_page(self, query, pagenum):
1698 """Report attempt to download playlist page with given number."""
1699 query = query.decode(preferredencoding())
1700 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1701
1702 def _real_initialize(self):
1703 self._youtube_ie.initialize()
1704
1705 def _real_extract(self, query):
1706 mobj = re.match(self._VALID_QUERY, query)
1707 if mobj is None:
1708 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1709 return
1710
1711 prefix, query = query.split(':')
1712 prefix = prefix[8:]
1713 query = query.encode('utf-8')
1714 if prefix == '':
1715 self._download_n_results(query, 1)
1716 return
1717 elif prefix == 'all':
1718 self._download_n_results(query, self._max_youtube_results)
1719 return
1720 else:
1721 try:
1722 n = long(prefix)
1723 if n <= 0:
1724 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1725 return
1726 elif n > self._max_youtube_results:
1727 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1728 n = self._max_youtube_results
1729 self._download_n_results(query, n)
1730 return
1731 except ValueError: # parsing prefix as integer fails
1732 self._download_n_results(query, 1)
1733 return
1734
	def _download_n_results(self, query, n):
		"""Downloads a specified number of results for a query"""

		video_ids = []
		# Track ids already collected so duplicates across result pages are
		# only downloaded once.
		already_seen = set()
		pagenum = 1

		while True:
			self.report_download_page(query, pagenum)
			result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
			request = urllib2.Request(result_url, None, std_headers)
			try:
				page = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
				return

			# Extract video identifiers
			for mobj in re.finditer(self._VIDEO_INDICATOR, page):
				# NOTE(review): the id is recovered by slicing the raw match
				# text around '=' instead of using a capture group; this
				# assumes the exact shape of _VIDEO_INDICATOR (defined above
				# this view) — confirm before touching either one.
				video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
				if video_id not in already_seen:
					video_ids.append(video_id)
					already_seen.add(video_id)
					if len(video_ids) == n:
						# Specified n videos reached
						for id in video_ids:
							self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
						return

			# No "next page" link means the result set is exhausted; download
			# whatever was found even if fewer than n.
			if re.search(self._MORE_PAGES_INDICATOR, page) is None:
				for id in video_ids:
					self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
				return

			pagenum = pagenum + 1
1770
1771 class GoogleSearchIE(InfoExtractor):
1772 """Information Extractor for Google Video search queries."""
1773 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1774 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1775 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1776 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1777 _google_ie = None
1778 _max_google_results = 1000
1779
1780 def __init__(self, google_ie, downloader=None):
1781 InfoExtractor.__init__(self, downloader)
1782 self._google_ie = google_ie
1783
1784 @staticmethod
1785 def suitable(url):
1786 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1787
1788 def report_download_page(self, query, pagenum):
1789 """Report attempt to download playlist page with given number."""
1790 query = query.decode(preferredencoding())
1791 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1792
1793 def _real_initialize(self):
1794 self._google_ie.initialize()
1795
1796 def _real_extract(self, query):
1797 mobj = re.match(self._VALID_QUERY, query)
1798 if mobj is None:
1799 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1800 return
1801
1802 prefix, query = query.split(':')
1803 prefix = prefix[8:]
1804 query = query.encode('utf-8')
1805 if prefix == '':
1806 self._download_n_results(query, 1)
1807 return
1808 elif prefix == 'all':
1809 self._download_n_results(query, self._max_google_results)
1810 return
1811 else:
1812 try:
1813 n = long(prefix)
1814 if n <= 0:
1815 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1816 return
1817 elif n > self._max_google_results:
1818 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1819 n = self._max_google_results
1820 self._download_n_results(query, n)
1821 return
1822 except ValueError: # parsing prefix as integer fails
1823 self._download_n_results(query, 1)
1824 return
1825
1826 def _download_n_results(self, query, n):
1827 """Downloads a specified number of results for a query"""
1828
1829 video_ids = []
1830 already_seen = set()
1831 pagenum = 1
1832
1833 while True:
1834 self.report_download_page(query, pagenum)
1835 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1836 request = urllib2.Request(result_url, None, std_headers)
1837 try:
1838 page = urllib2.urlopen(request).read()
1839 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1840 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1841 return
1842
1843 # Extract video identifiers
1844 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1845 video_id = mobj.group(1)
1846 if video_id not in already_seen:
1847 video_ids.append(video_id)
1848 already_seen.add(video_id)
1849 if len(video_ids) == n:
1850 # Specified n videos reached
1851 for id in video_ids:
1852 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1853 return
1854
1855 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1856 for id in video_ids:
1857 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1858 return
1859
1860 pagenum = pagenum + 1
1861
1862 class YahooSearchIE(InfoExtractor):
1863 """Information Extractor for Yahoo! Video search queries."""
1864 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1865 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1866 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1867 _MORE_PAGES_INDICATOR = r'\s*Next'
1868 _yahoo_ie = None
1869 _max_yahoo_results = 1000
1870
1871 def __init__(self, yahoo_ie, downloader=None):
1872 InfoExtractor.__init__(self, downloader)
1873 self._yahoo_ie = yahoo_ie
1874
1875 @staticmethod
1876 def suitable(url):
1877 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1878
1879 def report_download_page(self, query, pagenum):
1880 """Report attempt to download playlist page with given number."""
1881 query = query.decode(preferredencoding())
1882 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1883
1884 def _real_initialize(self):
1885 self._yahoo_ie.initialize()
1886
1887 def _real_extract(self, query):
1888 mobj = re.match(self._VALID_QUERY, query)
1889 if mobj is None:
1890 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1891 return
1892
1893 prefix, query = query.split(':')
1894 prefix = prefix[8:]
1895 query = query.encode('utf-8')
1896 if prefix == '':
1897 self._download_n_results(query, 1)
1898 return
1899 elif prefix == 'all':
1900 self._download_n_results(query, self._max_yahoo_results)
1901 return
1902 else:
1903 try:
1904 n = long(prefix)
1905 if n <= 0:
1906 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1907 return
1908 elif n > self._max_yahoo_results:
1909 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1910 n = self._max_yahoo_results
1911 self._download_n_results(query, n)
1912 return
1913 except ValueError: # parsing prefix as integer fails
1914 self._download_n_results(query, 1)
1915 return
1916
1917 def _download_n_results(self, query, n):
1918 """Downloads a specified number of results for a query"""
1919
1920 video_ids = []
1921 already_seen = set()
1922 pagenum = 1
1923
1924 while True:
1925 self.report_download_page(query, pagenum)
1926 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1927 request = urllib2.Request(result_url, None, std_headers)
1928 try:
1929 page = urllib2.urlopen(request).read()
1930 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1931 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1932 return
1933
1934 # Extract video identifiers
1935 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1936 video_id = mobj.group(1)
1937 if video_id not in already_seen:
1938 video_ids.append(video_id)
1939 already_seen.add(video_id)
1940 if len(video_ids) == n:
1941 # Specified n videos reached
1942 for id in video_ids:
1943 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1944 return
1945
1946 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1947 for id in video_ids:
1948 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1949 return
1950
1951 pagenum = pagenum + 1
1952
1953 class YoutubePlaylistIE(InfoExtractor):
1954 """Information Extractor for YouTube playlists."""
1955
1956 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1957 _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1958 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1959 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1960 _youtube_ie = None
1961
1962 def __init__(self, youtube_ie, downloader=None):
1963 InfoExtractor.__init__(self, downloader)
1964 self._youtube_ie = youtube_ie
1965
1966 @staticmethod
1967 def suitable(url):
1968 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1969
1970 def report_download_page(self, playlist_id, pagenum):
1971 """Report attempt to download playlist page with given number."""
1972 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1973
1974 def _real_initialize(self):
1975 self._youtube_ie.initialize()
1976
1977 def _real_extract(self, url):
1978 # Extract playlist id
1979 mobj = re.match(self._VALID_URL, url)
1980 if mobj is None:
1981 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1982 return
1983
1984 # Download playlist pages
1985 playlist_id = mobj.group(1)
1986 video_ids = []
1987 pagenum = 1
1988
1989 while True:
1990 self.report_download_page(playlist_id, pagenum)
1991 request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1992 try:
1993 page = urllib2.urlopen(request).read()
1994 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1995 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1996 return
1997
1998 # Extract video identifiers
1999 ids_in_page = []
2000 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2001 if mobj.group(1) not in ids_in_page:
2002 ids_in_page.append(mobj.group(1))
2003 video_ids.extend(ids_in_page)
2004
2005 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2006 break
2007 pagenum = pagenum + 1
2008
2009 playliststart = self._downloader.params.get('playliststart', 1) - 1
2010 playlistend = self._downloader.params.get('playlistend', -1)
2011 video_ids = video_ids[playliststart:playlistend]
2012
2013 for id in video_ids:
2014 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2015 return
2016
2017 class YoutubeUserIE(InfoExtractor):
2018 """Information Extractor for YouTube users."""
2019
2020 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
2021 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2022 _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
2023 _youtube_ie = None
2024
2025 def __init__(self, youtube_ie, downloader=None):
2026 InfoExtractor.__init__(self, downloader)
2027 self._youtube_ie = youtube_ie
2028
2029 @staticmethod
2030 def suitable(url):
2031 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2032
2033 def report_download_page(self, username):
2034 """Report attempt to download user page."""
2035 self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username))
2036
2037 def _real_initialize(self):
2038 self._youtube_ie.initialize()
2039
2040 def _real_extract(self, url):
2041 # Extract username
2042 mobj = re.match(self._VALID_URL, url)
2043 if mobj is None:
2044 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2045 return
2046
2047 # Download user page
2048 username = mobj.group(1)
2049 video_ids = []
2050 pagenum = 1
2051
2052 self.report_download_page(username)
2053 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
2054 try:
2055 page = urllib2.urlopen(request).read()
2056 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2057 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2058 return
2059
2060 # Extract video identifiers
2061 ids_in_page = []
2062
2063 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2064 if mobj.group(1) not in ids_in_page:
2065 ids_in_page.append(mobj.group(1))
2066 video_ids.extend(ids_in_page)
2067
2068 playliststart = self._downloader.params.get('playliststart', 1) - 1
2069 playlistend = self._downloader.params.get('playlistend', -1)
2070 video_ids = video_ids[playliststart:playlistend]
2071
2072 for id in video_ids:
2073 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2074 return
2075
class PostProcessor(object):
	"""Base class for download post-processing steps.

	Instances are registered on a downloader via add_post_processor().
	After a successful download, the downloader walks its registered
	PostProcessors in order, feeding each one's run() return value to the
	next.  The chain ends when a PostProcessor returns None or no more
	remain.

	Like InfoExtractor, this follows a "mutual registration" scheme: the
	downloader keeps a reference to the PostProcessor and the
	PostProcessor keeps a reference back to the downloader.
	"""

	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Attach *downloader* as the owner of this post processor."""
		self._downloader = downloader

	def run(self, information):
		"""Process one downloaded file.

		*information* is an InfoExtractor-style dictionary augmented with
		a "filepath" key naming the downloaded file.

		Returning None stops the postprocessing chain; returning a
		dictionary (possibly this one, with fields changed) passes it on
		to the next PostProcessor.  Implementations may also raise
		PostProcessingError, which the calling downloader handles.
		"""
		return information # by default, do nothing
2121
### MAIN PROGRAM ###
if __name__ == '__main__':
	try:
		# Modules needed only when running the main program
		import getpass
		import optparse

		# Function to update the program file with the latest version from bitbucket.org
		def update_self(downloader, filename):
			# Note: downloader only used for options
			# Refuse to update unless the program file itself is writable.
			if not os.access (filename, os.W_OK):
				sys.exit('ERROR: no write permissions on %s' % filename)

			downloader.to_screen('Updating to latest stable version...')
			# LATEST_VERSION in the repository names the tag to fetch from.
			latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
			latest_version = urllib.urlopen(latest_url).read().strip()
			prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
			newcontent = urllib.urlopen(prog_url).read()
			# Overwrite this very script in place with the downloaded version.
			stream = open(filename, 'w')
			stream.write(newcontent)
			stream.close()
			downloader.to_screen('Updated to version %s' % latest_version)

		# Parse command line
		parser = optparse.OptionParser(
			usage='Usage: %prog [options] url...',
			version='2010.11.19',
			conflict_handler='resolve',
		)

		parser.add_option('-h', '--help',
				action='help', help='print this help text and exit')
		parser.add_option('-v', '--version',
				action='version', help='print program version and exit')
		parser.add_option('-U', '--update',
				action='store_true', dest='update_self', help='update this program to latest stable version')
		parser.add_option('-i', '--ignore-errors',
				action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
		parser.add_option('-r', '--rate-limit',
				dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
		parser.add_option('-R', '--retries',
				dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
		parser.add_option('--playlist-start',
				dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
		parser.add_option('--playlist-end',
				dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)

		authentication = optparse.OptionGroup(parser, 'Authentication Options')
		authentication.add_option('-u', '--username',
				dest='username', metavar='USERNAME', help='account username')
		authentication.add_option('-p', '--password',
				dest='password', metavar='PASSWORD', help='account password')
		authentication.add_option('-n', '--netrc',
				action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
		parser.add_option_group(authentication)

		video_format = optparse.OptionGroup(parser, 'Video Format Options')
		video_format.add_option('-f', '--format',
				action='store', dest='format', metavar='FORMAT', help='video format code')
		video_format.add_option('-m', '--mobile-version',
				action='store_const', dest='format', help='alias for -f 17', const='17')
		video_format.add_option('--all-formats',
				action='store_const', dest='format', help='download all available video formats', const='-1')
		video_format.add_option('--max-quality',
				action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
		video_format.add_option('-b', '--best-quality',
				action='store_true', dest='bestquality', help='download the best video quality (DEPRECATED)')
		parser.add_option_group(video_format)

		verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
		verbosity.add_option('-q', '--quiet',
				action='store_true', dest='quiet', help='activates quiet mode', default=False)
		verbosity.add_option('-s', '--simulate',
				action='store_true', dest='simulate', help='do not download video', default=False)
		verbosity.add_option('-g', '--get-url',
				action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
		verbosity.add_option('-e', '--get-title',
				action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
		verbosity.add_option('--get-thumbnail',
				action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
		verbosity.add_option('--get-description',
				action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
		verbosity.add_option('--no-progress',
				action='store_true', dest='noprogress', help='do not print progress bar', default=False)
		parser.add_option_group(verbosity)

		filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
		filesystem.add_option('-t', '--title',
				action='store_true', dest='usetitle', help='use title in file name', default=False)
		filesystem.add_option('-l', '--literal',
				action='store_true', dest='useliteral', help='use literal title in file name', default=False)
		filesystem.add_option('-A', '--auto-number',
				action='store_true', dest='autonumber', help='number downloaded files starting from 00000', default=False)
		filesystem.add_option('-o', '--output',
				dest='outtmpl', metavar='TEMPLATE', help='output filename template')
		filesystem.add_option('-a', '--batch-file',
				dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
		filesystem.add_option('-w', '--no-overwrites',
				action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
		filesystem.add_option('-c', '--continue',
				action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
		filesystem.add_option('--cookies',
				dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
		parser.add_option_group(filesystem)

		(opts, args) = parser.parse_args()

		# Open appropriate CookieJar
		if opts.cookiefile is None:
			jar = cookielib.CookieJar()
		else:
			try:
				jar = cookielib.MozillaCookieJar(opts.cookiefile)
				# Only load cookies when the file already exists and is readable;
				# a fresh jar will be saved to that path on exit.
				if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
					jar.load()
			except (IOError, OSError), err:
				sys.exit(u'ERROR: unable to open cookie file')

		# General configuration
		cookie_processor = urllib2.HTTPCookieProcessor(jar)
		urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
		urllib2.install_opener(urllib2.build_opener(cookie_processor))
		socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

		# Batch file verification
		batchurls = []
		if opts.batchfile is not None:
			try:
				if opts.batchfile == '-':
					batchfd = sys.stdin
				else:
					batchfd = open(opts.batchfile, 'r')
				batchurls = batchfd.readlines()
				batchurls = [x.strip() for x in batchurls]
				# Drop blank lines and lines starting with '#', '/' or ';' (comments).
				batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
			except IOError:
				sys.exit(u'ERROR: batch file could not be read')
		all_urls = batchurls + args

		# Conflicting, missing and erroneous options
		if opts.bestquality:
			print >>sys.stderr, u'\nWARNING: -b/--best-quality IS DEPRECATED AS IT IS THE DEFAULT BEHAVIOR NOW\n'
		if opts.usenetrc and (opts.username is not None or opts.password is not None):
			parser.error(u'using .netrc conflicts with giving username/password')
		if opts.password is not None and opts.username is None:
			parser.error(u'account username missing')
		if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
			parser.error(u'using output template conflicts with using title, literal title or auto number')
		if opts.usetitle and opts.useliteral:
			parser.error(u'using title conflicts with using literal title')
		# Username given without a password: prompt interactively.
		if opts.username is not None and opts.password is None:
			opts.password = getpass.getpass(u'Type account password and press return:')
		if opts.ratelimit is not None:
			numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
			if numeric_limit is None:
				parser.error(u'invalid rate limit specified')
			opts.ratelimit = numeric_limit
		if opts.retries is not None:
			try:
				opts.retries = long(opts.retries)
			except (TypeError, ValueError), err:
				parser.error(u'invalid retry count specified')
		try:
			opts.playliststart = long(opts.playliststart)
			if opts.playliststart <= 0:
				raise ValueError
		except (TypeError, ValueError), err:
			parser.error(u'invalid playlist start number specified')
		try:
			opts.playlistend = long(opts.playlistend)
			# -1 is the "until the end" sentinel; any other value must be a
			# positive number not smaller than playliststart.
			if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
				raise ValueError
		except (TypeError, ValueError), err:
			parser.error(u'invalid playlist end number specified')

		# Information extractors
		youtube_ie = YoutubeIE()
		metacafe_ie = MetacafeIE(youtube_ie)
		dailymotion_ie = DailymotionIE()
		youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
		youtube_user_ie = YoutubeUserIE(youtube_ie)
		youtube_search_ie = YoutubeSearchIE(youtube_ie)
		google_ie = GoogleIE()
		google_search_ie = GoogleSearchIE(google_ie)
		photobucket_ie = PhotobucketIE()
		yahoo_ie = YahooIE()
		yahoo_search_ie = YahooSearchIE(yahoo_ie)
		generic_ie = GenericIE()

		# File downloader
		fd = FileDownloader({
			'usenetrc': opts.usenetrc,
			'username': opts.username,
			'password': opts.password,
			'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'forceurl': opts.geturl,
			'forcetitle': opts.gettitle,
			'forcethumbnail': opts.getthumbnail,
			'forcedescription': opts.getdescription,
			'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
			'format': opts.format,
			'format_limit': opts.format_limit,
			# First matching template wins: explicit -o, then the
			# format/title/autonumber combinations, then plain "<id>.<ext>".
			'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
				or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
				or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
				or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
				or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
				or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
				or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
				or u'%(id)s.%(ext)s'),
			'ignoreerrors': opts.ignoreerrors,
			'ratelimit': opts.ratelimit,
			'nooverwrites': opts.nooverwrites,
			'retries': opts.retries,
			'continuedl': opts.continue_dl,
			'noprogress': opts.noprogress,
			'playliststart': opts.playliststart,
			'playlistend': opts.playlistend,
			'logtostderr': opts.outtmpl == '-',
			})
		fd.add_info_extractor(youtube_search_ie)
		fd.add_info_extractor(youtube_pl_ie)
		fd.add_info_extractor(youtube_user_ie)
		fd.add_info_extractor(metacafe_ie)
		fd.add_info_extractor(dailymotion_ie)
		fd.add_info_extractor(youtube_ie)
		fd.add_info_extractor(google_ie)
		fd.add_info_extractor(google_search_ie)
		fd.add_info_extractor(photobucket_ie)
		fd.add_info_extractor(yahoo_ie)
		fd.add_info_extractor(yahoo_search_ie)

		# This must come last since it's the
		# fallback if none of the others work
		fd.add_info_extractor(generic_ie)

		# Update version
		if opts.update_self:
			update_self(fd, sys.argv[0])

		# Maybe do nothing
		if len(all_urls) < 1:
			if not opts.update_self:
				parser.error(u'you must provide at least one URL')
			else:
				sys.exit()
		retcode = fd.download(all_urls)

		# Dump cookie jar if requested
		if opts.cookiefile is not None:
			try:
				jar.save()
			except (IOError, OSError), err:
				sys.exit(u'ERROR: unable to save cookie jar')

		sys.exit(retcode)

	except DownloadError:
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')