]> Raphaël G. Git Repositories - youtubedl/blob - youtube-dl
Update the manpage wrt the 'worst' format. Thanks Cristian Rigamonti.
[youtubedl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 __author__ = (
5 'Ricardo Garcia Gonzalez',
6 'Danny Colligan',
7 'Benjamin Johnson',
8 'Vasyl\' Vavrychuk',
9 'Witold Baryluk',
10 'Paweł Paprota',
11 'Gergely Imreh',
12 'Rogério Brito',
13 'Philipp Hagemeister',
14 'Sören Schulze',
15 )
16
17 __license__ = 'Public Domain'
18 __version__ = '2011.10.19'
19
20 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
21
22 import cookielib
23 import datetime
24 import gzip
25 import htmlentitydefs
26 import HTMLParser
27 import httplib
28 import locale
29 import math
30 import netrc
31 import os
32 import os.path
33 import re
34 import socket
35 import string
36 import subprocess
37 import sys
38 import time
39 import urllib
40 import urllib2
41 import warnings
42 import zlib
43
44 if os.name == 'nt':
45 import ctypes
46
47 try:
48 import email.utils
49 except ImportError: # Python 2.4
50 import email.Utils
51 try:
52 import cStringIO as StringIO
53 except ImportError:
54 import StringIO
55
56 # parse_qs was moved from the cgi module to the urlparse module recently.
57 try:
58 from urlparse import parse_qs
59 except ImportError:
60 from cgi import parse_qs
61
62 try:
63 import lxml.etree
64 except ImportError:
65 pass # Handled below
66
67 try:
68 import xml.etree.ElementTree
69 except ImportError: # Python<2.5: Not officially supported, but let it slip
70 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
71
# Default HTTP headers; YoutubeDLHandler.http_request() adds every entry of
# this dictionary to each outgoing request.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}

# Unicode string made of the ASCII letters followed by the ASCII digits.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
81
try:
    import json
except ImportError:  # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
    import re

    class json(object):
        """Decode-only stand-in used when the standard json module is missing."""

        @staticmethod
        def loads(s):
            """Parse the UTF-8 encoded JSON document `s` and return its value.

            Objects become dicts, arrays become lists, strings become
            unicode objects and numbers become int or float. ValueError is
            raised for malformed input and for trailing data.
            """
            s = s.decode('UTF-8')

            # Single-character escapes and the characters they stand for.
            simple_escapes = {
                '"': '"',
                '\\': '\\',
                '/': '/',
                'b': unichr(0x8),
                'f': unichr(0xc),
                'n': '\n',
                'r': '\r',
                't': '\t',
            }
            # A backslash followed by: a surrogate pair, a single \uXXXX
            # escape, any other character, or the end of the string.
            escape_re = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
            number_re = re.compile(r'^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)')

            def raise_error(msg, pos):
                raise ValueError(msg + ' at position ' + str(pos) + ' of ' + repr(s) + ': ' + repr(s[pos:]))

            def skip_space(pos, expect_more=True):
                # Advance past whitespace; optionally insist that something follows.
                while pos < len(s) and s[pos] in ' \t\r\n':
                    pos += 1
                if expect_more and pos >= len(s):
                    raise_error('Premature end', pos)
                return pos

            def decode_escape(match):
                # Translate one backslash escape into the character it denotes.
                esc = match.group(1)
                if esc in simple_escapes:
                    return simple_escapes[esc]
                if esc[0] == 'u':
                    if len(esc) == 1 + 4:
                        return unichr(int(esc[1:5], 16))
                    if len(esc) == 5 + 6 and esc[5:7] == '\\u':
                        # Surrogate pair: combine both halves into one code point.
                        hi = int(esc[1:5], 16)
                        low = int(esc[7:11], 16)
                        return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                raise ValueError('Unknown escape ' + str(esc))

            def parse_string(pos):
                start = pos + 1
                end = start
                while True:
                    end = s.index('"', end)
                    # A quote preceded by an odd number of backslashes is
                    # escaped and therefore does not terminate the string.
                    backslashes = 0
                    while s[end - backslashes - 1] == '\\':
                        backslashes += 1
                    if backslashes % 2 == 0:
                        break
                    end += 1
                text = escape_re.sub(decode_escape, s[start:end])
                return (end + 1, text)

            def parse_obj(pos):
                result = {}
                pos = skip_space(pos + 1)
                if s[pos] == '}':  # Empty dictionary
                    return (pos + 1, result)
                while True:
                    if s[pos] != '"':
                        raise_error('Expected a string object key', pos)
                    pos, key = parse_string(pos)
                    pos = skip_space(pos)
                    if pos >= len(s) or s[pos] != ':':
                        raise_error('Expected a colon', pos)
                    pos, value = parse(pos + 1)
                    result[key] = value
                    pos = skip_space(pos)
                    if s[pos] == '}':
                        return (pos + 1, result)
                    if s[pos] != ',':
                        raise_error('Expected comma or closing curly brace', pos)
                    pos = skip_space(pos + 1)

            def parse_array(pos):
                result = []
                pos = skip_space(pos + 1)
                if s[pos] == ']':  # Empty array
                    return (pos + 1, result)
                while True:
                    pos, value = parse(pos)
                    result.append(value)
                    pos = skip_space(pos)  # Raise exception if premature end
                    if s[pos] == ']':
                        return (pos + 1, result)
                    if s[pos] != ',':
                        raise_error('Expected a comma or closing bracket', pos)
                    pos = skip_space(pos + 1)

            def parse_discrete(pos):
                for literal, value in (('true', True), ('false', False), ('null', None)):
                    if s.startswith(literal, pos):
                        return (pos + len(literal), value)
                raise_error('Not a boolean (or null)', pos)

            def parse_number(pos):
                mobj = number_re.match(s[pos:])
                if mobj is None:
                    raise_error('Not a number', pos)
                digits = mobj.group(1)
                if '.' in digits or 'e' in digits or 'E' in digits:
                    return (pos + len(digits), float(digits))
                return (pos + len(digits), int(digits))

            # First character of a value -> parser; anything else is a number.
            dispatch = {
                '{': parse_obj,
                '[': parse_array,
                '"': parse_string,
                't': parse_discrete,
                'f': parse_discrete,
                'n': parse_discrete,
            }

            def parse(pos):
                pos = skip_space(pos)
                pos, value = dispatch.get(s[pos], parse_number)(pos)
                pos = skip_space(pos, False)
                return (pos, value)

            end, result = parse(0)
            if end < len(s):
                raise ValueError('Extra data at end of input (index ' + str(end) + ' of ' + repr(s) + ': ' + repr(s[end:]) + ')')
            return result
194
def preferredencoding():
    """Get preferred encoding.

    Returns the encoding reported by locale.getpreferredencoding() when it
    can actually be used to encode text, and 'UTF-8' otherwise (unknown
    codec, broken locale settings, ...).
    """
    try:
        pref = locale.getpreferredencoding()
        # Make sure the codec really exists and works before trusting it.
        u'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'
    return pref
210
211
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity text
    without the leading '&' and the trailing ';'.

    Named entities (amp, eacute, ...) and numeric references in decimal
    (#233) or hexadecimal (#xE9) form are decoded. Anything else, including
    numeric references outside the range the interpreter can represent, is
    returned in its literal '&...;' form.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Numeric character reference. The whole entity has to match so that
    # hexadecimal digits a-f are not silently cut off.
    mobj = re.match(u'(?u)#(?:[xX]([0-9a-fA-F]+)|([0-9]+))$', entity)
    if mobj is not None:
        hexdigits, decdigits = mobj.groups()
        if hexdigits is not None:
            codepoint = int(hexdigits, 16)
        else:
            codepoint = int(decdigits, 10)
        try:
            return unichr(codepoint)
        except (ValueError, OverflowError):
            # Not a representable code point: keep the entity as written.
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
237
238
def sanitize_title(utitle):
    """Make a video title usable as part of a filename.

    HTML entities are decoded through htmlentity_transform() and every
    occurrence of the path separator (os.sep) is replaced by '%'.
    """
    decoded = re.sub(u'(?u)&(.+?);', htmlentity_transform, utitle)
    separator = unicode(os.sep)
    return decoded.replace(separator, u'%')
243
244
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. The special name u'-' stands for
    standard output (switched to binary mode on win32). If opening fails,
    the characters that win32 forbids in file names are replaced by '#' in
    the last path component only, so the directory part of the path is left
    untouched, and the open is retried once. A failure of that second
    attempt is raised to the caller, like the standard open() function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout, filename)
        stream = open(filename, open_mode)
        return (stream, filename)
    except (IOError, OSError):
        # In case of error, try to remove win32 forbidden chars. Only the
        # file name itself is rewritten: replacing separators in the whole
        # path would flatten the directories into the name.
        directory, basename = os.path.split(filename)
        basename = re.sub(u'[/<>:"\\|\\?\\*]', u'#', basename)
        filename = os.path.join(directory, basename)

        # An exception here should be caught in the caller
        stream = open(filename, open_mode)
        return (stream, filename)
270
271
def timeconvert(timestr):
    """Convert an RFC 2822 time string into a system timestamp.

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
279
280
class DownloadError(Exception):
    """Signals a download problem.

    FileDownloader objects raise it when they hit an error and are not
    configured to continue on errors. The exception carries the
    corresponding error message.
    """
289
290
class SameFileError(Exception):
    """Signals an output file name clash.

    FileDownloader objects raise it when they detect that several
    downloads would have to be written to the same file on disk.
    """
298
299
class PostProcessingError(Exception):
    """Signals a failed postprocessing step.

    A PostProcessor's .run() method may raise it to indicate an error in
    the postprocessing task.
    """
307
308
class UnavailableVideoError(Exception):
    """Signals an unavailable format.

    Raised when a video is requested in a format that is not available
    for that video.
    """
316
317
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.

    Attributes:
    downloaded: Number of bytes actually received.
    expected:   Number of bytes announced by the server.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Hand the values to Exception as well, so that args, repr() and
        # str() describe the error instead of being empty.
        Exception.__init__(self, downloaded, expected)
        self.downloaded = downloaded
        self.expected = expected
332
333
class YoutubeDLHandler(urllib2.HTTPHandler):
    """urllib2 handler that adds the standard headers and decodes responses.

    Once installed in an OpenerDirector, every HTTP request gets the
    headers from std_headers, and responses whose Content-encoding is gzip
    or deflate are transparently decompressed. A request that must not be
    compressed only has to carry the "Youtubedl-No-Compression" header in
    the program code; that marker header is stripped before the real
    request is made.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress `data`, first as a raw deflate stream, then as zlib data."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying the given status code.

        When urllib2.addinfourl has no getcode attribute, the constructor is
        called without the code, which is then set as an attribute instead.
        """
        if not hasattr(urllib2.addinfourl, 'getcode'):
            response = urllib2.addinfourl(stream, headers, url)
            response.code = code
            return response
        return urllib2.addinfourl(stream, headers, url, code)

    def http_request(self, req):
        """Install the standard headers and honour the no-compression marker."""
        for name, value in std_headers.items():
            if name in req.headers:
                del req.headers[name]
            req.add_header(name, value)
        if 'Youtubedl-no-compression' in req.headers:
            req.headers.pop('Accept-encoding', None)
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        """Return `resp`, wrapped in a decoding stream if it is compressed."""
        encoding = resp.headers.get('Content-encoding', '')
        if encoding == 'gzip':
            body = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
        elif encoding == 'deflate':
            body = StringIO.StringIO(self.deflate(resp.read()))
        else:
            return resp
        decoded = self.addinfourl_wrapper(body, resp.headers, resp.url, resp.code)
        decoded.msg = resp.msg
        return decoded
391
392
393 class FileDownloader(object):
394 """File Downloader class.
395
396 File downloader objects are the ones responsible for downloading the
397 actual video file and writing it to disk if the user has requested
398 it, among some other tasks. In most cases there should be one per
399 program. As, given a video URL, the downloader doesn't know how to
400 extract all the needed information, task that InfoExtractors do, it
401 has to pass the URL to one of them.
402
403 For this, file downloader objects have a method that allows
404 InfoExtractors to be registered in a given order. When it is passed
405 a URL, the file downloader hands it to the first InfoExtractor it
406 finds that reports being able to handle it. The InfoExtractor extracts
407 all the information about the video or videos the URL refers to, and
408 asks the FileDownloader to process the video information, possibly
409 downloading the video.
410
411 File downloaders accept a lot of parameters. In order not to saturate
412 the object constructor with arguments, it receives a dictionary of
413 options instead. These options are available through the params
414 attribute for the InfoExtractors to use. The FileDownloader also
415 registers itself as the downloader in charge for the InfoExtractors
416 that are added to it, so this is a "mutual registration".
417
418 Available options:
419
420 username: Username for authentication purposes.
421 password: Password for authentication purposes.
422 usenetrc: Use netrc for authentication instead.
423 quiet: Do not print messages to stdout.
424 forceurl: Force printing final URL.
425 forcetitle: Force printing title.
426 forcethumbnail: Force printing thumbnail URL.
427 forcedescription: Force printing description.
428 forcefilename: Force printing final filename.
429 simulate: Do not download the video files.
430 format: Video format code.
431 format_limit: Highest quality format to try.
432 outtmpl: Template for output names.
433 ignoreerrors: Do not stop on download errors.
434 ratelimit: Download speed limit, in bytes/sec.
435 nooverwrites: Prevent overwriting files.
436 retries: Number of times to retry for HTTP error 5xx
437 continuedl: Try to continue downloads if possible.
438 noprogress: Do not print the progress bar.
439 playliststart: Playlist item to start at.
440 playlistend: Playlist item to end at.
441 matchtitle: Download only matching titles.
442 rejecttitle: Reject downloads for matching titles.
443 logtostderr: Log messages to stderr instead of stdout.
444 consoletitle: Display progress in console window's titlebar.
445 nopart: Do not use temporary .part files.
446 updatetime: Use the Last-modified header to set output file timestamps.
447 writedescription: Write the video description to a .description file
448 writeinfojson: Write the video metadata to a .info.json file
449 """
450
451 params = None
452 _ies = []
453 _pps = []
454 _download_retcode = None
455 _num_downloads = None
456 _screen_file = None
457
458 def __init__(self, params):
459 """Create a FileDownloader object with the given options."""
460 self._ies = []
461 self._pps = []
462 self._download_retcode = 0
463 self._num_downloads = 0
464 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
465 self.params = params
466
467 @staticmethod
468 def format_bytes(bytes):
469 if bytes is None:
470 return 'N/A'
471 if type(bytes) is str:
472 bytes = float(bytes)
473 if bytes == 0.0:
474 exponent = 0
475 else:
476 exponent = long(math.log(bytes, 1024.0))
477 suffix = 'bkMGTPEZY'[exponent]
478 converted = float(bytes) / float(1024 ** exponent)
479 return '%.2f%s' % (converted, suffix)
480
481 @staticmethod
482 def calc_percent(byte_counter, data_len):
483 if data_len is None:
484 return '---.-%'
485 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
486
487 @staticmethod
488 def calc_eta(start, now, total, current):
489 if total is None:
490 return '--:--'
491 dif = now - start
492 if current == 0 or dif < 0.001: # One millisecond
493 return '--:--'
494 rate = float(current) / dif
495 eta = long((float(total) - float(current)) / rate)
496 (eta_mins, eta_secs) = divmod(eta, 60)
497 if eta_mins > 99:
498 return '--:--'
499 return '%02d:%02d' % (eta_mins, eta_secs)
500
501 @staticmethod
502 def calc_speed(start, now, bytes):
503 dif = now - start
504 if bytes == 0 or dif < 0.001: # One millisecond
505 return '%10s' % '---b/s'
506 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
507
508 @staticmethod
509 def best_block_size(elapsed_time, bytes):
510 new_min = max(bytes / 2.0, 1.0)
511 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
512 if elapsed_time < 0.001:
513 return long(new_max)
514 rate = bytes / elapsed_time
515 if rate > new_max:
516 return long(new_max)
517 if rate < new_min:
518 return long(new_min)
519 return long(rate)
520
521 @staticmethod
522 def parse_bytes(bytestr):
523 """Parse a string indicating a byte quantity into a long integer."""
524 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
525 if matchobj is None:
526 return None
527 number = float(matchobj.group(1))
528 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
529 return long(round(number * multiplier))
530
531 def add_info_extractor(self, ie):
532 """Add an InfoExtractor object to the end of the list."""
533 self._ies.append(ie)
534 ie.set_downloader(self)
535
536 def add_post_processor(self, pp):
537 """Add a PostProcessor object to the end of the chain."""
538 self._pps.append(pp)
539 pp.set_downloader(self)
540
541 def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
542 """Print message to stdout if not in quiet mode."""
543 try:
544 if not self.params.get('quiet', False):
545 terminator = [u'\n', u''][skip_eol]
546 print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()),
547 self._screen_file.flush()
548 except (UnicodeEncodeError), err:
549 if not ignore_encoding_errors:
550 raise
551
552 def to_stderr(self, message):
553 """Print message to stderr."""
554 print >>sys.stderr, message.encode(preferredencoding())
555
556 def to_cons_title(self, message):
557 """Set console/terminal window title to message."""
558 if not self.params.get('consoletitle', False):
559 return
560 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
561 # c_wchar_p() might not be necessary if `message` is
562 # already of type unicode()
563 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
564 elif 'TERM' in os.environ:
565 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
566
567 def fixed_template(self):
568 """Checks if the output template is fixed."""
569 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
570
571 def trouble(self, message=None):
572 """Determine action to take when a download problem appears.
573
574 Depending on if the downloader has been configured to ignore
575 download errors or not, this method may throw an exception or
576 not when errors are found, after printing the message.
577 """
578 if message is not None:
579 self.to_stderr(message)
580 if not self.params.get('ignoreerrors', False):
581 raise DownloadError(message)
582 self._download_retcode = 1
583
584 def slow_down(self, start_time, byte_counter):
585 """Sleep if the download speed is over the rate limit."""
586 rate_limit = self.params.get('ratelimit', None)
587 if rate_limit is None or byte_counter == 0:
588 return
589 now = time.time()
590 elapsed = now - start_time
591 if elapsed <= 0.0:
592 return
593 speed = float(byte_counter) / elapsed
594 if speed > rate_limit:
595 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
596
597 def temp_name(self, filename):
598 """Returns a temporary filename for the given filename."""
599 if self.params.get('nopart', False) or filename == u'-' or \
600 (os.path.exists(filename) and not os.path.isfile(filename)):
601 return filename
602 return filename + u'.part'
603
604 def undo_temp_name(self, filename):
605 if filename.endswith(u'.part'):
606 return filename[:-len(u'.part')]
607 return filename
608
609 def try_rename(self, old_filename, new_filename):
610 try:
611 if old_filename == new_filename:
612 return
613 os.rename(old_filename, new_filename)
614 except (IOError, OSError), err:
615 self.trouble(u'ERROR: unable to rename file')
616
617 def try_utime(self, filename, last_modified_hdr):
618 """Try to set the last-modified time of the given file."""
619 if last_modified_hdr is None:
620 return
621 if not os.path.isfile(filename):
622 return
623 timestr = last_modified_hdr
624 if timestr is None:
625 return
626 filetime = timeconvert(timestr)
627 if filetime is None:
628 return filetime
629 try:
630 os.utime(filename, (time.time(), filetime))
631 except:
632 pass
633 return filetime
634
635 def report_writedescription(self, descfn):
636 """ Report that the description file is being written """
637 self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True)
638
639 def report_writeinfojson(self, infofn):
640 """ Report that the metadata file has been written """
641 self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True)
642
643 def report_destination(self, filename):
644 """Report destination filename."""
645 self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)
646
647 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
648 """Report download progress."""
649 if self.params.get('noprogress', False):
650 return
651 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
652 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
653 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
654 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
655
656 def report_resuming_byte(self, resume_len):
657 """Report attempt to resume at given byte."""
658 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
659
660 def report_retry(self, count, retries):
661 """Report retry in case of HTTP error 5xx"""
662 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
663
664 def report_file_already_downloaded(self, file_name):
665 """Report file has already been fully downloaded."""
666 try:
667 self.to_screen(u'[download] %s has already been downloaded' % file_name)
668 except (UnicodeEncodeError), err:
669 self.to_screen(u'[download] The file has already been downloaded')
670
671 def report_unable_to_resume(self):
672 """Report it was impossible to resume download."""
673 self.to_screen(u'[download] Unable to resume')
674
675 def report_finish(self):
676 """Report download finished."""
677 if self.params.get('noprogress', False):
678 self.to_screen(u'[download] Download completed')
679 else:
680 self.to_screen(u'')
681
682 def increment_downloads(self):
683 """Increment the ordinal that assigns a number to each file."""
684 self._num_downloads += 1
685
686 def prepare_filename(self, info_dict):
687 """Generate the output filename."""
688 try:
689 template_dict = dict(info_dict)
690 template_dict['epoch'] = unicode(long(time.time()))
691 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
692 filename = self.params['outtmpl'] % template_dict
693 return filename
694 except (ValueError, KeyError), err:
695 self.trouble(u'ERROR: invalid system charset or erroneous output template')
696 return None
697
    def process_info(self, info_dict):
        """Process a single dictionary returned by an InfoExtractor.

        Steps, in order: build the output filename, print the fields
        requested by the force* options, stop if simulating, apply the
        title filters and the no-overwrite rule, create the target
        directory, optionally write the .description and .info.json files,
        and finally download and postprocess the video unless the
        'skip_download' option is set.

        Most failures are reported through trouble(). An OSError/IOError
        raised by the download is turned into UnavailableVideoError.
        """
        filename = self.prepare_filename(info_dict)

        # Forced printings
        if self.params.get('forcetitle', False):
            print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forceurl', False):
            print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
            print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forcedescription', False) and 'description' in info_dict:
            print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forcefilename', False) and filename is not None:
            print filename.encode(preferredencoding(), 'xmlcharrefreplace')
        if self.params.get('forceformat', False):
            print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            return

        # prepare_filename() returns None after reporting a template error
        if filename is None:
            return

        # Title filters: both patterns are searched case-insensitively in
        # the encoded title.
        # NOTE(review): `title` is an encoded byte string while the message
        # templates below are unicode; this looks like it can raise
        # UnicodeDecodeError for non-ASCII titles -- confirm.
        matchtitle=self.params.get('matchtitle',False)
        rejecttitle=self.params.get('rejecttitle',False)
        title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
        if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
            self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
            return
        if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
            self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
            return

        if self.params.get('nooverwrites', False) and os.path.exists(filename):
            self.to_stderr(u'WARNING: file exists and will be skipped')
            return

        # Create the directory of the output file if it is missing
        try:
            dn = os.path.dirname(filename)
            if dn != '' and not os.path.exists(dn):
                os.makedirs(dn)
        except (OSError, IOError), err:
            self.trouble(u'ERROR: unable to create directory ' + unicode(err))
            return

        # Write the description, UTF-8 encoded, next to the video file
        if self.params.get('writedescription', False):
            try:
                descfn = filename + '.description'
                self.report_writedescription(descfn)
                descfile = open(descfn, 'wb')
                try:
                    descfile.write(info_dict['description'].encode('utf-8'))
                finally:
                    descfile.close()
            except (OSError, IOError):
                self.trouble(u'ERROR: Cannot write description file ' + descfn)
                return

        # Write the metadata as JSON next to the video file
        if self.params.get('writeinfojson', False):
            infofn = filename + '.info.json'
            self.report_writeinfojson(infofn)
            # Probe for an encoder: the fallback json class defined in this
            # file only provides loads().
            try:
                json.dump
            except (NameError,AttributeError):
                self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
                return
            try:
                infof = open(infofn, 'wb')
                try:
                    # The 'urlhandle' entry is left out of the JSON file
                    json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
                    json.dump(json_info_dict, infof)
                finally:
                    infof.close()
            except (OSError, IOError):
                self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
                return

        if not self.params.get('skip_download', False):
            # NOTE(review): urllib2.URLError is presumably a subclass of
            # IOError on Python 2, in which case the first handler below
            # also catches it -- confirm the intended handler order.
            try:
                success = self._do_download(filename, info_dict)
            except (OSError, IOError), err:
                raise UnavailableVideoError
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self.trouble(u'ERROR: unable to download video data: %s' % str(err))
                return
            except (ContentTooShortError, ), err:
                self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                return

            # Run the postprocessing chain only after a successful download
            if success:
                try:
                    self.post_process(filename, info_dict)
                except (PostProcessingError), err:
                    self.trouble(u'ERROR: postprocessing: %s' % str(err))
                    return
795
796 def download(self, url_list):
797 """Download a given list of URLs."""
798 if len(url_list) > 1 and self.fixed_template():
799 raise SameFileError(self.params['outtmpl'])
800
801 for url in url_list:
802 suitable_found = False
803 for ie in self._ies:
804 # Go to next InfoExtractor if not suitable
805 if not ie.suitable(url):
806 continue
807
808 # Suitable InfoExtractor found
809 suitable_found = True
810
811 # Extract information from URL and process it
812 ie.extract(url)
813
814 # Suitable InfoExtractor had been found; go to next URL
815 break
816
817 if not suitable_found:
818 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
819
820 return self._download_retcode
821
822 def post_process(self, filename, ie_info):
823 """Run the postprocessing chain on the given file."""
824 info = dict(ie_info)
825 info['filepath'] = filename
826 for pp in self._pps:
827 info = pp.run(info)
828 if info is None:
829 break
830
831 def _download_with_rtmpdump(self, filename, url, player_url):
832 self.report_destination(filename)
833 tmpfilename = self.temp_name(filename)
834
835 # Check for rtmpdump first
836 try:
837 subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
838 except (OSError, IOError):
839 self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
840 return False
841
842 # Download using rtmpdump. rtmpdump returns exit code 2 when
843 # the connection was interrumpted and resuming appears to be
844 # possible. This is part of rtmpdump's normal usage, AFAIK.
845 basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
846 retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
847 while retval == 2 or retval == 1:
848 prevsize = os.path.getsize(tmpfilename)
849 self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
850 time.sleep(5.0) # This seems to be needed
851 retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
852 cursize = os.path.getsize(tmpfilename)
853 if prevsize == cursize and retval == 1:
854 break
855 # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
856 if prevsize == cursize and retval == 2 and cursize > 1024:
857 self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
858 retval = 0
859 break
860 if retval == 0:
861 self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
862 self.try_rename(tmpfilename, filename)
863 return True
864 else:
865 self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
866 return False
867
868 def _do_download(self, filename, info_dict):
869 url = info_dict['url']
870 player_url = info_dict.get('player_url', None)
871
872 # Check file already present
873 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
874 self.report_file_already_downloaded(filename)
875 return True
876
877 # Attempt to download using rtmpdump
878 if url.startswith('rtmp'):
879 return self._download_with_rtmpdump(filename, url, player_url)
880
881 tmpfilename = self.temp_name(filename)
882 stream = None
883
884 # Do not include the Accept-Encoding header
885 headers = {'Youtubedl-no-compression': 'True'}
886 basic_request = urllib2.Request(url, None, headers)
887 request = urllib2.Request(url, None, headers)
888
889 # Establish possible resume length
890 if os.path.isfile(tmpfilename):
891 resume_len = os.path.getsize(tmpfilename)
892 else:
893 resume_len = 0
894
895 open_mode = 'wb'
896 if resume_len != 0:
897 if self.params.get('continuedl', False):
898 self.report_resuming_byte(resume_len)
899 request.add_header('Range','bytes=%d-' % resume_len)
900 open_mode = 'ab'
901 else:
902 resume_len = 0
903
904 count = 0
905 retries = self.params.get('retries', 0)
906 while count <= retries:
907 # Establish connection
908 try:
909 if count == 0 and 'urlhandle' in info_dict:
910 data = info_dict['urlhandle']
911 data = urllib2.urlopen(request)
912 break
913 except (urllib2.HTTPError, ), err:
914 if (err.code < 500 or err.code >= 600) and err.code != 416:
915 # Unexpected HTTP error
916 raise
917 elif err.code == 416:
918 # Unable to resume (requested range not satisfiable)
919 try:
920 # Open the connection again without the range header
921 data = urllib2.urlopen(basic_request)
922 content_length = data.info()['Content-Length']
923 except (urllib2.HTTPError, ), err:
924 if err.code < 500 or err.code >= 600:
925 raise
926 else:
927 # Examine the reported length
928 if (content_length is not None and
929 (resume_len - 100 < long(content_length) < resume_len + 100)):
930 # The file had already been fully downloaded.
931 # Explanation to the above condition: in issue #175 it was revealed that
932 # YouTube sometimes adds or removes a few bytes from the end of the file,
933 # changing the file size slightly and causing problems for some users. So
934 # I decided to implement a suggested change and consider the file
935 # completely downloaded if the file size differs less than 100 bytes from
936 # the one in the hard drive.
937 self.report_file_already_downloaded(filename)
938 self.try_rename(tmpfilename, filename)
939 return True
940 else:
941 # The length does not match, we start the download over
942 self.report_unable_to_resume()
943 open_mode = 'wb'
944 break
945 # Retry
946 count += 1
947 if count <= retries:
948 self.report_retry(count, retries)
949
950 if count > retries:
951 self.trouble(u'ERROR: giving up after %s retries' % retries)
952 return False
953
954 data_len = data.info().get('Content-length', None)
955 if data_len is not None:
956 data_len = long(data_len) + resume_len
957 data_len_str = self.format_bytes(data_len)
958 byte_counter = 0 + resume_len
959 block_size = 1024
960 start = time.time()
961 while True:
962 # Download and write
963 before = time.time()
964 data_block = data.read(block_size)
965 after = time.time()
966 if len(data_block) == 0:
967 break
968 byte_counter += len(data_block)
969
970 # Open file just in time
971 if stream is None:
972 try:
973 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
974 assert stream is not None
975 filename = self.undo_temp_name(tmpfilename)
976 self.report_destination(filename)
977 except (OSError, IOError), err:
978 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
979 return False
980 try:
981 stream.write(data_block)
982 except (IOError, OSError), err:
983 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
984 return False
985 block_size = self.best_block_size(after - before, len(data_block))
986
987 # Progress message
988 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
989 if data_len is None:
990 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
991 else:
992 percent_str = self.calc_percent(byte_counter, data_len)
993 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
994 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
995
996 # Apply rate limit
997 self.slow_down(start, byte_counter - resume_len)
998
999 if stream is None:
1000 self.trouble(u'\nERROR: Did not get any data blocks')
1001 return False
1002 stream.close()
1003 self.report_finish()
1004 if data_len is not None and byte_counter != data_len:
1005 raise ContentTooShortError(byte_counter, long(data_len))
1006 self.try_rename(tmpfilename, filename)
1007
1008 # Update file modification time
1009 if self.params.get('updatetime', True):
1010 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1011
1012 return True
1013
1014
class InfoExtractor(object):
    """Base class for information extractors.

    An information extractor takes a URL and works out the details of
    the video (or videos) it points to: the real media URL, the title
    and simplified title, the uploader and so on.  Those details are
    collected in a dictionary and handed over to the FileDownloader,
    which processes them, possibly downloading the video to the file
    system, among other possible outcomes.  Each dictionary must
    provide these fields:

    id:         Video identifier.
    url:        Final video URL.
    uploader:   Nickname of the video uploader.
    title:      Literal title.
    stitle:     Simplified title.
    ext:        Video filename extension.
    format:     Video format.
    player_url: SWF Player URL (may be None).

    These further fields are optional.  They mainly exist so that
    youtube-dl can act as the backend of a video search function, such
    as the one in youtube2mp3, and are only used when their respective
    forced printing functions are called:

    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses should override _real_initialize() and _real_extract()
    and define a _VALID_URL regexp.  They should probably also be added
    to the list of extractors.
    """

    # Class-level defaults; __init__ sets both again on every instance.
    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Create the extractor, optionally attaching a downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Tell whether this extractor's _VALID_URL matches the URL."""
        matched = re.match(self._VALID_URL, url)
        return matched is not None

    def initialize(self):
        """Run _real_initialize() once; later calls do nothing."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then return what _real_extract() returns."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the downloader this extractor reports to."""
        self._downloader = downloader

    def _real_initialize(self):
        """Initialization hook (authentication, etc); override in subclasses."""
        return None

    def _real_extract(self, url):
        """Extraction hook; override in subclasses."""
        return None
1083
1084
1085 class YoutubeIE(InfoExtractor):
1086 """Information extractor for youtube.com."""
1087
1088 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1089 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1090 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1091 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1092 _NETRC_MACHINE = 'youtube'
1093 # Listed in order of quality
1094 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1095 _video_extensions = {
1096 '13': '3gp',
1097 '17': 'mp4',
1098 '18': 'mp4',
1099 '22': 'mp4',
1100 '37': 'mp4',
1101 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1102 '43': 'webm',
1103 '44': 'webm',
1104 '45': 'webm',
1105 }
1106 _video_dimensions = {
1107 '5': '240x400',
1108 '6': '???',
1109 '13': '???',
1110 '17': '144x176',
1111 '18': '360x640',
1112 '22': '720x1280',
1113 '34': '360x640',
1114 '35': '480x854',
1115 '37': '1080x1920',
1116 '38': '3072x4096',
1117 '43': '360x640',
1118 '44': '480x854',
1119 '45': '720x1280',
1120 }
1121 IE_NAME = u'youtube'
1122
1123 def report_lang(self):
1124 """Report attempt to set language."""
1125 self._downloader.to_screen(u'[youtube] Setting language')
1126
1127 def report_login(self):
1128 """Report attempt to log in."""
1129 self._downloader.to_screen(u'[youtube] Logging in')
1130
1131 def report_age_confirmation(self):
1132 """Report attempt to confirm age."""
1133 self._downloader.to_screen(u'[youtube] Confirming age')
1134
1135 def report_video_webpage_download(self, video_id):
1136 """Report attempt to download video webpage."""
1137 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1138
1139 def report_video_info_webpage_download(self, video_id):
1140 """Report attempt to download video info webpage."""
1141 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1142
1143 def report_information_extraction(self, video_id):
1144 """Report attempt to extract video information."""
1145 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1146
1147 def report_unavailable_format(self, video_id, format):
1148 """Report extracted video URL."""
1149 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1150
1151 def report_rtmp_download(self):
1152 """Indicate the download will use the RTMP protocol."""
1153 self._downloader.to_screen(u'[youtube] RTMP download detected')
1154
1155 def _print_formats(self, formats):
1156 print 'Available formats:'
1157 for x in formats:
1158 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1159
1160 def _real_initialize(self):
1161 if self._downloader is None:
1162 return
1163
1164 username = None
1165 password = None
1166 downloader_params = self._downloader.params
1167
1168 # Attempt to use provided username and password or .netrc data
1169 if downloader_params.get('username', None) is not None:
1170 username = downloader_params['username']
1171 password = downloader_params['password']
1172 elif downloader_params.get('usenetrc', False):
1173 try:
1174 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1175 if info is not None:
1176 username = info[0]
1177 password = info[2]
1178 else:
1179 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1180 except (IOError, netrc.NetrcParseError), err:
1181 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1182 return
1183
1184 # Set language
1185 request = urllib2.Request(self._LANG_URL)
1186 try:
1187 self.report_lang()
1188 urllib2.urlopen(request).read()
1189 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1190 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1191 return
1192
1193 # No authentication to be performed
1194 if username is None:
1195 return
1196
1197 # Log in
1198 login_form = {
1199 'current_form': 'loginForm',
1200 'next': '/',
1201 'action_login': 'Log In',
1202 'username': username,
1203 'password': password,
1204 }
1205 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1206 try:
1207 self.report_login()
1208 login_results = urllib2.urlopen(request).read()
1209 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1210 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1211 return
1212 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1213 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1214 return
1215
1216 # Confirm age
1217 age_form = {
1218 'next_url': '/',
1219 'action_confirm': 'Confirm',
1220 }
1221 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1222 try:
1223 self.report_age_confirmation()
1224 age_results = urllib2.urlopen(request).read()
1225 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1226 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1227 return
1228
1229 def _real_extract(self, url):
1230 # Extract video id from URL
1231 mobj = re.match(self._VALID_URL, url)
1232 if mobj is None:
1233 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1234 return
1235 video_id = mobj.group(2)
1236
1237 # Get video webpage
1238 self.report_video_webpage_download(video_id)
1239 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1240 try:
1241 video_webpage = urllib2.urlopen(request).read()
1242 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1243 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1244 return
1245
1246 # Attempt to extract SWF player URL
1247 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1248 if mobj is not None:
1249 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1250 else:
1251 player_url = None
1252
1253 # Get video info
1254 self.report_video_info_webpage_download(video_id)
1255 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1256 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1257 % (video_id, el_type))
1258 request = urllib2.Request(video_info_url)
1259 try:
1260 video_info_webpage = urllib2.urlopen(request).read()
1261 video_info = parse_qs(video_info_webpage)
1262 if 'token' in video_info:
1263 break
1264 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1265 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1266 return
1267 if 'token' not in video_info:
1268 if 'reason' in video_info:
1269 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1270 else:
1271 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1272 return
1273
1274 # Start extracting information
1275 self.report_information_extraction(video_id)
1276
1277 # uploader
1278 if 'author' not in video_info:
1279 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1280 return
1281 video_uploader = urllib.unquote_plus(video_info['author'][0])
1282
1283 # title
1284 if 'title' not in video_info:
1285 self._downloader.trouble(u'ERROR: unable to extract video title')
1286 return
1287 video_title = urllib.unquote_plus(video_info['title'][0])
1288 video_title = video_title.decode('utf-8')
1289 video_title = sanitize_title(video_title)
1290
1291 # simplified title
1292 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1293 simple_title = simple_title.strip(ur'_')
1294
1295 # thumbnail image
1296 if 'thumbnail_url' not in video_info:
1297 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1298 video_thumbnail = ''
1299 else: # don't panic if we can't find it
1300 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1301
1302 # upload date
1303 upload_date = u'NA'
1304 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1305 if mobj is not None:
1306 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1307 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1308 for expression in format_expressions:
1309 try:
1310 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1311 except:
1312 pass
1313
1314 # description
1315 try:
1316 lxml.etree
1317 except NameError:
1318 video_description = u'No description available.'
1319 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1320 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1321 if mobj is not None:
1322 video_description = mobj.group(1).decode('utf-8')
1323 else:
1324 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1325 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1326 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1327 # TODO use another parser
1328
1329 # token
1330 video_token = urllib.unquote_plus(video_info['token'][0])
1331
1332 # Decide which formats to download
1333 req_format = self._downloader.params.get('format', None)
1334
1335 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1336 self.report_rtmp_download()
1337 video_url_list = [(None, video_info['conn'][0])]
1338 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1339 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1340 url_data = [parse_qs(uds) for uds in url_data_strs]
1341 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1342 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1343
1344 format_limit = self._downloader.params.get('format_limit', None)
1345 if format_limit is not None and format_limit in self._available_formats:
1346 format_list = self._available_formats[self._available_formats.index(format_limit):]
1347 else:
1348 format_list = self._available_formats
1349 existing_formats = [x for x in format_list if x in url_map]
1350 if len(existing_formats) == 0:
1351 self._downloader.trouble(u'ERROR: no known formats available for video')
1352 return
1353 if self._downloader.params.get('listformats', None):
1354 self._print_formats(existing_formats)
1355 return
1356 if req_format is None or req_format == 'best':
1357 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1358 elif req_format == 'worst':
1359 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1360 elif req_format in ('-1', 'all'):
1361 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1362 else:
1363 # Specific formats. We pick the first in a slash-delimeted sequence.
1364 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1365 req_formats = req_format.split('/')
1366 video_url_list = None
1367 for rf in req_formats:
1368 if rf in url_map:
1369 video_url_list = [(rf, url_map[rf])]
1370 break
1371 if video_url_list is None:
1372 self._downloader.trouble(u'ERROR: requested format not available')
1373 return
1374 else:
1375 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1376 return
1377
1378 for format_param, video_real_url in video_url_list:
1379 # At this point we have a new video
1380 self._downloader.increment_downloads()
1381
1382 # Extension
1383 video_extension = self._video_extensions.get(format_param, 'flv')
1384
1385 try:
1386 # Process video information
1387 self._downloader.process_info({
1388 'id': video_id.decode('utf-8'),
1389 'url': video_real_url.decode('utf-8'),
1390 'uploader': video_uploader.decode('utf-8'),
1391 'upload_date': upload_date,
1392 'title': video_title,
1393 'stitle': simple_title,
1394 'ext': video_extension.decode('utf-8'),
1395 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1396 'thumbnail': video_thumbnail.decode('utf-8'),
1397 'description': video_description,
1398 'player_url': player_url,
1399 })
1400 except UnavailableVideoError, err:
1401 self._downloader.trouble(u'\nERROR: unable to download video')
1402
1403
1404 class MetacafeIE(InfoExtractor):
1405 """Information Extractor for metacafe.com."""
1406
1407 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1408 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1409 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1410 _youtube_ie = None
1411 IE_NAME = u'metacafe'
1412
1413 def __init__(self, youtube_ie, downloader=None):
1414 InfoExtractor.__init__(self, downloader)
1415 self._youtube_ie = youtube_ie
1416
1417 def report_disclaimer(self):
1418 """Report disclaimer retrieval."""
1419 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1420
1421 def report_age_confirmation(self):
1422 """Report attempt to confirm age."""
1423 self._downloader.to_screen(u'[metacafe] Confirming age')
1424
1425 def report_download_webpage(self, video_id):
1426 """Report webpage download."""
1427 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1428
1429 def report_extraction(self, video_id):
1430 """Report information extraction."""
1431 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1432
1433 def _real_initialize(self):
1434 # Retrieve disclaimer
1435 request = urllib2.Request(self._DISCLAIMER)
1436 try:
1437 self.report_disclaimer()
1438 disclaimer = urllib2.urlopen(request).read()
1439 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1440 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1441 return
1442
1443 # Confirm age
1444 disclaimer_form = {
1445 'filters': '0',
1446 'submit': "Continue - I'm over 18",
1447 }
1448 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1449 try:
1450 self.report_age_confirmation()
1451 disclaimer = urllib2.urlopen(request).read()
1452 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1453 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1454 return
1455
1456 def _real_extract(self, url):
1457 # Extract id and simplified title from URL
1458 mobj = re.match(self._VALID_URL, url)
1459 if mobj is None:
1460 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1461 return
1462
1463 video_id = mobj.group(1)
1464
1465 # Check if video comes from YouTube
1466 mobj2 = re.match(r'^yt-(.*)$', video_id)
1467 if mobj2 is not None:
1468 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1469 return
1470
1471 # At this point we have a new video
1472 self._downloader.increment_downloads()
1473
1474 simple_title = mobj.group(2).decode('utf-8')
1475
1476 # Retrieve video webpage to extract further information
1477 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1478 try:
1479 self.report_download_webpage(video_id)
1480 webpage = urllib2.urlopen(request).read()
1481 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1482 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1483 return
1484
1485 # Extract URL, uploader and title from webpage
1486 self.report_extraction(video_id)
1487 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1488 if mobj is not None:
1489 mediaURL = urllib.unquote(mobj.group(1))
1490 video_extension = mediaURL[-3:]
1491
1492 # Extract gdaKey if available
1493 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1494 if mobj is None:
1495 video_url = mediaURL
1496 else:
1497 gdaKey = mobj.group(1)
1498 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1499 else:
1500 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1501 if mobj is None:
1502 self._downloader.trouble(u'ERROR: unable to extract media URL')
1503 return
1504 vardict = parse_qs(mobj.group(1))
1505 if 'mediaData' not in vardict:
1506 self._downloader.trouble(u'ERROR: unable to extract media URL')
1507 return
1508 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1509 if mobj is None:
1510 self._downloader.trouble(u'ERROR: unable to extract media URL')
1511 return
1512 mediaURL = mobj.group(1).replace('\\/', '/')
1513 video_extension = mediaURL[-3:]
1514 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1515
1516 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1517 if mobj is None:
1518 self._downloader.trouble(u'ERROR: unable to extract title')
1519 return
1520 video_title = mobj.group(1).decode('utf-8')
1521 video_title = sanitize_title(video_title)
1522
1523 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1524 if mobj is None:
1525 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1526 return
1527 video_uploader = mobj.group(1)
1528
1529 try:
1530 # Process video information
1531 self._downloader.process_info({
1532 'id': video_id.decode('utf-8'),
1533 'url': video_url.decode('utf-8'),
1534 'uploader': video_uploader.decode('utf-8'),
1535 'upload_date': u'NA',
1536 'title': video_title,
1537 'stitle': simple_title,
1538 'ext': video_extension.decode('utf-8'),
1539 'format': u'NA',
1540 'player_url': None,
1541 })
1542 except UnavailableVideoError:
1543 self._downloader.trouble(u'\nERROR: unable to download video')
1544
1545
1546 class DailymotionIE(InfoExtractor):
1547 """Information Extractor for Dailymotion"""
1548
1549 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1550 IE_NAME = u'dailymotion'
1551
1552 def __init__(self, downloader=None):
1553 InfoExtractor.__init__(self, downloader)
1554
1555 def report_download_webpage(self, video_id):
1556 """Report webpage download."""
1557 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1558
1559 def report_extraction(self, video_id):
1560 """Report information extraction."""
1561 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1562
1563 def _real_initialize(self):
1564 return
1565
1566 def _real_extract(self, url):
1567 # Extract id and simplified title from URL
1568 mobj = re.match(self._VALID_URL, url)
1569 if mobj is None:
1570 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1571 return
1572
1573 # At this point we have a new video
1574 self._downloader.increment_downloads()
1575 video_id = mobj.group(1)
1576
1577 simple_title = mobj.group(2).decode('utf-8')
1578 video_extension = 'flv'
1579
1580 # Retrieve video webpage to extract further information
1581 request = urllib2.Request(url)
1582 request.add_header('Cookie', 'family_filter=off')
1583 try:
1584 self.report_download_webpage(video_id)
1585 webpage = urllib2.urlopen(request).read()
1586 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1587 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1588 return
1589
1590 # Extract URL, uploader and title from webpage
1591 self.report_extraction(video_id)
1592 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1593 if mobj is None:
1594 self._downloader.trouble(u'ERROR: unable to extract media URL')
1595 return
1596 sequence = urllib.unquote(mobj.group(1))
1597 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1598 if mobj is None:
1599 self._downloader.trouble(u'ERROR: unable to extract media URL')
1600 return
1601 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1602
1603 # if needed add http://www.dailymotion.com/ if relative URL
1604
1605 video_url = mediaURL
1606
1607 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1608 if mobj is None:
1609 self._downloader.trouble(u'ERROR: unable to extract title')
1610 return
1611 video_title = mobj.group(1).decode('utf-8')
1612 video_title = sanitize_title(video_title)
1613
1614 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1615 if mobj is None:
1616 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1617 return
1618 video_uploader = mobj.group(1)
1619
1620 try:
1621 # Process video information
1622 self._downloader.process_info({
1623 'id': video_id.decode('utf-8'),
1624 'url': video_url.decode('utf-8'),
1625 'uploader': video_uploader.decode('utf-8'),
1626 'upload_date': u'NA',
1627 'title': video_title,
1628 'stitle': simple_title,
1629 'ext': video_extension.decode('utf-8'),
1630 'format': u'NA',
1631 'player_url': None,
1632 })
1633 except UnavailableVideoError:
1634 self._downloader.trouble(u'\nERROR: unable to download video')
1635
1636
1637 class GoogleIE(InfoExtractor):
1638 """Information extractor for video.google.com."""
1639
1640 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1641 IE_NAME = u'video.google'
1642
1643 def __init__(self, downloader=None):
1644 InfoExtractor.__init__(self, downloader)
1645
1646 def report_download_webpage(self, video_id):
1647 """Report webpage download."""
1648 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1649
1650 def report_extraction(self, video_id):
1651 """Report information extraction."""
1652 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1653
1654 def _real_initialize(self):
1655 return
1656
1657 def _real_extract(self, url):
1658 # Extract id from URL
1659 mobj = re.match(self._VALID_URL, url)
1660 if mobj is None:
1661 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1662 return
1663
1664 # At this point we have a new video
1665 self._downloader.increment_downloads()
1666 video_id = mobj.group(1)
1667
1668 video_extension = 'mp4'
1669
1670 # Retrieve video webpage to extract further information
1671 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1672 try:
1673 self.report_download_webpage(video_id)
1674 webpage = urllib2.urlopen(request).read()
1675 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1676 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1677 return
1678
1679 # Extract URL, uploader, and title from webpage
1680 self.report_extraction(video_id)
1681 mobj = re.search(r"download_url:'([^']+)'", webpage)
1682 if mobj is None:
1683 video_extension = 'flv'
1684 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1685 if mobj is None:
1686 self._downloader.trouble(u'ERROR: unable to extract media URL')
1687 return
1688 mediaURL = urllib.unquote(mobj.group(1))
1689 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1690 mediaURL = mediaURL.replace('\\x26', '\x26')
1691
1692 video_url = mediaURL
1693
1694 mobj = re.search(r'<title>(.*)</title>', webpage)
1695 if mobj is None:
1696 self._downloader.trouble(u'ERROR: unable to extract title')
1697 return
1698 video_title = mobj.group(1).decode('utf-8')
1699 video_title = sanitize_title(video_title)
1700 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1701
1702 # Extract video description
1703 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1704 if mobj is None:
1705 self._downloader.trouble(u'ERROR: unable to extract video description')
1706 return
1707 video_description = mobj.group(1).decode('utf-8')
1708 if not video_description:
1709 video_description = 'No description available.'
1710
1711 # Extract video thumbnail
1712 if self._downloader.params.get('forcethumbnail', False):
1713 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1714 try:
1715 webpage = urllib2.urlopen(request).read()
1716 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1717 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1718 return
1719 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1720 if mobj is None:
1721 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1722 return
1723 video_thumbnail = mobj.group(1)
1724 else: # we need something to pass to process_info
1725 video_thumbnail = ''
1726
1727 try:
1728 # Process video information
1729 self._downloader.process_info({
1730 'id': video_id.decode('utf-8'),
1731 'url': video_url.decode('utf-8'),
1732 'uploader': u'NA',
1733 'upload_date': u'NA',
1734 'title': video_title,
1735 'stitle': simple_title,
1736 'ext': video_extension.decode('utf-8'),
1737 'format': u'NA',
1738 'player_url': None,
1739 })
1740 except UnavailableVideoError:
1741 self._downloader.trouble(u'\nERROR: unable to download video')
1742
1743
1744 class PhotobucketIE(InfoExtractor):
1745 """Information extractor for photobucket.com."""
1746
1747 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1748 IE_NAME = u'photobucket'
1749
1750 def __init__(self, downloader=None):
1751 InfoExtractor.__init__(self, downloader)
1752
1753 def report_download_webpage(self, video_id):
1754 """Report webpage download."""
1755 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1756
1757 def report_extraction(self, video_id):
1758 """Report information extraction."""
1759 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1760
1761 def _real_initialize(self):
1762 return
1763
1764 def _real_extract(self, url):
1765 # Extract id from URL
1766 mobj = re.match(self._VALID_URL, url)
1767 if mobj is None:
1768 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1769 return
1770
1771 # At this point we have a new video
1772 self._downloader.increment_downloads()
1773 video_id = mobj.group(1)
1774
1775 video_extension = 'flv'
1776
1777 # Retrieve video webpage to extract further information
1778 request = urllib2.Request(url)
1779 try:
1780 self.report_download_webpage(video_id)
1781 webpage = urllib2.urlopen(request).read()
1782 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1783 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1784 return
1785
1786 # Extract URL, uploader, and title from webpage
1787 self.report_extraction(video_id)
1788 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1789 if mobj is None:
1790 self._downloader.trouble(u'ERROR: unable to extract media URL')
1791 return
1792 mediaURL = urllib.unquote(mobj.group(1))
1793
1794 video_url = mediaURL
1795
1796 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1797 if mobj is None:
1798 self._downloader.trouble(u'ERROR: unable to extract title')
1799 return
1800 video_title = mobj.group(1).decode('utf-8')
1801 video_title = sanitize_title(video_title)
1802 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1803
1804 video_uploader = mobj.group(2).decode('utf-8')
1805
1806 try:
1807 # Process video information
1808 self._downloader.process_info({
1809 'id': video_id.decode('utf-8'),
1810 'url': video_url.decode('utf-8'),
1811 'uploader': video_uploader,
1812 'upload_date': u'NA',
1813 'title': video_title,
1814 'stitle': simple_title,
1815 'ext': video_extension.decode('utf-8'),
1816 'format': u'NA',
1817 'player_url': None,
1818 })
1819 except UnavailableVideoError:
1820 self._downloader.trouble(u'\nERROR: unable to download video')
1821
1822
1823 class YahooIE(InfoExtractor):
1824 """Information extractor for video.yahoo.com."""
1825
1826 # _VALID_URL matches all Yahoo! Video URLs
1827 # _VPAGE_URL matches only the extractable '/watch/' URLs
1828 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1829 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1830 IE_NAME = u'video.yahoo'
1831
1832 def __init__(self, downloader=None):
1833 InfoExtractor.__init__(self, downloader)
1834
1835 def report_download_webpage(self, video_id):
1836 """Report webpage download."""
1837 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1838
1839 def report_extraction(self, video_id):
1840 """Report information extraction."""
1841 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1842
1843 def _real_initialize(self):
1844 return
1845
1846 def _real_extract(self, url, new_video=True):
1847 # Extract ID from URL
1848 mobj = re.match(self._VALID_URL, url)
1849 if mobj is None:
1850 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1851 return
1852
1853 # At this point we have a new video
1854 self._downloader.increment_downloads()
1855 video_id = mobj.group(2)
1856 video_extension = 'flv'
1857
1858 # Rewrite valid but non-extractable URLs as
1859 # extractable English language /watch/ URLs
1860 if re.match(self._VPAGE_URL, url) is None:
1861 request = urllib2.Request(url)
1862 try:
1863 webpage = urllib2.urlopen(request).read()
1864 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1865 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1866 return
1867
1868 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1869 if mobj is None:
1870 self._downloader.trouble(u'ERROR: Unable to extract id field')
1871 return
1872 yahoo_id = mobj.group(1)
1873
1874 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1875 if mobj is None:
1876 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1877 return
1878 yahoo_vid = mobj.group(1)
1879
1880 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1881 return self._real_extract(url, new_video=False)
1882
1883 # Retrieve video webpage to extract further information
1884 request = urllib2.Request(url)
1885 try:
1886 self.report_download_webpage(video_id)
1887 webpage = urllib2.urlopen(request).read()
1888 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1889 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1890 return
1891
1892 # Extract uploader and title from webpage
1893 self.report_extraction(video_id)
1894 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1895 if mobj is None:
1896 self._downloader.trouble(u'ERROR: unable to extract video title')
1897 return
1898 video_title = mobj.group(1).decode('utf-8')
1899 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1900
1901 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1902 if mobj is None:
1903 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1904 return
1905 video_uploader = mobj.group(1).decode('utf-8')
1906
1907 # Extract video thumbnail
1908 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1909 if mobj is None:
1910 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1911 return
1912 video_thumbnail = mobj.group(1).decode('utf-8')
1913
1914 # Extract video description
1915 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1916 if mobj is None:
1917 self._downloader.trouble(u'ERROR: unable to extract video description')
1918 return
1919 video_description = mobj.group(1).decode('utf-8')
1920 if not video_description:
1921 video_description = 'No description available.'
1922
1923 # Extract video height and width
1924 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1925 if mobj is None:
1926 self._downloader.trouble(u'ERROR: unable to extract video height')
1927 return
1928 yv_video_height = mobj.group(1)
1929
1930 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1931 if mobj is None:
1932 self._downloader.trouble(u'ERROR: unable to extract video width')
1933 return
1934 yv_video_width = mobj.group(1)
1935
1936 # Retrieve video playlist to extract media URL
1937 # I'm not completely sure what all these options are, but we
1938 # seem to need most of them, otherwise the server sends a 401.
1939 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1940 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1941 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1942 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1943 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1944 try:
1945 self.report_download_webpage(video_id)
1946 webpage = urllib2.urlopen(request).read()
1947 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1948 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1949 return
1950
1951 # Extract media URL from playlist XML
1952 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1953 if mobj is None:
1954 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1955 return
1956 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1957 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1958
1959 try:
1960 # Process video information
1961 self._downloader.process_info({
1962 'id': video_id.decode('utf-8'),
1963 'url': video_url,
1964 'uploader': video_uploader,
1965 'upload_date': u'NA',
1966 'title': video_title,
1967 'stitle': simple_title,
1968 'ext': video_extension.decode('utf-8'),
1969 'thumbnail': video_thumbnail.decode('utf-8'),
1970 'description': video_description,
1971 'thumbnail': video_thumbnail,
1972 'player_url': None,
1973 })
1974 except UnavailableVideoError:
1975 self._downloader.trouble(u'\nERROR: unable to download video')
1976
1977
1978 class VimeoIE(InfoExtractor):
1979 """Information extractor for vimeo.com."""
1980
1981 # _VALID_URL matches Vimeo URLs
1982 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1983 IE_NAME = u'vimeo'
1984
1985 def __init__(self, downloader=None):
1986 InfoExtractor.__init__(self, downloader)
1987
1988 def report_download_webpage(self, video_id):
1989 """Report webpage download."""
1990 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1991
1992 def report_extraction(self, video_id):
1993 """Report information extraction."""
1994 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1995
1996 def _real_initialize(self):
1997 return
1998
1999 def _real_extract(self, url, new_video=True):
2000 # Extract ID from URL
2001 mobj = re.match(self._VALID_URL, url)
2002 if mobj is None:
2003 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2004 return
2005
2006 # At this point we have a new video
2007 self._downloader.increment_downloads()
2008 video_id = mobj.group(1)
2009
2010 # Retrieve video webpage to extract further information
2011 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2012 try:
2013 self.report_download_webpage(video_id)
2014 webpage = urllib2.urlopen(request).read()
2015 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2016 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2017 return
2018
2019 # Now we begin extracting as much information as we can from what we
2020 # retrieved. First we extract the information common to all extractors,
2021 # and latter we extract those that are Vimeo specific.
2022 self.report_extraction(video_id)
2023
2024 # Extract title
2025 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2026 if mobj is None:
2027 self._downloader.trouble(u'ERROR: unable to extract video title')
2028 return
2029 video_title = mobj.group(1).decode('utf-8')
2030 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2031
2032 # Extract uploader
2033 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2034 if mobj is None:
2035 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2036 return
2037 video_uploader = mobj.group(1).decode('utf-8')
2038
2039 # Extract video thumbnail
2040 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2041 if mobj is None:
2042 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2043 return
2044 video_thumbnail = mobj.group(1).decode('utf-8')
2045
2046 # # Extract video description
2047 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2048 # if mobj is None:
2049 # self._downloader.trouble(u'ERROR: unable to extract video description')
2050 # return
2051 # video_description = mobj.group(1).decode('utf-8')
2052 # if not video_description: video_description = 'No description available.'
2053 video_description = 'Foo.'
2054
2055 # Vimeo specific: extract request signature
2056 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2057 if mobj is None:
2058 self._downloader.trouble(u'ERROR: unable to extract request signature')
2059 return
2060 sig = mobj.group(1).decode('utf-8')
2061
2062 # Vimeo specific: extract video quality information
2063 mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2064 if mobj is None:
2065 self._downloader.trouble(u'ERROR: unable to extract video quality information')
2066 return
2067 quality = mobj.group(1).decode('utf-8')
2068
2069 if int(quality) == 1:
2070 quality = 'hd'
2071 else:
2072 quality = 'sd'
2073
2074 # Vimeo specific: Extract request signature expiration
2075 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2076 if mobj is None:
2077 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2078 return
2079 sig_exp = mobj.group(1).decode('utf-8')
2080
2081 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2082
2083 try:
2084 # Process video information
2085 self._downloader.process_info({
2086 'id': video_id.decode('utf-8'),
2087 'url': video_url,
2088 'uploader': video_uploader,
2089 'upload_date': u'NA',
2090 'title': video_title,
2091 'stitle': simple_title,
2092 'ext': u'mp4',
2093 'thumbnail': video_thumbnail.decode('utf-8'),
2094 'description': video_description,
2095 'thumbnail': video_thumbnail,
2096 'description': video_description,
2097 'player_url': None,
2098 })
2099 except UnavailableVideoError:
2100 self._downloader.trouble(u'ERROR: unable to download video')
2101
2102
2103 class GenericIE(InfoExtractor):
2104 """Generic last-resort information extractor."""
2105
2106 _VALID_URL = r'.*'
2107 IE_NAME = u'generic'
2108
2109 def __init__(self, downloader=None):
2110 InfoExtractor.__init__(self, downloader)
2111
2112 def report_download_webpage(self, video_id):
2113 """Report webpage download."""
2114 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2115 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2116
2117 def report_extraction(self, video_id):
2118 """Report information extraction."""
2119 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2120
2121 def _real_initialize(self):
2122 return
2123
2124 def _real_extract(self, url):
2125 # At this point we have a new video
2126 self._downloader.increment_downloads()
2127
2128 video_id = url.split('/')[-1]
2129 request = urllib2.Request(url)
2130 try:
2131 self.report_download_webpage(video_id)
2132 webpage = urllib2.urlopen(request).read()
2133 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2134 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2135 return
2136 except ValueError, err:
2137 # since this is the last-resort InfoExtractor, if
2138 # this error is thrown, it'll be thrown here
2139 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2140 return
2141
2142 self.report_extraction(video_id)
2143 # Start with something easy: JW Player in SWFObject
2144 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2145 if mobj is None:
2146 # Broaden the search a little bit
2147 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2148 if mobj is None:
2149 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2150 return
2151
2152 # It's possible that one of the regexes
2153 # matched, but returned an empty group:
2154 if mobj.group(1) is None:
2155 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2156 return
2157
2158 video_url = urllib.unquote(mobj.group(1))
2159 video_id = os.path.basename(video_url)
2160
2161 # here's a fun little line of code for you:
2162 video_extension = os.path.splitext(video_id)[1][1:]
2163 video_id = os.path.splitext(video_id)[0]
2164
2165 # it's tempting to parse this further, but you would
2166 # have to take into account all the variations like
2167 # Video Title - Site Name
2168 # Site Name | Video Title
2169 # Video Title - Tagline | Site Name
2170 # and so on and so forth; it's just not practical
2171 mobj = re.search(r'<title>(.*)</title>', webpage)
2172 if mobj is None:
2173 self._downloader.trouble(u'ERROR: unable to extract title')
2174 return
2175 video_title = mobj.group(1).decode('utf-8')
2176 video_title = sanitize_title(video_title)
2177 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2178
2179 # video uploader is domain name
2180 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2181 if mobj is None:
2182 self._downloader.trouble(u'ERROR: unable to extract title')
2183 return
2184 video_uploader = mobj.group(1).decode('utf-8')
2185
2186 try:
2187 # Process video information
2188 self._downloader.process_info({
2189 'id': video_id.decode('utf-8'),
2190 'url': video_url.decode('utf-8'),
2191 'uploader': video_uploader,
2192 'upload_date': u'NA',
2193 'title': video_title,
2194 'stitle': simple_title,
2195 'ext': video_extension.decode('utf-8'),
2196 'format': u'NA',
2197 'player_url': None,
2198 })
2199 except UnavailableVideoError, err:
2200 self._downloader.trouble(u'\nERROR: unable to download video')
2201
2202
2203 class YoutubeSearchIE(InfoExtractor):
2204 """Information Extractor for YouTube search queries."""
2205 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2206 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2207 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2208 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2209 _youtube_ie = None
2210 _max_youtube_results = 1000
2211 IE_NAME = u'youtube:search'
2212
2213 def __init__(self, youtube_ie, downloader=None):
2214 InfoExtractor.__init__(self, downloader)
2215 self._youtube_ie = youtube_ie
2216
2217 def report_download_page(self, query, pagenum):
2218 """Report attempt to download playlist page with given number."""
2219 query = query.decode(preferredencoding())
2220 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2221
2222 def _real_initialize(self):
2223 self._youtube_ie.initialize()
2224
2225 def _real_extract(self, query):
2226 mobj = re.match(self._VALID_URL, query)
2227 if mobj is None:
2228 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2229 return
2230
2231 prefix, query = query.split(':')
2232 prefix = prefix[8:]
2233 query = query.encode('utf-8')
2234 if prefix == '':
2235 self._download_n_results(query, 1)
2236 return
2237 elif prefix == 'all':
2238 self._download_n_results(query, self._max_youtube_results)
2239 return
2240 else:
2241 try:
2242 n = long(prefix)
2243 if n <= 0:
2244 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2245 return
2246 elif n > self._max_youtube_results:
2247 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2248 n = self._max_youtube_results
2249 self._download_n_results(query, n)
2250 return
2251 except ValueError: # parsing prefix as integer fails
2252 self._download_n_results(query, 1)
2253 return
2254
2255 def _download_n_results(self, query, n):
2256 """Downloads a specified number of results for a query"""
2257
2258 video_ids = []
2259 already_seen = set()
2260 pagenum = 1
2261
2262 while True:
2263 self.report_download_page(query, pagenum)
2264 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2265 request = urllib2.Request(result_url)
2266 try:
2267 page = urllib2.urlopen(request).read()
2268 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2269 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2270 return
2271
2272 # Extract video identifiers
2273 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2274 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2275 if video_id not in already_seen:
2276 video_ids.append(video_id)
2277 already_seen.add(video_id)
2278 if len(video_ids) == n:
2279 # Specified n videos reached
2280 for id in video_ids:
2281 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2282 return
2283
2284 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2285 for id in video_ids:
2286 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2287 return
2288
2289 pagenum = pagenum + 1
2290
2291
2292 class GoogleSearchIE(InfoExtractor):
2293 """Information Extractor for Google Video search queries."""
2294 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2295 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2296 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2297 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2298 _google_ie = None
2299 _max_google_results = 1000
2300 IE_NAME = u'video.google:search'
2301
2302 def __init__(self, google_ie, downloader=None):
2303 InfoExtractor.__init__(self, downloader)
2304 self._google_ie = google_ie
2305
2306 def report_download_page(self, query, pagenum):
2307 """Report attempt to download playlist page with given number."""
2308 query = query.decode(preferredencoding())
2309 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2310
2311 def _real_initialize(self):
2312 self._google_ie.initialize()
2313
2314 def _real_extract(self, query):
2315 mobj = re.match(self._VALID_URL, query)
2316 if mobj is None:
2317 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2318 return
2319
2320 prefix, query = query.split(':')
2321 prefix = prefix[8:]
2322 query = query.encode('utf-8')
2323 if prefix == '':
2324 self._download_n_results(query, 1)
2325 return
2326 elif prefix == 'all':
2327 self._download_n_results(query, self._max_google_results)
2328 return
2329 else:
2330 try:
2331 n = long(prefix)
2332 if n <= 0:
2333 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2334 return
2335 elif n > self._max_google_results:
2336 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2337 n = self._max_google_results
2338 self._download_n_results(query, n)
2339 return
2340 except ValueError: # parsing prefix as integer fails
2341 self._download_n_results(query, 1)
2342 return
2343
2344 def _download_n_results(self, query, n):
2345 """Downloads a specified number of results for a query"""
2346
2347 video_ids = []
2348 already_seen = set()
2349 pagenum = 1
2350
2351 while True:
2352 self.report_download_page(query, pagenum)
2353 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2354 request = urllib2.Request(result_url)
2355 try:
2356 page = urllib2.urlopen(request).read()
2357 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2358 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2359 return
2360
2361 # Extract video identifiers
2362 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2363 video_id = mobj.group(1)
2364 if video_id not in already_seen:
2365 video_ids.append(video_id)
2366 already_seen.add(video_id)
2367 if len(video_ids) == n:
2368 # Specified n videos reached
2369 for id in video_ids:
2370 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2371 return
2372
2373 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2374 for id in video_ids:
2375 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2376 return
2377
2378 pagenum = pagenum + 1
2379
2380
2381 class YahooSearchIE(InfoExtractor):
2382 """Information Extractor for Yahoo! Video search queries."""
2383 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2384 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2385 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2386 _MORE_PAGES_INDICATOR = r'\s*Next'
2387 _yahoo_ie = None
2388 _max_yahoo_results = 1000
2389 IE_NAME = u'video.yahoo:search'
2390
2391 def __init__(self, yahoo_ie, downloader=None):
2392 InfoExtractor.__init__(self, downloader)
2393 self._yahoo_ie = yahoo_ie
2394
2395 def report_download_page(self, query, pagenum):
2396 """Report attempt to download playlist page with given number."""
2397 query = query.decode(preferredencoding())
2398 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2399
2400 def _real_initialize(self):
2401 self._yahoo_ie.initialize()
2402
2403 def _real_extract(self, query):
2404 mobj = re.match(self._VALID_URL, query)
2405 if mobj is None:
2406 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2407 return
2408
2409 prefix, query = query.split(':')
2410 prefix = prefix[8:]
2411 query = query.encode('utf-8')
2412 if prefix == '':
2413 self._download_n_results(query, 1)
2414 return
2415 elif prefix == 'all':
2416 self._download_n_results(query, self._max_yahoo_results)
2417 return
2418 else:
2419 try:
2420 n = long(prefix)
2421 if n <= 0:
2422 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2423 return
2424 elif n > self._max_yahoo_results:
2425 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2426 n = self._max_yahoo_results
2427 self._download_n_results(query, n)
2428 return
2429 except ValueError: # parsing prefix as integer fails
2430 self._download_n_results(query, 1)
2431 return
2432
2433 def _download_n_results(self, query, n):
2434 """Downloads a specified number of results for a query"""
2435
2436 video_ids = []
2437 already_seen = set()
2438 pagenum = 1
2439
2440 while True:
2441 self.report_download_page(query, pagenum)
2442 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2443 request = urllib2.Request(result_url)
2444 try:
2445 page = urllib2.urlopen(request).read()
2446 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2447 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2448 return
2449
2450 # Extract video identifiers
2451 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2452 video_id = mobj.group(1)
2453 if video_id not in already_seen:
2454 video_ids.append(video_id)
2455 already_seen.add(video_id)
2456 if len(video_ids) == n:
2457 # Specified n videos reached
2458 for id in video_ids:
2459 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2460 return
2461
2462 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2463 for id in video_ids:
2464 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2465 return
2466
2467 pagenum = pagenum + 1
2468
2469
2470 class YoutubePlaylistIE(InfoExtractor):
2471 """Information Extractor for YouTube playlists."""
2472
2473 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2474 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2475 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2476 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2477 _youtube_ie = None
2478 IE_NAME = u'youtube:playlist'
2479
2480 def __init__(self, youtube_ie, downloader=None):
2481 InfoExtractor.__init__(self, downloader)
2482 self._youtube_ie = youtube_ie
2483
2484 def report_download_page(self, playlist_id, pagenum):
2485 """Report attempt to download playlist page with given number."""
2486 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2487
2488 def _real_initialize(self):
2489 self._youtube_ie.initialize()
2490
2491 def _real_extract(self, url):
2492 # Extract playlist id
2493 mobj = re.match(self._VALID_URL, url)
2494 if mobj is None:
2495 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2496 return
2497
2498 # Single video case
2499 if mobj.group(3) is not None:
2500 self._youtube_ie.extract(mobj.group(3))
2501 return
2502
2503 # Download playlist pages
2504 # prefix is 'p' as default for playlists but there are other types that need extra care
2505 playlist_prefix = mobj.group(1)
2506 if playlist_prefix == 'a':
2507 playlist_access = 'artist'
2508 else:
2509 playlist_prefix = 'p'
2510 playlist_access = 'view_play_list'
2511 playlist_id = mobj.group(2)
2512 video_ids = []
2513 pagenum = 1
2514
2515 while True:
2516 self.report_download_page(playlist_id, pagenum)
2517 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2518 try:
2519 page = urllib2.urlopen(request).read()
2520 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2521 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2522 return
2523
2524 # Extract video identifiers
2525 ids_in_page = []
2526 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2527 if mobj.group(1) not in ids_in_page:
2528 ids_in_page.append(mobj.group(1))
2529 video_ids.extend(ids_in_page)
2530
2531 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2532 break
2533 pagenum = pagenum + 1
2534
2535 playliststart = self._downloader.params.get('playliststart', 1) - 1
2536 playlistend = self._downloader.params.get('playlistend', -1)
2537 video_ids = video_ids[playliststart:playlistend]
2538
2539 for id in video_ids:
2540 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2541 return
2542
2543
2544 class YoutubeUserIE(InfoExtractor):
2545 """Information Extractor for YouTube users."""
2546
2547 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2548 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2549 _GDATA_PAGE_SIZE = 50
2550 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2551 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2552 _youtube_ie = None
2553 IE_NAME = u'youtube:user'
2554
2555 def __init__(self, youtube_ie, downloader=None):
2556 InfoExtractor.__init__(self, downloader)
2557 self._youtube_ie = youtube_ie
2558
2559 def report_download_page(self, username, start_index):
2560 """Report attempt to download user page."""
2561 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2562 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2563
2564 def _real_initialize(self):
2565 self._youtube_ie.initialize()
2566
2567 def _real_extract(self, url):
2568 # Extract username
2569 mobj = re.match(self._VALID_URL, url)
2570 if mobj is None:
2571 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2572 return
2573
2574 username = mobj.group(1)
2575
2576 # Download video ids using YouTube Data API. Result size per
2577 # query is limited (currently to 50 videos) so we need to query
2578 # page by page until there are no video ids - it means we got
2579 # all of them.
2580
2581 video_ids = []
2582 pagenum = 0
2583
2584 while True:
2585 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2586 self.report_download_page(username, start_index)
2587
2588 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2589
2590 try:
2591 page = urllib2.urlopen(request).read()
2592 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2593 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2594 return
2595
2596 # Extract video identifiers
2597 ids_in_page = []
2598
2599 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2600 if mobj.group(1) not in ids_in_page:
2601 ids_in_page.append(mobj.group(1))
2602
2603 video_ids.extend(ids_in_page)
2604
2605 # A little optimization - if current page is not
2606 # "full", ie. does not contain PAGE_SIZE video ids then
2607 # we can assume that this page is the last one - there
2608 # are no more ids on further pages - no need to query
2609 # again.
2610
2611 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2612 break
2613
2614 pagenum += 1
2615
2616 all_ids_count = len(video_ids)
2617 playliststart = self._downloader.params.get('playliststart', 1) - 1
2618 playlistend = self._downloader.params.get('playlistend', -1)
2619
2620 if playlistend == -1:
2621 video_ids = video_ids[playliststart:]
2622 else:
2623 video_ids = video_ids[playliststart:playlistend]
2624
2625 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2626 (username, all_ids_count, len(video_ids)))
2627
2628 for video_id in video_ids:
2629 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2630
2631
2632 class DepositFilesIE(InfoExtractor):
2633 """Information extractor for depositfiles.com"""
2634
2635 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2636 IE_NAME = u'DepositFiles'
2637
2638 def __init__(self, downloader=None):
2639 InfoExtractor.__init__(self, downloader)
2640
2641 def report_download_webpage(self, file_id):
2642 """Report webpage download."""
2643 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2644
2645 def report_extraction(self, file_id):
2646 """Report information extraction."""
2647 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2648
2649 def _real_initialize(self):
2650 return
2651
2652 def _real_extract(self, url):
2653 # At this point we have a new file
2654 self._downloader.increment_downloads()
2655
2656 file_id = url.split('/')[-1]
2657 # Rebuild url in english locale
2658 url = 'http://depositfiles.com/en/files/' + file_id
2659
2660 # Retrieve file webpage with 'Free download' button pressed
2661 free_download_indication = { 'gateway_result' : '1' }
2662 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2663 try:
2664 self.report_download_webpage(file_id)
2665 webpage = urllib2.urlopen(request).read()
2666 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2667 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2668 return
2669
2670 # Search for the real file URL
2671 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2672 if (mobj is None) or (mobj.group(1) is None):
2673 # Try to figure out reason of the error.
2674 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2675 if (mobj is not None) and (mobj.group(1) is not None):
2676 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2677 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2678 else:
2679 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2680 return
2681
2682 file_url = mobj.group(1)
2683 file_extension = os.path.splitext(file_url)[1][1:]
2684
2685 # Search for file title
2686 mobj = re.search(r'<b title="(.*?)">', webpage)
2687 if mobj is None:
2688 self._downloader.trouble(u'ERROR: unable to extract title')
2689 return
2690 file_title = mobj.group(1).decode('utf-8')
2691
2692 try:
2693 # Process file information
2694 self._downloader.process_info({
2695 'id': file_id.decode('utf-8'),
2696 'url': file_url.decode('utf-8'),
2697 'uploader': u'NA',
2698 'upload_date': u'NA',
2699 'title': file_title,
2700 'stitle': file_title,
2701 'ext': file_extension.decode('utf-8'),
2702 'format': u'NA',
2703 'player_url': None,
2704 })
2705 except UnavailableVideoError, err:
2706 self._downloader.trouble(u'ERROR: unable to download file')
2707
2708
2709 class FacebookIE(InfoExtractor):
2710 """Information Extractor for Facebook"""
2711
2712 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/video/video\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2713 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2714 _NETRC_MACHINE = 'facebook'
2715 _available_formats = ['highqual', 'lowqual']
2716 _video_extensions = {
2717 'highqual': 'mp4',
2718 'lowqual': 'mp4',
2719 }
2720 IE_NAME = u'facebook'
2721
2722 def __init__(self, downloader=None):
2723 InfoExtractor.__init__(self, downloader)
2724
2725 def _reporter(self, message):
2726 """Add header and report message."""
2727 self._downloader.to_screen(u'[facebook] %s' % message)
2728
2729 def report_login(self):
2730 """Report attempt to log in."""
2731 self._reporter(u'Logging in')
2732
2733 def report_video_webpage_download(self, video_id):
2734 """Report attempt to download video webpage."""
2735 self._reporter(u'%s: Downloading video webpage' % video_id)
2736
2737 def report_information_extraction(self, video_id):
2738 """Report attempt to extract video information."""
2739 self._reporter(u'%s: Extracting video information' % video_id)
2740
2741 def _parse_page(self, video_webpage):
2742 """Extract video information from page"""
2743 # General data
2744 data = {'title': r'class="video_title datawrap">(.*?)</',
2745 'description': r'<div class="datawrap">(.*?)</div>',
2746 'owner': r'\("video_owner_name", "(.*?)"\)',
2747 'upload_date': r'data-date="(.*?)"',
2748 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2749 }
2750 video_info = {}
2751 for piece in data.keys():
2752 mobj = re.search(data[piece], video_webpage)
2753 if mobj is not None:
2754 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2755
2756 # Video urls
2757 video_urls = {}
2758 for fmt in self._available_formats:
2759 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2760 if mobj is not None:
2761 # URL is in a Javascript segment inside an escaped Unicode format within
2762 # the generally utf-8 page
2763 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2764 video_info['video_urls'] = video_urls
2765
2766 return video_info
2767
2768 def _real_initialize(self):
2769 if self._downloader is None:
2770 return
2771
2772 useremail = None
2773 password = None
2774 downloader_params = self._downloader.params
2775
2776 # Attempt to use provided username and password or .netrc data
2777 if downloader_params.get('username', None) is not None:
2778 useremail = downloader_params['username']
2779 password = downloader_params['password']
2780 elif downloader_params.get('usenetrc', False):
2781 try:
2782 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2783 if info is not None:
2784 useremail = info[0]
2785 password = info[2]
2786 else:
2787 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2788 except (IOError, netrc.NetrcParseError), err:
2789 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2790 return
2791
2792 if useremail is None:
2793 return
2794
2795 # Log in
2796 login_form = {
2797 'email': useremail,
2798 'pass': password,
2799 'login': 'Log+In'
2800 }
2801 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2802 try:
2803 self.report_login()
2804 login_results = urllib2.urlopen(request).read()
2805 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2806 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2807 return
2808 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2809 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2810 return
2811
2812 def _real_extract(self, url):
2813 mobj = re.match(self._VALID_URL, url)
2814 if mobj is None:
2815 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2816 return
2817 video_id = mobj.group('ID')
2818
2819 # Get video webpage
2820 self.report_video_webpage_download(video_id)
2821 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2822 try:
2823 page = urllib2.urlopen(request)
2824 video_webpage = page.read()
2825 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2826 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2827 return
2828
2829 # Start extracting information
2830 self.report_information_extraction(video_id)
2831
2832 # Extract information
2833 video_info = self._parse_page(video_webpage)
2834
2835 # uploader
2836 if 'owner' not in video_info:
2837 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2838 return
2839 video_uploader = video_info['owner']
2840
2841 # title
2842 if 'title' not in video_info:
2843 self._downloader.trouble(u'ERROR: unable to extract video title')
2844 return
2845 video_title = video_info['title']
2846 video_title = video_title.decode('utf-8')
2847 video_title = sanitize_title(video_title)
2848
2849 # simplified title
2850 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2851 simple_title = simple_title.strip(ur'_')
2852
2853 # thumbnail image
2854 if 'thumbnail' not in video_info:
2855 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2856 video_thumbnail = ''
2857 else:
2858 video_thumbnail = video_info['thumbnail']
2859
2860 # upload date
2861 upload_date = u'NA'
2862 if 'upload_date' in video_info:
2863 upload_time = video_info['upload_date']
2864 timetuple = email.utils.parsedate_tz(upload_time)
2865 if timetuple is not None:
2866 try:
2867 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2868 except:
2869 pass
2870
2871 # description
2872 video_description = video_info.get('description', 'No description available.')
2873
2874 url_map = video_info['video_urls']
2875 if len(url_map.keys()) > 0:
2876 # Decide which formats to download
2877 req_format = self._downloader.params.get('format', None)
2878 format_limit = self._downloader.params.get('format_limit', None)
2879
2880 if format_limit is not None and format_limit in self._available_formats:
2881 format_list = self._available_formats[self._available_formats.index(format_limit):]
2882 else:
2883 format_list = self._available_formats
2884 existing_formats = [x for x in format_list if x in url_map]
2885 if len(existing_formats) == 0:
2886 self._downloader.trouble(u'ERROR: no known formats available for video')
2887 return
2888 if req_format is None:
2889 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2890 elif req_format == 'worst':
2891 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2892 elif req_format == '-1':
2893 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2894 else:
2895 # Specific format
2896 if req_format not in url_map:
2897 self._downloader.trouble(u'ERROR: requested format not available')
2898 return
2899 video_url_list = [(req_format, url_map[req_format])] # Specific format
2900
2901 for format_param, video_real_url in video_url_list:
2902
2903 # At this point we have a new video
2904 self._downloader.increment_downloads()
2905
2906 # Extension
2907 video_extension = self._video_extensions.get(format_param, 'mp4')
2908
2909 try:
2910 # Process video information
2911 self._downloader.process_info({
2912 'id': video_id.decode('utf-8'),
2913 'url': video_real_url.decode('utf-8'),
2914 'uploader': video_uploader.decode('utf-8'),
2915 'upload_date': upload_date,
2916 'title': video_title,
2917 'stitle': simple_title,
2918 'ext': video_extension.decode('utf-8'),
2919 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2920 'thumbnail': video_thumbnail.decode('utf-8'),
2921 'description': video_description.decode('utf-8'),
2922 'player_url': None,
2923 })
2924 except UnavailableVideoError, err:
2925 self._downloader.trouble(u'\nERROR: unable to download video')
2926
2927 class BlipTVIE(InfoExtractor):
2928 """Information extractor for blip.tv"""
2929
2930 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2931 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2932 IE_NAME = u'blip.tv'
2933
2934 def report_extraction(self, file_id):
2935 """Report information extraction."""
2936 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2937
2938 def report_direct_download(self, title):
2939 """Report information extraction."""
2940 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2941
2942 def _simplify_title(self, title):
2943 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2944 res = res.strip(ur'_')
2945 return res
2946
2947 def _real_extract(self, url):
2948 mobj = re.match(self._VALID_URL, url)
2949 if mobj is None:
2950 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2951 return
2952
2953 if '?' in url:
2954 cchar = '&'
2955 else:
2956 cchar = '?'
2957 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2958 request = urllib2.Request(json_url)
2959 self.report_extraction(mobj.group(1))
2960 info = None
2961 try:
2962 urlh = urllib2.urlopen(request)
2963 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2964 basename = url.split('/')[-1]
2965 title,ext = os.path.splitext(basename)
2966 ext = ext.replace('.', '')
2967 self.report_direct_download(title)
2968 info = {
2969 'id': title,
2970 'url': url,
2971 'title': title,
2972 'stitle': self._simplify_title(title),
2973 'ext': ext,
2974 'urlhandle': urlh
2975 }
2976 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2977 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2978 return
2979 if info is None: # Regular URL
2980 try:
2981 json_code = urlh.read()
2982 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2983 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2984 return
2985
2986 try:
2987 json_data = json.loads(json_code)
2988 if 'Post' in json_data:
2989 data = json_data['Post']
2990 else:
2991 data = json_data
2992
2993 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2994 video_url = data['media']['url']
2995 umobj = re.match(self._URL_EXT, video_url)
2996 if umobj is None:
2997 raise ValueError('Can not determine filename extension')
2998 ext = umobj.group(1)
2999
3000 info = {
3001 'id': data['item_id'],
3002 'url': video_url,
3003 'uploader': data['display_name'],
3004 'upload_date': upload_date,
3005 'title': data['title'],
3006 'stitle': self._simplify_title(data['title']),
3007 'ext': ext,
3008 'format': data['media']['mimeType'],
3009 'thumbnail': data['thumbnailUrl'],
3010 'description': data['description'],
3011 'player_url': data['embedUrl']
3012 }
3013 except (ValueError,KeyError), err:
3014 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3015 return
3016
3017 self._downloader.increment_downloads()
3018
3019 try:
3020 self._downloader.process_info(info)
3021 except UnavailableVideoError, err:
3022 self._downloader.trouble(u'\nERROR: unable to download video')
3023
3024
3025 class MyVideoIE(InfoExtractor):
3026 """Information Extractor for myvideo.de."""
3027
3028 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3029 IE_NAME = u'myvideo'
3030
3031 def __init__(self, downloader=None):
3032 InfoExtractor.__init__(self, downloader)
3033
3034 def report_download_webpage(self, video_id):
3035 """Report webpage download."""
3036 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3037
3038 def report_extraction(self, video_id):
3039 """Report information extraction."""
3040 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3041
3042 def _real_initialize(self):
3043 return
3044
3045 def _real_extract(self,url):
3046 mobj = re.match(self._VALID_URL, url)
3047 if mobj is None:
3048 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3049 return
3050
3051 video_id = mobj.group(1)
3052 simple_title = mobj.group(2).decode('utf-8')
3053 # should actually not be necessary
3054 simple_title = sanitize_title(simple_title)
3055 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
3056
3057 # Get video webpage
3058 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3059 try:
3060 self.report_download_webpage(video_id)
3061 webpage = urllib2.urlopen(request).read()
3062 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3063 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3064 return
3065
3066 self.report_extraction(video_id)
3067 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3068 webpage)
3069 if mobj is None:
3070 self._downloader.trouble(u'ERROR: unable to extract media URL')
3071 return
3072 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3073
3074 mobj = re.search('<title>([^<]+)</title>', webpage)
3075 if mobj is None:
3076 self._downloader.trouble(u'ERROR: unable to extract title')
3077 return
3078
3079 video_title = mobj.group(1)
3080 video_title = sanitize_title(video_title)
3081
3082 try:
3083 self._downloader.process_info({
3084 'id': video_id,
3085 'url': video_url,
3086 'uploader': u'NA',
3087 'upload_date': u'NA',
3088 'title': video_title,
3089 'stitle': simple_title,
3090 'ext': u'flv',
3091 'format': u'NA',
3092 'player_url': None,
3093 })
3094 except UnavailableVideoError:
3095 self._downloader.trouble(u'\nERROR: Unable to download video')
3096
3097 class ComedyCentralIE(InfoExtractor):
3098 """Information extractor for The Daily Show and Colbert Report """
3099
3100 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3101 IE_NAME = u'comedycentral'
3102
3103 def report_extraction(self, episode_id):
3104 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3105
3106 def report_config_download(self, episode_id):
3107 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3108
3109 def report_index_download(self, episode_id):
3110 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3111
3112 def report_player_url(self, episode_id):
3113 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3114
3115 def _simplify_title(self, title):
3116 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3117 res = res.strip(ur'_')
3118 return res
3119
3120 def _real_extract(self, url):
3121 mobj = re.match(self._VALID_URL, url)
3122 if mobj is None:
3123 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3124 return
3125
3126 if mobj.group('shortname'):
3127 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3128 url = 'http://www.thedailyshow.com/full-episodes/'
3129 else:
3130 url = 'http://www.colbertnation.com/full-episodes/'
3131 mobj = re.match(self._VALID_URL, url)
3132 assert mobj is not None
3133
3134 dlNewest = not mobj.group('episode')
3135 if dlNewest:
3136 epTitle = mobj.group('showname')
3137 else:
3138 epTitle = mobj.group('episode')
3139
3140 req = urllib2.Request(url)
3141 self.report_extraction(epTitle)
3142 try:
3143 htmlHandle = urllib2.urlopen(req)
3144 html = htmlHandle.read()
3145 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3146 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3147 return
3148 if dlNewest:
3149 url = htmlHandle.geturl()
3150 mobj = re.match(self._VALID_URL, url)
3151 if mobj is None:
3152 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3153 return
3154 if mobj.group('episode') == '':
3155 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3156 return
3157 epTitle = mobj.group('episode')
3158
3159 mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
3160 if len(mMovieParams) == 0:
3161 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3162 return
3163
3164 playerUrl_raw = mMovieParams[0][0]
3165 self.report_player_url(epTitle)
3166 try:
3167 urlHandle = urllib2.urlopen(playerUrl_raw)
3168 playerUrl = urlHandle.geturl()
3169 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3170 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3171 return
3172
3173 uri = mMovieParams[0][1]
3174 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3175 self.report_index_download(epTitle)
3176 try:
3177 indexXml = urllib2.urlopen(indexUrl).read()
3178 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3179 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3180 return
3181
3182 idoc = xml.etree.ElementTree.fromstring(indexXml)
3183 itemEls = idoc.findall('.//item')
3184 for itemEl in itemEls:
3185 mediaId = itemEl.findall('./guid')[0].text
3186 shortMediaId = mediaId.split(':')[-1]
3187 showId = mediaId.split(':')[-2].replace('.com', '')
3188 officialTitle = itemEl.findall('./title')[0].text
3189 officialDate = itemEl.findall('./pubDate')[0].text
3190
3191 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3192 urllib.urlencode({'uri': mediaId}))
3193 configReq = urllib2.Request(configUrl)
3194 self.report_config_download(epTitle)
3195 try:
3196 configXml = urllib2.urlopen(configReq).read()
3197 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3198 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3199 return
3200
3201 cdoc = xml.etree.ElementTree.fromstring(configXml)
3202 turls = []
3203 for rendition in cdoc.findall('.//rendition'):
3204 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3205 turls.append(finfo)
3206
3207 if len(turls) == 0:
3208 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3209 continue
3210
3211 # For now, just pick the highest bitrate
3212 format,video_url = turls[-1]
3213
3214 self._downloader.increment_downloads()
3215
3216 effTitle = showId + '-' + epTitle
3217 info = {
3218 'id': shortMediaId,
3219 'url': video_url,
3220 'uploader': showId,
3221 'upload_date': officialDate,
3222 'title': effTitle,
3223 'stitle': self._simplify_title(effTitle),
3224 'ext': 'mp4',
3225 'format': format,
3226 'thumbnail': None,
3227 'description': officialTitle,
3228 'player_url': playerUrl
3229 }
3230
3231 try:
3232 self._downloader.process_info(info)
3233 except UnavailableVideoError, err:
3234 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
3235 continue
3236
3237
3238 class EscapistIE(InfoExtractor):
3239 """Information extractor for The Escapist """
3240
3241 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3242 IE_NAME = u'escapist'
3243
3244 def report_extraction(self, showName):
3245 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3246
3247 def report_config_download(self, showName):
3248 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3249
3250 def _simplify_title(self, title):
3251 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3252 res = res.strip(ur'_')
3253 return res
3254
3255 def _real_extract(self, url):
3256 htmlParser = HTMLParser.HTMLParser()
3257
3258 mobj = re.match(self._VALID_URL, url)
3259 if mobj is None:
3260 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3261 return
3262 showName = mobj.group('showname')
3263 videoId = mobj.group('episode')
3264
3265 self.report_extraction(showName)
3266 try:
3267 webPage = urllib2.urlopen(url).read()
3268 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3269 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3270 return
3271
3272 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3273 description = htmlParser.unescape(descMatch.group(1))
3274 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3275 imgUrl = htmlParser.unescape(imgMatch.group(1))
3276 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3277 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3278 configUrlMatch = re.search('config=(.*)$', playerUrl)
3279 configUrl = urllib2.unquote(configUrlMatch.group(1))
3280
3281 self.report_config_download(showName)
3282 try:
3283 configJSON = urllib2.urlopen(configUrl).read()
3284 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3285 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3286 return
3287
3288 # Technically, it's JavaScript, not JSON
3289 configJSON = configJSON.replace("'", '"')
3290
3291 try:
3292 config = json.loads(configJSON)
3293 except (ValueError,), err:
3294 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3295 return
3296
3297 playlist = config['playlist']
3298 videoUrl = playlist[1]['url']
3299
3300 self._downloader.increment_downloads()
3301 info = {
3302 'id': videoId,
3303 'url': videoUrl,
3304 'uploader': showName,
3305 'upload_date': None,
3306 'title': showName,
3307 'stitle': self._simplify_title(showName),
3308 'ext': 'flv',
3309 'format': 'flv',
3310 'thumbnail': imgUrl,
3311 'description': description,
3312 'player_url': playerUrl,
3313 }
3314
3315 try:
3316 self._downloader.process_info(info)
3317 except UnavailableVideoError, err:
3318 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3319
3320
3321 class CollegeHumorIE(InfoExtractor):
3322 """Information extractor for collegehumor.com"""
3323
3324 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3325 IE_NAME = u'collegehumor'
3326
3327 def report_webpage(self, video_id):
3328 """Report information extraction."""
3329 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3330
3331 def report_extraction(self, video_id):
3332 """Report information extraction."""
3333 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3334
3335 def _simplify_title(self, title):
3336 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3337 res = res.strip(ur'_')
3338 return res
3339
3340 def _real_extract(self, url):
3341 htmlParser = HTMLParser.HTMLParser()
3342
3343 mobj = re.match(self._VALID_URL, url)
3344 if mobj is None:
3345 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3346 return
3347 video_id = mobj.group('videoid')
3348
3349 self.report_webpage(video_id)
3350 request = urllib2.Request(url)
3351 try:
3352 webpage = urllib2.urlopen(request).read()
3353 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3354 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3355 return
3356
3357 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3358 if m is None:
3359 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3360 return
3361 internal_video_id = m.group('internalvideoid')
3362
3363 info = {
3364 'id': video_id,
3365 'internal_id': internal_video_id,
3366 }
3367
3368 self.report_extraction(video_id)
3369 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3370 try:
3371 metaXml = urllib2.urlopen(xmlUrl).read()
3372 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3373 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3374 return
3375
3376 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3377 try:
3378 videoNode = mdoc.findall('./video')[0]
3379 info['description'] = videoNode.findall('./description')[0].text
3380 info['title'] = videoNode.findall('./caption')[0].text
3381 info['stitle'] = self._simplify_title(info['title'])
3382 info['url'] = videoNode.findall('./file')[0].text
3383 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3384 info['ext'] = info['url'].rpartition('.')[2]
3385 info['format'] = info['ext']
3386 except IndexError:
3387 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3388 return
3389
3390 self._downloader.increment_downloads()
3391
3392 try:
3393 self._downloader.process_info(info)
3394 except UnavailableVideoError, err:
3395 self._downloader.trouble(u'\nERROR: unable to download video')
3396
3397
3398 class XVideosIE(InfoExtractor):
3399 """Information extractor for xvideos.com"""
3400
3401 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3402 IE_NAME = u'xvideos'
3403
3404 def report_webpage(self, video_id):
3405 """Report information extraction."""
3406 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3407
3408 def report_extraction(self, video_id):
3409 """Report information extraction."""
3410 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3411
3412 def _simplify_title(self, title):
3413 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3414 res = res.strip(ur'_')
3415 return res
3416
3417 def _real_extract(self, url):
3418 htmlParser = HTMLParser.HTMLParser()
3419
3420 mobj = re.match(self._VALID_URL, url)
3421 if mobj is None:
3422 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3423 return
3424 video_id = mobj.group(1).decode('utf-8')
3425
3426 self.report_webpage(video_id)
3427
3428 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3429 try:
3430 webpage = urllib2.urlopen(request).read()
3431 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3432 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3433 return
3434
3435 self.report_extraction(video_id)
3436
3437
3438 # Extract video URL
3439 mobj = re.search(r'flv_url=(.+?)&', webpage)
3440 if mobj is None:
3441 self._downloader.trouble(u'ERROR: unable to extract video url')
3442 return
3443 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3444
3445
3446 # Extract title
3447 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3448 if mobj is None:
3449 self._downloader.trouble(u'ERROR: unable to extract video title')
3450 return
3451 video_title = mobj.group(1).decode('utf-8')
3452
3453
3454 # Extract video thumbnail
3455 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3456 if mobj is None:
3457 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3458 return
3459 video_thumbnail = mobj.group(1).decode('utf-8')
3460
3461
3462
3463 self._downloader.increment_downloads()
3464 info = {
3465 'id': video_id,
3466 'url': video_url,
3467 'uploader': None,
3468 'upload_date': None,
3469 'title': video_title,
3470 'stitle': self._simplify_title(video_title),
3471 'ext': 'flv',
3472 'format': 'flv',
3473 'thumbnail': video_thumbnail,
3474 'description': None,
3475 'player_url': None,
3476 }
3477
3478 try:
3479 self._downloader.process_info(info)
3480 except UnavailableVideoError, err:
3481 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3482
3483
class PostProcessor(object):
    """Base class for post-processing steps.

    A downloader keeps a chain of PostProcessor objects, registered
    through its add_post_processor() method. After a successful download
    it calls run() on each of them in turn: the first one receives the
    initial information and every later one receives what the previous
    run() returned.

    The chain ends when a run() call returns None or when there are no
    more PostProcessors left.

    Like InfoExtractor objects, PostProcessors and downloaders register
    with each other ("mutual registration").
    """

    # Downloader this post processor is attached to, if any.
    _downloader = None

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Attach this post processor to *downloader*."""
        self._downloader = downloader

    def run(self, information):
        """Process one downloaded file.

        *information* is a dictionary like the ones built by
        InfoExtractors, with one extra field, "filepath", pointing to the
        downloaded file.

        Return None to stop the post-processing chain, or an information
        dictionary (possibly the received one with some fields changed)
        to be handed to the next post processor.

        A PostProcessingError may be raised; the calling downloader
        takes it into account.
        """
        # The base implementation passes the information through as is.
        return information
3529
3530
3531 class FFmpegExtractAudioPP(PostProcessor):
3532
def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
    """Store the conversion preferences.

    A *preferredcodec* of None is stored as 'best'; the other arguments
    are stored unchanged.
    """
    PostProcessor.__init__(self, downloader)
    self._preferredcodec = preferredcodec
    if self._preferredcodec is None:
        self._preferredcodec = 'best'
    self._preferredquality = preferredquality
    self._keepvideo = keepvideo
3540
3541 @staticmethod
3542 def get_audio_codec(path):
3543 try:
3544 cmd = ['ffprobe', '-show_streams', '--', path]
3545 handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
3546 output = handle.communicate()[0]
3547 if handle.wait() != 0:
3548 return None
3549 except (IOError, OSError):
3550 return None
3551 audio_codec = None
3552 for line in output.split('\n'):
3553 if line.startswith('codec_name='):
3554 audio_codec = line.split('=')[1].strip()
3555 elif line.strip() == 'codec_type=audio' and audio_codec is not None:
3556 return audio_codec
3557 return None
3558
3559 @staticmethod
3560 def run_ffmpeg(path, out_path, codec, more_opts):
3561 try:
3562 cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
3563 ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
3564 return (ret == 0)
3565 except (IOError, OSError):
3566 return False
3567
3568 def run(self, information):
3569 path = information['filepath']
3570
3571 filecodec = self.get_audio_codec(path)
3572 if filecodec is None:
3573 self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
3574 return None
3575
3576 more_opts = []
3577 if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
3578 if filecodec in ['aac', 'mp3', 'vorbis']:
3579 # Lossless if possible
3580 acodec = 'copy'
3581 extension = filecodec
3582 if filecodec == 'aac':
3583 more_opts = ['-f', 'adts']
3584 if filecodec == 'vorbis':
3585 extension = 'ogg'
3586 else:
3587 # MP3 otherwise.
3588 acodec = 'libmp3lame'
3589 extension = 'mp3'
3590 more_opts = []
3591 if self._preferredquality is not None:
3592 more_opts += ['-ab', self._preferredquality]
3593 else:
3594 # We convert the audio (lossy)
3595 acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'vorbis': 'libvorbis'}[self._preferredcodec]
3596 extension = self._preferredcodec
3597 more_opts = []
3598 if self._preferredquality is not None:
3599 more_opts += ['-ab', self._preferredquality]
3600 if self._preferredcodec == 'aac':
3601 more_opts += ['-f', 'adts']
3602 if self._preferredcodec == 'vorbis':
3603 extension = 'ogg'
3604
3605 (prefix, ext) = os.path.splitext(path)
3606 new_path = prefix + '.' + extension
3607 self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
3608 status = self.run_ffmpeg(path, new_path, acodec, more_opts)
3609
3610 if not status:
3611 self._downloader.to_stderr(u'WARNING: error running ffmpeg')
3612 return None
3613
3614 # Try to update the date time for extracted audio file.
3615 if information.get('filetime') is not None:
3616 try:
3617 os.utime(new_path, (time.time(), information['filetime']))
3618 except:
3619 self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
3620
3621 if not self._keepvideo:
3622 try:
3623 os.remove(path)
3624 except (IOError, OSError):
3625 self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
3626 return None
3627
3628 information['filepath'] = new_path
3629 return information
3630
3631
def updateSelf(downloader, filename):
    ''' Update the program file with the latest version from the repository

    downloader -- object whose to_screen() method is used for progress messages.
    filename   -- path of the program file to overwrite.

    Exits through sys.exit() with an error message when the file is not
    writable, the download fails, or the file cannot be overwritten.
    Returns without writing when the downloaded copy declares the same
    __version__ as the running program.
    '''
    if not os.access(filename, os.W_OK):
        sys.exit('ERROR: no write permissions on %s' % filename)

    downloader.to_screen('Updating to latest version...')

    try:
        # Open outside the inner try so that a failed urlopen() is reported
        # as a download error instead of tripping over an unbound urlh in
        # the cleanup clause.
        urlh = urllib.urlopen(UPDATE_URL)
        try:
            newcontent = urlh.read()
        finally:
            urlh.close()
    except (IOError, OSError):
        sys.exit('ERROR: unable to download latest version')

    vmatch = re.search("__version__ = '([^']+)'", newcontent)
    if vmatch is not None and vmatch.group(1) == __version__:
        downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
        return

    try:
        outf = open(filename, 'wb')
        try:
            outf.write(newcontent)
        finally:
            outf.close()
    except (IOError, OSError):
        sys.exit('ERROR: unable to overwrite current version')

    downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
3664
def parseOpts():
    """Build the command-line option parser and parse the program arguments.

    Returns a (parser, opts, args) tuple: the optparse.OptionParser, the
    parsed option values and the remaining positional arguments.
    """
    # Deferred imports
    import optparse

    def _format_option_string(option):
        ''' ('-o', '--option') -> -o, --option METAVAR'''

        opts = []

        if option._short_opts:
            opts.append(option._short_opts[0])
        if option._long_opts:
            opts.append(option._long_opts[0])
        if len(opts) > 1:
            opts.insert(1, ', ')

        if option.takes_value():
            opts.append(' %s' % option.metavar)

        return "".join(opts)

    def _find_term_columns():
        """Return the terminal width in columns, or None if it is unknown."""
        columns = os.environ.get('COLUMNS', None)
        if columns:
            try:
                return int(columns)
            except ValueError:
                # A non-numeric COLUMNS must not abort startup; ask stty.
                pass

        try:
            sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            out, err = sp.communicate()
            return int(out.split()[1])
        except Exception:
            # stty missing, not a terminal, unexpected output, ...
            pass
        return None

    max_width = 80
    max_help_position = 80

    # No need to wrap help messages if we're on a wide console
    columns = _find_term_columns()
    if columns:
        max_width = columns

    fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
    fmt.format_option_strings = _format_option_string

    kw = {
        'version': __version__,
        'formatter': fmt,
        'usage': '%prog [options] url [url...]',
        'conflict_handler': 'resolve',
    }

    parser = optparse.OptionParser(**kw)

    # option groups
    general = optparse.OptionGroup(parser, 'General Options')
    selection = optparse.OptionGroup(parser, 'Video Selection')
    authentication = optparse.OptionGroup(parser, 'Authentication Options')
    video_format = optparse.OptionGroup(parser, 'Video Format Options')
    postproc = optparse.OptionGroup(parser, 'Post-processing Options')
    filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
    verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

    general.add_option('-h', '--help',
            action='help', help='print this help text and exit')
    general.add_option('-v', '--version',
            action='version', help='print program version and exit')
    general.add_option('-U', '--update',
            action='store_true', dest='update_self', help='update this program to latest version')
    general.add_option('-i', '--ignore-errors',
            action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
    general.add_option('-r', '--rate-limit',
            dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
    general.add_option('-R', '--retries',
            dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
    general.add_option('--dump-user-agent',
            action='store_true', dest='dump_user_agent',
            help='display the current browser identification', default=False)
    general.add_option('--list-extractors',
            action='store_true', dest='list_extractors',
            help='List all supported extractors and the URLs they would handle', default=False)

    selection.add_option('--playlist-start',
            dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
    selection.add_option('--playlist-end',
            dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
    selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',
            help='download only matching titles (regex or caseless sub-string)')
    selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',
            help='skip download for matching titles (regex or caseless sub-string)')

    authentication.add_option('-u', '--username',
            dest='username', metavar='USERNAME', help='account username')
    authentication.add_option('-p', '--password',
            dest='password', metavar='PASSWORD', help='account password')
    authentication.add_option('-n', '--netrc',
            action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)

    video_format.add_option('-f', '--format',
            action='store', dest='format', metavar='FORMAT', help='video format code')
    video_format.add_option('--all-formats',
            action='store_const', dest='format', help='download all available video formats', const='all')
    video_format.add_option('--max-quality',
            action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
    video_format.add_option('-F', '--list-formats',
            action='store_true', dest='listformats', help='list all available formats (currently youtube only)')

    verbosity.add_option('-q', '--quiet',
            action='store_true', dest='quiet', help='activates quiet mode', default=False)
    verbosity.add_option('-s', '--simulate',
            action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
    verbosity.add_option('--skip-download',
            action='store_true', dest='skip_download', help='do not download the video', default=False)
    verbosity.add_option('-g', '--get-url',
            action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
    verbosity.add_option('-e', '--get-title',
            action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
    verbosity.add_option('--get-thumbnail',
            action='store_true', dest='getthumbnail',
            help='simulate, quiet but print thumbnail URL', default=False)
    verbosity.add_option('--get-description',
            action='store_true', dest='getdescription',
            help='simulate, quiet but print video description', default=False)
    verbosity.add_option('--get-filename',
            action='store_true', dest='getfilename',
            help='simulate, quiet but print output filename', default=False)
    verbosity.add_option('--get-format',
            action='store_true', dest='getformat',
            help='simulate, quiet but print output format', default=False)
    verbosity.add_option('--no-progress',
            action='store_true', dest='noprogress', help='do not print progress bar', default=False)
    verbosity.add_option('--console-title',
            action='store_true', dest='consoletitle',
            help='display progress in console titlebar', default=False)

    filesystem.add_option('-t', '--title',
            action='store_true', dest='usetitle', help='use title in file name', default=False)
    filesystem.add_option('-l', '--literal',
            action='store_true', dest='useliteral', help='use literal title in file name', default=False)
    filesystem.add_option('-A', '--auto-number',
            action='store_true', dest='autonumber',
            help='number downloaded files starting from 00000', default=False)
    filesystem.add_option('-o', '--output',
            dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, and %% for a literal percent')
    filesystem.add_option('-a', '--batch-file',
            dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
    filesystem.add_option('-w', '--no-overwrites',
            action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
    filesystem.add_option('-c', '--continue',
            action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
    filesystem.add_option('--no-continue',
            action='store_false', dest='continue_dl',
            help='do not resume partially downloaded files (restart from beginning)')
    filesystem.add_option('--cookies',
            dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
    filesystem.add_option('--no-part',
            action='store_true', dest='nopart', help='do not use .part files', default=False)
    filesystem.add_option('--no-mtime',
            action='store_false', dest='updatetime',
            help='do not use the Last-modified header to set the file modification time', default=True)
    filesystem.add_option('--write-description',
            action='store_true', dest='writedescription',
            help='write video description to a .description file', default=False)
    filesystem.add_option('--write-info-json',
            action='store_true', dest='writeinfojson',
            help='write video metadata to a .info.json file', default=False)

    postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
            help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
    postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
            help='"best", "aac", "vorbis" or "mp3"; best by default')
    postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
            help='ffmpeg audio bitrate specification, 128k by default')
    postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
            help='keeps the video file on disk after the post-processing; the video is erased by default')

    # The order of these calls is the order of the groups in --help output.
    parser.add_option_group(general)
    parser.add_option_group(selection)
    parser.add_option_group(filesystem)
    parser.add_option_group(verbosity)
    parser.add_option_group(video_format)
    parser.add_option_group(authentication)
    parser.add_option_group(postproc)

    opts, args = parser.parse_args()

    return parser, opts, args
3851
def gen_extractors():
    """Build one instance of each supported extractor and return them as a list.

    List order is significant: the first extractor that matches a URL is
    the one that handles it.
    """
    # These three instances are shared with the extractors that wrap them.
    yt = YoutubeIE()
    google = GoogleIE()
    yahoo = YahooIE()

    extractors = [
        YoutubePlaylistIE(yt),
        YoutubeUserIE(yt),
        YoutubeSearchIE(yt),
        yt,
        MetacafeIE(yt),
        DailymotionIE(),
        google,
        GoogleSearchIE(google),
        PhotobucketIE(),
        yahoo,
        YahooSearchIE(yahoo),
        DepositFilesIE(),
        FacebookIE(),
        BlipTVIE(),
        VimeoIE(),
        MyVideoIE(),
        ComedyCentralIE(),
        EscapistIE(),
        CollegeHumorIE(),
        XVideosIE(),
    ]
    # GenericIE is deliberately kept as the very last entry.
    extractors.append(GenericIE())
    return extractors
3883
def main():
    """Command-line entry point.

    Parses the options, validates them, configures a FileDownloader with
    every extractor (plus the audio post-processor when requested),
    optionally self-updates, downloads all URLs from the batch file and
    the command line, and exits with the downloader's return code.
    """
    # Deferred import: only needed when a password has to be prompted for.
    # It must be imported here because no module-level import provides it.
    import getpass

    parser, opts, args = parseOpts()

    # Open appropriate CookieJar
    if opts.cookiefile is None:
        jar = cookielib.CookieJar()
    else:
        try:
            jar = cookielib.MozillaCookieJar(opts.cookiefile)
            if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
                jar.load()
        except (IOError, OSError):
            sys.exit(u'ERROR: unable to open cookie file')

    # Dump user agent
    if opts.dump_user_agent:
        print(std_headers['User-Agent'])
        sys.exit(0)

    # Batch file verification
    batchurls = []
    if opts.batchfile is not None:
        try:
            if opts.batchfile == '-':
                batchfd = sys.stdin
            else:
                batchfd = open(opts.batchfile, 'r')
            try:
                batchurls = batchfd.readlines()
            finally:
                # Close files we opened ourselves; leave stdin alone.
                if batchfd is not sys.stdin:
                    batchfd.close()
            batchurls = [x.strip() for x in batchurls]
            # Drop blank lines and lines starting with '#', '/' or ';'.
            batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
        except IOError:
            sys.exit(u'ERROR: batch file could not be read')
    all_urls = batchurls + args

    # General configuration
    cookie_processor = urllib2.HTTPCookieProcessor(jar)
    opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
    urllib2.install_opener(opener)
    socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

    extractors = gen_extractors()

    if opts.list_extractors:
        for ie in extractors:
            print(ie.IE_NAME)
            # Each URL is listed under the first extractor that accepts it
            # and then removed from the pool.
            matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
            all_urls = filter(lambda url: url not in matchedUrls, all_urls)
            for mu in matchedUrls:
                print(u' ' + mu)
        sys.exit(0)

    # Conflicting, missing and erroneous options
    if opts.usenetrc and (opts.username is not None or opts.password is not None):
        parser.error(u'using .netrc conflicts with giving username/password')
    if opts.password is not None and opts.username is None:
        parser.error(u'account username missing')
    if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
        parser.error(u'using output template conflicts with using title, literal title or auto number')
    if opts.usetitle and opts.useliteral:
        parser.error(u'using title conflicts with using literal title')
    if opts.username is not None and opts.password is None:
        opts.password = getpass.getpass(u'Type account password and press return:')
    if opts.ratelimit is not None:
        numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
        if numeric_limit is None:
            parser.error(u'invalid rate limit specified')
        opts.ratelimit = numeric_limit
    if opts.retries is not None:
        try:
            opts.retries = long(opts.retries)
        except (TypeError, ValueError):
            parser.error(u'invalid retry count specified')
    try:
        opts.playliststart = int(opts.playliststart)
        if opts.playliststart <= 0:
            raise ValueError(u'Playlist start must be positive')
    except (TypeError, ValueError):
        parser.error(u'invalid playlist start number specified')
    try:
        opts.playlistend = int(opts.playlistend)
        if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
            raise ValueError(u'Playlist end must be greater than playlist start')
    except (TypeError, ValueError):
        parser.error(u'invalid playlist end number specified')
    if opts.extractaudio:
        if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis']:
            parser.error(u'invalid audio format specified')

    # File downloader
    fd = FileDownloader({
        'usenetrc': opts.usenetrc,
        'username': opts.username,
        'password': opts.password,
        'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
        'forceurl': opts.geturl,
        'forcetitle': opts.gettitle,
        'forcethumbnail': opts.getthumbnail,
        'forcedescription': opts.getdescription,
        'forcefilename': opts.getfilename,
        'forceformat': opts.getformat,
        'simulate': opts.simulate,
        'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
        'format': opts.format,
        'format_limit': opts.format_limit,
        'listformats': opts.listformats,
        # The first truthy alternative wins; an explicit template comes
        # first and the plain id-based name is the final fallback.
        'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
            or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
            or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
            or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
            or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
            or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
            or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
            or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
            or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
            or u'%(id)s.%(ext)s'),
        'ignoreerrors': opts.ignoreerrors,
        'ratelimit': opts.ratelimit,
        'nooverwrites': opts.nooverwrites,
        'retries': opts.retries,
        'continuedl': opts.continue_dl,
        'noprogress': opts.noprogress,
        'playliststart': opts.playliststart,
        'playlistend': opts.playlistend,
        'logtostderr': opts.outtmpl == '-',
        'consoletitle': opts.consoletitle,
        'nopart': opts.nopart,
        'updatetime': opts.updatetime,
        'writedescription': opts.writedescription,
        'writeinfojson': opts.writeinfojson,
        'matchtitle': opts.matchtitle,
        'rejecttitle': opts.rejecttitle,
        })
    for extractor in extractors:
        fd.add_info_extractor(extractor)

    # PostProcessors
    if opts.extractaudio:
        fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

    # Update version
    if opts.update_self:
        updateSelf(fd, sys.argv[0])

    # Maybe do nothing
    if len(all_urls) < 1:
        if not opts.update_self:
            parser.error(u'you must provide at least one URL')
        else:
            sys.exit()
    retcode = fd.download(all_urls)

    # Dump cookie jar if requested
    if opts.cookiefile is not None:
        try:
            jar.save()
        except (IOError, OSError):
            sys.exit(u'ERROR: unable to save cookie jar')

    sys.exit(retcode)
4043
4044
if __name__ == '__main__':
    # Script entry point: run main() and turn the known failure modes
    # into an exit status or a short message instead of a traceback.
    try:
        main()
    except DownloadError:
        # No message is printed here; only the exit status is set.
        sys.exit(1)
    except SameFileError:
        sys.exit(u'ERROR: fixed output name but more than one file to download')
    except KeyboardInterrupt:
        sys.exit(u'\nERROR: Interrupted by user')
4054
4055 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: