]> Raphaël G. Git Repositories - youtubedl/blob - youtube-dl
debian/control: Update short description.
[youtubedl] / youtube-dl
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Contributors to this script, one name per tuple entry. Several names are
# non-ASCII, which is why the file declares a UTF-8 source encoding above.
__author__ = (
    'Ricardo Garcia Gonzalez',
    'Danny Colligan',
    'Benjamin Johnson',
    'Vasyl\' Vavrychuk',
    'Witold Baryluk',
    'Paweł Paprota',
    'Gergely Imreh',
    'Rogério Brito',
    'Philipp Hagemeister',
    'Sören Schulze',
)

__license__ = 'Public Domain'
# Release identifier in YYYY.MM.DD form.
__version__ = '2011.09.27'

# URL of the script on the upstream master branch.
# NOTE(review): presumably fetched by a self-update routine that is not
# visible in this part of the file -- confirm before relying on it.
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
21
22 import cookielib
23 import datetime
24 import gzip
25 import htmlentitydefs
26 import HTMLParser
27 import httplib
28 import locale
29 import math
30 import netrc
31 import os
32 import os.path
33 import re
34 import socket
35 import string
36 import subprocess
37 import sys
38 import time
39 import urllib
40 import urllib2
41 import warnings
42 import zlib
43
44 if os.name == 'nt':
45 import ctypes
46
47 try:
48 import email.utils
49 except ImportError: # Python 2.4
50 import email.Utils
51 try:
52 import cStringIO as StringIO
53 except ImportError:
54 import StringIO
55
56 # parse_qs was moved from the cgi module to the urlparse module recently.
57 try:
58 from urlparse import parse_qs
59 except ImportError:
60 from cgi import parse_qs
61
62 try:
63 import lxml.etree
64 except ImportError:
65 pass # Handled below
66
67 try:
68 import xml.etree.ElementTree
69 except ImportError: # Python<2.5: Not officially supported, but let it slip
70 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
71
# Default HTTP request headers. YoutubeDLHandler.http_request() forces each
# of these onto every outgoing request, replacing any same-named header the
# caller had set.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}

# Unicode string made of the ASCII letters followed by the ASCII digits.
# NOTE(review): presumably the whitelist used to build simplified titles;
# its users are outside this part of the file.
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
81
# Use the standard json module when available; otherwise install a minimal
# stand-in class named ``json`` that only provides loads() (there is no
# dump()/dumps() in the fallback).
try:
    import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
    import re
    class json(object):
        @staticmethod
        def loads(s):
            """Parse the UTF-8 encoded JSON document s and return its value.

            Raises ValueError on malformed input or trailing data.
            """
            s = s.decode('UTF-8')
            # Every parse*() helper below takes the index at which its value
            # starts and returns a tuple (index after the value, value).
            def raiseError(msg, i):
                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
            def skipSpace(i, expectMore=True):
                # Advance past JSON whitespace; with expectMore, running off
                # the end of the input is an error.
                while i < len(s) and s[i] in ' \t\r\n':
                    i += 1
                if expectMore:
                    if i >= len(s):
                        raiseError('Premature end', i)
                return i
            def decodeEscape(match):
                # re.sub() callback: group 1 is the text following a backslash.
                esc = match.group(1)
                _STATIC = {
                    '"': '"',
                    '\\': '\\',
                    '/': '/',
                    'b': unichr(0x8),
                    'f': unichr(0xc),
                    'n': '\n',
                    'r': '\r',
                    't': '\t',
                }
                if esc in _STATIC:
                    return _STATIC[esc]
                if esc[0] == 'u':
                    # Plain \uXXXX escape
                    if len(esc) == 1+4:
                        return unichr(int(esc[1:5], 16))
                    # Surrogate pair \uD8xx\uDCxx: combine both halves into
                    # a single code point above U+FFFF.
                    if len(esc) == 5+6 and esc[5:7] == '\\u':
                        hi = int(esc[1:5], 16)
                        low = int(esc[7:11], 16)
                        return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                raise ValueError('Unknown escape ' + str(esc))
            def parseString(i):
                i += 1
                e = i
                # Find the closing quote: a '"' preceded by an even number
                # of backslashes (an odd count means the quote is escaped).
                while True:
                    e = s.index('"', e)
                    bslashes = 0
                    while s[e-bslashes-1] == '\\':
                        bslashes += 1
                    if bslashes % 2 == 1:
                        e += 1
                        continue
                    break
                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                stri = rexp.sub(decodeEscape, s[i:e])
                return (e+1,stri)
            def parseObj(i):
                i += 1
                res = {}
                i = skipSpace(i)
                if s[i] == '}': # Empty dictionary
                    return (i+1,res)
                while True:
                    if s[i] != '"':
                        raiseError('Expected a string object key', i)
                    i,key = parseString(i)
                    i = skipSpace(i)
                    if i >= len(s) or s[i] != ':':
                        raiseError('Expected a colon', i)
                    i,val = parse(i+1)
                    res[key] = val
                    i = skipSpace(i)
                    if s[i] == '}':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected comma or closing curly brace', i)
                    i = skipSpace(i+1)
            def parseArray(i):
                res = []
                i = skipSpace(i+1)
                if s[i] == ']': # Empty array
                    return (i+1,res)
                while True:
                    i,val = parse(i)
                    res.append(val)
                    i = skipSpace(i) # Raise exception if premature end
                    if s[i] == ']':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected a comma or closing bracket', i)
                    i = skipSpace(i+1)
            def parseDiscrete(i):
                # The literals true, false and null
                for k,v in {'true': True, 'false': False, 'null': None}.items():
                    if s.startswith(k, i):
                        return (i+len(k), v)
                raiseError('Not a boolean (or null)', i)
            def parseNumber(i):
                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                if mobj is None:
                    raiseError('Not a number', i)
                nums = mobj.group(1)
                # A fraction or an exponent makes the result a float
                if '.' in nums or 'e' in nums or 'E' in nums:
                    return (i+len(nums), float(nums))
                return (i+len(nums), int(nums))
            # Dispatch on the first character of a value; anything not listed
            # here is attempted as a number.
            CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
            def parse(i):
                i = skipSpace(i)
                i,res = CHARMAP.get(s[i], parseNumber)(i)
                # Trailing whitespace is allowed to reach the end of input
                i = skipSpace(i, False)
                return (i,res)
            i,res = parse(0)
            if i < len(s):
                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
            return res
194
def preferredencoding():
    """Get preferred encoding.

    Returns the encoding reported by locale.getpreferredencoding() when
    that encoding can actually be used to encode text, and 'UTF-8'
    otherwise (unknown codec name, broken locale configuration, ...).
    """
    try:
        pref = locale.getpreferredencoding()
        # Make sure the reported codec really exists and works.
        u'TEST'.encode(pref)
    except Exception:
        # Any failure to obtain a usable codec falls back to UTF-8, but
        # KeyboardInterrupt/SystemExit are no longer swallowed here.
        pref = 'UTF-8'
    return pref
210
211
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity text
    without the surrounding '&' and ';'.

    Named entities (amp, eacute, ...) and numeric character references,
    both decimal (#233) and hexadecimal (#xe9 / #XE9), are converted.
    Anything else is returned unchanged in its literal '&...;' form.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Numeric character reference. The whole entity has to match: hex
    # references may contain a-f digits, and trailing junk must not be
    # silently dropped.
    mobj = re.match(r'#([xX][0-9a-fA-F]+|[0-9]+)$', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr[0] in 'xX':
            codepoint = int(numstr[1:], 16)
        else:
            codepoint = int(numstr, 10)
        try:
            return unichr(codepoint)
        except (ValueError, OverflowError):
            # Code point outside the range this interpreter can represent:
            # fall through and keep the literal text.
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
237
238
239 def sanitize_title(utitle):
240 """Sanitizes a video title so it could be used as part of a filename."""
241 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
242 return utitle.replace(unicode(os.sep), u'%')
243
244
def sanitize_open(filename, open_mode):
    """Open filename, retrying once with a tweaked name if that fails.

    The name u'-' stands for standard output (switched to binary mode on
    win32). If opening the requested name fails, the characters that are
    forbidden in win32 file names are replaced by '#' and the open is
    attempted again; an error from that second attempt is left for the
    caller to handle, like with the standard open() function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename != u'-':
            return (open(filename, open_mode), filename)
        if sys.platform == 'win32':
            import msvcrt
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout, filename)
    except (IOError, OSError):
        # In case of error, try to remove win32 forbidden chars
        fallback_name = re.sub(u'[/<>:"\\|\\?\\*]', u'#', filename)

        # An exception here should be caught in the caller
        return (open(fallback_name, open_mode), fallback_name)
270
271
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
279
280
class DownloadError(Exception):
    """Raised when a download problem occurs and errors are not ignored.

    FileDownloader.trouble() raises it, carrying the error message,
    unless the 'ignoreerrors' option is set.
    """
289
290
class SameFileError(Exception):
    """Raised when several downloads would end up in the same file.

    FileDownloader.download() raises it when more than one URL is given
    together with an output template that contains no substitutions.
    """
298
299
class PostProcessingError(Exception):
    """Signals a failure inside a postprocessing step.

    A PostProcessor's .run() method may raise it to report that its task
    could not be completed.
    """
307
308
class UnavailableVideoError(Exception):
    """Raised when a video cannot be obtained in the requested format.

    The requested format is not available for that video.
    """
316
317
class ContentTooShortError(Exception):
    """Raised when fewer bytes arrive than the server announced.

    FileDownloader objects may raise it when a downloaded file is smaller
    than the size announced first, which usually means the connection
    was interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded, self.expected = downloaded, expected
332
333
class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    Once installed in an OpenerDirector, this handler puts the standard
    headers on every HTTP request and transparently decodes gzipped and
    deflated responses. A request that must not be compressed only needs
    to carry the HTTP header "Youtubedl-No-Compression"; that marker is
    stripped, together with Accept-Encoding, before the real request is
    made.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress data sent as raw deflate or as a zlib stream."""
        try:
            # Raw deflate stream, no zlib header
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            # Regular zlib-wrapped stream
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response that carries the status code."""
        if not hasattr(urllib2.addinfourl, 'getcode'):
            # This addinfourl takes no code argument: set it afterwards
            wrapped = urllib2.addinfourl(stream, headers, url)
            wrapped.code = code
            return wrapped
        return urllib2.addinfourl(stream, headers, url, code)

    def http_request(self, req):
        """Force the standard headers and honour the no-compression marker."""
        for name, value in std_headers.items():
            if name in req.headers:
                del req.headers[name]
            req.add_header(name, value)
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        """Return resp, wrapped so that its body reads decompressed."""
        original = resp
        encoding = resp.headers.get('Content-encoding', '')
        if encoding == 'gzip':
            body = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(body, original.headers, original.url, original.code)
            resp.msg = original.msg
        elif encoding == 'deflate':
            body = StringIO.StringIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(body, original.headers, original.url, original.code)
            resp.msg = original.msg
        return resp
391
392
393 class FileDownloader(object):
394 """File Downloader class.
395
396 File downloader objects are the ones responsible for downloading the
397 actual video file and writing it to disk if the user has requested
398 it, among some other tasks. In most cases there should be one per
399 program. As, given a video URL, the downloader doesn't know how to
400 extract all the needed information, task that InfoExtractors do, it
401 has to pass the URL to one of them.
402
403 For this, file downloader objects have a method that allows
404 InfoExtractors to be registered in a given order. When it is passed
405 a URL, the file downloader hands it to the first InfoExtractor it
406 finds that reports being able to handle it. The InfoExtractor extracts
407 all the information about the video or videos the URL refers to, and
408 asks the FileDownloader to process the video information, possibly
409 downloading the video.
410
411 File downloaders accept a lot of parameters. In order not to saturate
412 the object constructor with arguments, it receives a dictionary of
413 options instead. These options are available through the params
414 attribute for the InfoExtractors to use. The FileDownloader also
415 registers itself as the downloader in charge for the InfoExtractors
416 that are added to it, so this is a "mutual registration".
417
418 Available options:
419
420 username: Username for authentication purposes.
421 password: Password for authentication purposes.
422 usenetrc: Use netrc for authentication instead.
423 quiet: Do not print messages to stdout.
424 forceurl: Force printing final URL.
425 forcetitle: Force printing title.
426 forcethumbnail: Force printing thumbnail URL.
427 forcedescription: Force printing description.
428 forcefilename: Force printing final filename.
429 simulate: Do not download the video files.
430 format: Video format code.
431 format_limit: Highest quality format to try.
432 outtmpl: Template for output names.
433 ignoreerrors: Do not stop on download errors.
434 ratelimit: Download speed limit, in bytes/sec.
435 nooverwrites: Prevent overwriting files.
436 retries: Number of times to retry for HTTP error 5xx
437 continuedl: Try to continue downloads if possible.
438 noprogress: Do not print the progress bar.
439 playliststart: Playlist item to start at.
440 playlistend: Playlist item to end at.
441 matchtitle: Download only matching titles.
442 rejecttitle: Reject downloads for matching titles.
443 logtostderr: Log messages to stderr instead of stdout.
444 consoletitle: Display progress in console window's titlebar.
445 nopart: Do not use temporary .part files.
446 updatetime: Use the Last-modified header to set output file timestamps.
447 writedescription: Write the video description to a .description file
448 writeinfojson: Write the video metadata to a .info.json file
449 """
450
451 params = None
452 _ies = []
453 _pps = []
454 _download_retcode = None
455 _num_downloads = None
456 _screen_file = None
457
458 def __init__(self, params):
459 """Create a FileDownloader object with the given options."""
460 self._ies = []
461 self._pps = []
462 self._download_retcode = 0
463 self._num_downloads = 0
464 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
465 self.params = params
466
@staticmethod
def format_bytes(bytes):
    """Format a byte quantity as a short human-readable string.

    bytes may be a number or a numeric string; None yields 'N/A'.
    The result has two decimals followed by a one-letter suffix taken
    from 'bkMGTPEZY' (powers of 1024), e.g. 1536 -> '1.50k'.
    """
    if bytes is None:
        return 'N/A'
    suffixes = 'bkMGTPEZY'
    value = float(bytes)
    if value < 1.0:
        # Covers zero, fractions and negative values. math.log() would
        # yield a negative exponent (or fail), and a negative index would
        # silently pick a suffix from the wrong end of the table.
        exponent = 0
    else:
        exponent = int(math.log(value, 1024.0))
        # Never run past the largest known suffix
        exponent = min(exponent, len(suffixes) - 1)
    converted = value / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
480
@staticmethod
def calc_percent(byte_counter, data_len):
    """Return the completed percentage as a 6-character string.

    When the total length is unknown (None) a placeholder is returned.
    """
    if data_len is None:
        return '---.-%'
    ratio = float(byte_counter) / float(data_len)
    percent = '%3.1f%%' % (ratio * 100.0)
    return '%6s' % percent
486
@staticmethod
def calc_eta(start, now, total, current):
    """Return the estimated remaining time as 'MM:SS'.

    '--:--' is returned when no estimate can be given: unknown total,
    nothing transferred yet, less than a millisecond elapsed, or an
    estimate above 99 minutes.
    """
    unknown = '--:--'
    if total is None:
        return unknown
    elapsed = now - start
    if current == 0 or elapsed < 0.001: # One millisecond
        return unknown
    rate = float(current) / elapsed
    remaining = long((float(total) - float(current)) / rate)
    (minutes, seconds) = divmod(remaining, 60)
    if minutes > 99:
        return unknown
    return '%02d:%02d' % (minutes, seconds)
500
@staticmethod
def calc_speed(start, now, bytes):
    """Return the average transfer speed as a 10-character string.

    A placeholder is returned when nothing has been transferred or when
    less than a millisecond has elapsed.
    """
    elapsed = now - start
    if bytes == 0 or elapsed < 0.001: # One millisecond
        speed = '---b/s'
    else:
        speed = '%s/s' % FileDownloader.format_bytes(float(bytes) / elapsed)
    return '%10s' % speed
507
@staticmethod
def best_block_size(elapsed_time, bytes):
    """Pick the size of the next block to read.

    The measured rate of the last read (bytes / elapsed_time) is used,
    kept between half and double the last block size and never above
    4 MB. When the last read took less than a millisecond the upper
    bound is returned directly.
    """
    lower = max(bytes / 2.0, 1.0)
    upper = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
    if elapsed_time < 0.001:
        return long(upper)
    rate = bytes / elapsed_time
    # The upper bound is checked first on purpose: it wins whenever the
    # two bounds cross.
    if rate > upper:
        chosen = upper
    elif rate < lower:
        chosen = lower
    else:
        chosen = rate
    return long(chosen)
520
@staticmethod
def parse_bytes(bytestr):
    """Parse a string indicating a byte quantity into a long integer.

    The string is a decimal number optionally followed by one of the
    suffixes k, M, G, T, P, E, Z, Y (case-insensitive, powers of 1024).
    Returns None when the string does not have that form.
    """
    parsed = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
    if parsed is None:
        return None
    magnitude, suffix = parsed.groups()
    # An empty suffix is found at index 0, i.e. plain bytes
    exponent = 'bkmgtpezy'.index(suffix.lower())
    return long(round(float(magnitude) * 1024.0 ** exponent))
530
def add_info_extractor(self, ie):
    """Register ie as the last InfoExtractor and become its downloader."""
    self._ies += [ie]
    ie.set_downloader(self)
535
def add_post_processor(self, pp):
    """Append pp to the postprocessing chain and become its downloader."""
    self._pps += [pp]
    pp.set_downloader(self)
540
def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
    """Print message to the screen stream if not in quiet mode.

    No end-of-line is added when skip_eol is true. An encoding failure
    is re-raised unless ignore_encoding_errors is set.
    """
    try:
        if not self.params.get('quiet', False):
            terminator = [u'\n', u''][skip_eol]
            output = (u'%s%s' % (message, terminator)).encode(preferredencoding())
            print >>self._screen_file, output,
        # NOTE(review): the original indentation was lost; the flush is
        # kept outside the quiet check here -- confirm against upstream.
        self._screen_file.flush()
    except UnicodeEncodeError:
        if not ignore_encoding_errors:
            raise
551
def to_stderr(self, message):
    """Print message to stderr, encoded with the preferred encoding."""
    encoded = message.encode(preferredencoding())
    print >>sys.stderr, encoded
555
def to_cons_title(self, message):
    """Set console/terminal window title to message.

    Does nothing unless the 'consoletitle' option is set.
    """
    enabled = self.params.get('consoletitle', False)
    if enabled:
        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
        elif 'TERM' in os.environ:
            title = message.encode(preferredencoding())
            sys.stderr.write('\033]0;%s\007' % title)
566
def fixed_template(self):
    """Tell whether the output template contains no %(name)s field."""
    template = self.params['outtmpl']
    return re.search(u'(?u)%\\(.+?\\)s', template) is None
570
def trouble(self, message=None):
    """Determine action to take when a download problem appears.

    The message, if any, is printed to stderr first. Then, unless the
    'ignoreerrors' option is set, DownloadError is raised with that
    message; otherwise the downloader's return code is set to 1.
    """
    if message is not None:
        self.to_stderr(message)
    ignoring = self.params.get('ignoreerrors', False)
    if ignoring:
        self._download_retcode = 1
        return
    raise DownloadError(message)
583
def slow_down(self, start_time, byte_counter):
    """Sleep if the download speed is over the rate limit.

    The limit is the 'ratelimit' option, in bytes per second. The sleep
    lasts long enough for the average speed since start_time to fall
    back to that limit.
    """
    rate_limit = self.params.get('ratelimit', None)
    if rate_limit is None or byte_counter == 0:
        return
    now = time.time()
    elapsed = now - start_time
    if elapsed <= 0.0:
        return
    if float(byte_counter) / elapsed > rate_limit:
        time.sleep((byte_counter - rate_limit * elapsed) / rate_limit)
596
def temp_name(self, filename):
    """Returns a temporary filename for the given filename.

    The name is returned unchanged when .part files are disabled
    ('nopart' option), when it stands for stdout (u'-') or when it
    exists but is not a regular file; otherwise u'.part' is appended.
    """
    if self.params.get('nopart', False):
        return filename
    if filename == u'-':
        return filename
    if os.path.exists(filename) and not os.path.isfile(filename):
        return filename
    return filename + u'.part'
603
def undo_temp_name(self, filename):
    """Strip a trailing u'.part' from filename, if there is one."""
    suffix = u'.part'
    if not filename.endswith(suffix):
        return filename
    return filename[:-len(suffix)]
608
def try_rename(self, old_filename, new_filename):
    """Rename a file, reporting a failure through trouble().

    Nothing is done when both names are equal.
    """
    try:
        if old_filename != new_filename:
            os.rename(old_filename, new_filename)
    except (IOError, OSError):
        self.trouble(u'ERROR: unable to rename file')
616
def try_utime(self, filename, last_modified_hdr):
    """Try to set the last-modified time of the given file.

    last_modified_hdr is the value of a Last-modified HTTP header (an
    RFC 2822 date string) or None. The access time of the file is set
    to the current time.

    Returns the timestamp parsed from the header, or None when there is
    no header, filename is not a regular file or the header cannot be
    parsed. Failing to actually change the file times is ignored.
    """
    if last_modified_hdr is None:
        return
    if not os.path.isfile(filename):
        return
    filetime = timeconvert(last_modified_hdr)
    if filetime is None:
        return filetime
    try:
        os.utime(filename, (time.time(), filetime))
    except Exception:
        # Best effort only: a file whose timestamp cannot be changed is
        # not worth failing the download for. KeyboardInterrupt and
        # SystemExit are deliberately not swallowed.
        pass
    return filetime
634
def report_writedescription(self, descfn):
    """Report that the description file is being written."""
    message = u'[info] Writing video description to: %s' % descfn
    self.to_screen(message, ignore_encoding_errors=True)
638
def report_writeinfojson(self, infofn):
    """Report that the JSON metadata file is being written."""
    message = u'[info] Video description metadata as JSON to: %s' % infofn
    self.to_screen(message, ignore_encoding_errors=True)
642
def report_destination(self, filename):
    """Report the name of the file the download is written to."""
    message = u'[download] Destination: %s' % filename
    self.to_screen(message, ignore_encoding_errors=True)
646
def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
    """Report download progress on screen and in the console title.

    Nothing is reported when the 'noprogress' option is set.
    """
    if self.params.get('noprogress', False):
        return
    screen_fields = (percent_str, data_len_str, speed_str, eta_str)
    self.to_screen(u'\r[download] %s of %s at %s ETA %s' % screen_fields, skip_eol=True)
    # The title gets the same fields without their padding
    title_fields = (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip())
    self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' % title_fields)
655
def report_resuming_byte(self, resume_len):
    """Report that the download is resumed at the given byte offset."""
    message = u'[download] Resuming download at byte %s' % resume_len
    self.to_screen(message)
659
def report_retry(self, count, retries):
    """Report a retry after an HTTP 5xx error (attempt count of retries)."""
    message = u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries)
    self.to_screen(message)
663
def report_file_already_downloaded(self, file_name):
    """Report file has already been fully downloaded.

    When the file name cannot be encoded for display, a generic message
    without the name is printed instead.
    """
    try:
        self.to_screen(u'[download] %s has already been downloaded' % file_name)
    except UnicodeEncodeError:
        self.to_screen(u'[download] The file has already been downloaded')
670
def report_unable_to_resume(self):
    """Report that the download could not be resumed."""
    message = u'[download] Unable to resume'
    self.to_screen(message)
674
def report_finish(self):
    """Report download finished.

    With the progress bar disabled a completion message is printed;
    otherwise an empty line ends the progress bar line.
    """
    if self.params.get('noprogress', False):
        message = u'[download] Download completed'
    else:
        message = u''
    self.to_screen(message)
681
def increment_downloads(self):
    """Advance the ordinal used to number downloaded files by one."""
    self._num_downloads = self._num_downloads + 1
685
def prepare_filename(self, info_dict):
    """Generate the output filename.

    The 'outtmpl' option is filled in with a copy of info_dict extended
    with 'epoch' (current time in whole seconds) and 'autonumber' (the
    download ordinal, zero-padded to five digits). A template that
    cannot be filled in is reported through trouble() and yields None.
    """
    try:
        fields = dict(info_dict)
        fields['epoch'] = unicode(long(time.time()))
        fields['autonumber'] = unicode('%05d' % self._num_downloads)
        return self.params['outtmpl'] % fields
    except (ValueError, KeyError):
        self.trouble(u'ERROR: invalid system charset or erroneous output template')
        return None
697
def process_info(self, info_dict):
    """Process a single dictionary returned by an InfoExtractor.

    In order: prints the fields requested by the force* options, stops
    in simulate mode or when no filename could be generated, applies the
    title match/reject filters and the no-overwrite check, creates the
    target directory, optionally writes the .description and .info.json
    side files, then downloads the video and runs the postprocessing
    chain. Problems are reported through trouble(), which raises
    DownloadError unless errors are being ignored.

    Raises UnavailableVideoError when the download itself fails with an
    OSError/IOError.
    """
    filename = self.prepare_filename(info_dict)

    # Forced printings
    if self.params.get('forcetitle', False):
        print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
    if self.params.get('forceurl', False):
        print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
    if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
        print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
    if self.params.get('forcedescription', False) and 'description' in info_dict:
        print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
    if self.params.get('forcefilename', False) and filename is not None:
        print filename.encode(preferredencoding(), 'xmlcharrefreplace')
    if self.params.get('forceformat', False):
        print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

    # Do nothing else if in simulate mode
    if self.params.get('simulate', False):
        return

    # prepare_filename() already reported the problem
    if filename is None:
        return

    # Title filters: both patterns are matched case-insensitively.
    # NOTE(review): title is a byte string here but is interpolated into
    # unicode messages below; with a non-ASCII title and a non-ASCII
    # preferred encoding that looks like it can raise UnicodeDecodeError
    # -- confirm.
    matchtitle=self.params.get('matchtitle',False)
    rejecttitle=self.params.get('rejecttitle',False)
    title=info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
    if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
        self.to_screen(u'[download] "%s" title did not match pattern "%s"' % (title, matchtitle))
        return
    if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
        self.to_screen(u'[download] "%s" title matched reject pattern "%s"' % (title, rejecttitle))
        return

    if self.params.get('nooverwrites', False) and os.path.exists(filename):
        self.to_stderr(u'WARNING: file exists and will be skipped')
        return

    # Create the directory of the output file if it does not exist yet
    try:
        dn = os.path.dirname(filename)
        if dn != '' and not os.path.exists(dn):
            os.makedirs(dn)
    except (OSError, IOError), err:
        self.trouble(u'ERROR: unable to create directory ' + unicode(err))
        return

    if self.params.get('writedescription', False):
        try:
            descfn = filename + '.description'
            self.report_writedescription(descfn)
            descfile = open(descfn, 'wb')
            try:
                descfile.write(info_dict['description'].encode('utf-8'))
            finally:
                descfile.close()
        except (OSError, IOError):
            self.trouble(u'ERROR: Cannot write description file ' + descfn)
            return

    if self.params.get('writeinfojson', False):
        infofn = filename + '.info.json'
        self.report_writeinfojson(infofn)
        # The fallback json class defined in this file has no dump()
        try:
            json.dump
        except (NameError,AttributeError):
            self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
            return
        try:
            infof = open(infofn, 'wb')
            try:
                # 'urlhandle' is left out of the dumped metadata
                json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
                json.dump(json_info_dict, infof)
            finally:
                infof.close()
        except (OSError, IOError):
            self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
            return

    if not self.params.get('skip_download', False):
        try:
            success = self._do_download(filename, info_dict)
        except (OSError, IOError), err:
            raise UnavailableVideoError
        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
            self.trouble(u'ERROR: unable to download video data: %s' % str(err))
            return
        except (ContentTooShortError, ), err:
            self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
            return

        # NOTE(review): the original indentation was lost; this check is
        # assumed to belong inside the skip_download test, since `success`
        # is only assigned there -- confirm against upstream.
        if success:
            try:
                self.post_process(filename, info_dict)
            except (PostProcessingError), err:
                self.trouble(u'ERROR: postprocessing: %s' % str(err))
                return
795
def download(self, url_list):
    """Download a given list of URLs.

    Each URL is handed to the first registered InfoExtractor that
    declares itself suitable for it; a URL nobody can handle is reported
    through trouble(). Returns the downloader's return code.

    Raises SameFileError when several URLs are given together with an
    output template that has no variable part.
    """
    if len(url_list) > 1 and self.fixed_template():
        raise SameFileError(self.params['outtmpl'])

    for url in url_list:
        for ie in self._ies:
            if ie.suitable(url):
                # Extract information from URL and process it, then
                # go to the next URL
                ie.extract(url)
                break
        else:
            # The extractor list was exhausted without a match
            self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)

    return self._download_retcode
821
def post_process(self, filename, ie_info):
    """Run the postprocessing chain on the given file.

    Each PostProcessor receives the information returned by the previous
    one, starting with a copy of ie_info extended with 'filepath'. The
    chain stops as soon as one of them returns None.
    """
    info = dict(ie_info, filepath=filename)
    for pp in self._pps:
        info = pp.run(info)
        if info is None:
            return
830
def _download_with_rtmpdump(self, filename, url, player_url):
    """Download an RTMP stream by running the external rtmpdump program.

    The data is written to the temporary name of filename and renamed
    once rtmpdump finishes with exit code 0. player_url, when not None,
    is passed to rtmpdump through its -W option.

    Returns True on success and False otherwise (after reporting the
    problem through trouble()).
    """
    self.report_destination(filename)
    tmpfilename = self.temp_name(filename)

    # Check for rtmpdump first
    # NOTE(review): the devnull file object opened here is never closed
    # explicitly.
    try:
        subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
    except (OSError, IOError):
        self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
        return False

    # Download using rtmpdump. rtmpdump returns exit code 2 when
    # the connection was interrupted and resuming appears to be
    # possible. This is part of rtmpdump's normal usage, AFAIK.
    # The [a, b][condition] expressions below select b when the condition
    # is true and a otherwise.
    basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
    retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
    while retval == 2 or retval == 1:
        prevsize = os.path.getsize(tmpfilename)
        self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
        time.sleep(5.0) # This seems to be needed
        retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
        cursize = os.path.getsize(tmpfilename)
        # Give up when an attempt ending with code 1 made no progress
        if prevsize == cursize and retval == 1:
            break
        # Some rtmp streams seem to abort after ~ 99.8%. Don't complain for those
        if prevsize == cursize and retval == 2 and cursize > 1024:
            self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
            retval = 0
            break
    if retval == 0:
        self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
        self.try_rename(tmpfilename, filename)
        return True
    else:
        self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
        return False
867
868 def _do_download(self, filename, info_dict):
869 url = info_dict['url']
870 player_url = info_dict.get('player_url', None)
871
872 # Check file already present
873 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
874 self.report_file_already_downloaded(filename)
875 return True
876
877 # Attempt to download using rtmpdump
878 if url.startswith('rtmp'):
879 return self._download_with_rtmpdump(filename, url, player_url)
880
881 tmpfilename = self.temp_name(filename)
882 stream = None
883
884 # Do not include the Accept-Encoding header
885 headers = {'Youtubedl-no-compression': 'True'}
886 basic_request = urllib2.Request(url, None, headers)
887 request = urllib2.Request(url, None, headers)
888
889 # Establish possible resume length
890 if os.path.isfile(tmpfilename):
891 resume_len = os.path.getsize(tmpfilename)
892 else:
893 resume_len = 0
894
895 open_mode = 'wb'
896 if resume_len != 0:
897 if self.params.get('continuedl', False):
898 self.report_resuming_byte(resume_len)
899 request.add_header('Range','bytes=%d-' % resume_len)
900 open_mode = 'ab'
901 else:
902 resume_len = 0
903
904 count = 0
905 retries = self.params.get('retries', 0)
906 while count <= retries:
907 # Establish connection
908 try:
909 if count == 0 and 'urlhandle' in info_dict:
910 data = info_dict['urlhandle']
911 data = urllib2.urlopen(request)
912 break
913 except (urllib2.HTTPError, ), err:
914 if (err.code < 500 or err.code >= 600) and err.code != 416:
915 # Unexpected HTTP error
916 raise
917 elif err.code == 416:
918 # Unable to resume (requested range not satisfiable)
919 try:
920 # Open the connection again without the range header
921 data = urllib2.urlopen(basic_request)
922 content_length = data.info()['Content-Length']
923 except (urllib2.HTTPError, ), err:
924 if err.code < 500 or err.code >= 600:
925 raise
926 else:
927 # Examine the reported length
928 if (content_length is not None and
929 (resume_len - 100 < long(content_length) < resume_len + 100)):
930 # The file had already been fully downloaded.
931 # Explanation to the above condition: in issue #175 it was revealed that
932 # YouTube sometimes adds or removes a few bytes from the end of the file,
933 # changing the file size slightly and causing problems for some users. So
934 # I decided to implement a suggested change and consider the file
935 # completely downloaded if the file size differs less than 100 bytes from
936 # the one in the hard drive.
937 self.report_file_already_downloaded(filename)
938 self.try_rename(tmpfilename, filename)
939 return True
940 else:
941 # The length does not match, we start the download over
942 self.report_unable_to_resume()
943 open_mode = 'wb'
944 break
945 # Retry
946 count += 1
947 if count <= retries:
948 self.report_retry(count, retries)
949
950 if count > retries:
951 self.trouble(u'ERROR: giving up after %s retries' % retries)
952 return False
953
954 data_len = data.info().get('Content-length', None)
955 if data_len is not None:
956 data_len = long(data_len) + resume_len
957 data_len_str = self.format_bytes(data_len)
958 byte_counter = 0 + resume_len
959 block_size = 1024
960 start = time.time()
961 while True:
962 # Download and write
963 before = time.time()
964 data_block = data.read(block_size)
965 after = time.time()
966 if len(data_block) == 0:
967 break
968 byte_counter += len(data_block)
969
970 # Open file just in time
971 if stream is None:
972 try:
973 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
974 assert stream is not None
975 filename = self.undo_temp_name(tmpfilename)
976 self.report_destination(filename)
977 except (OSError, IOError), err:
978 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
979 return False
980 try:
981 stream.write(data_block)
982 except (IOError, OSError), err:
983 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
984 return False
985 block_size = self.best_block_size(after - before, len(data_block))
986
987 # Progress message
988 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
989 if data_len is None:
990 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
991 else:
992 percent_str = self.calc_percent(byte_counter, data_len)
993 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
994 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
995
996 # Apply rate limit
997 self.slow_down(start, byte_counter - resume_len)
998
999 if stream is None:
1000 self.trouble(u'\nERROR: Did not get any data blocks')
1001 return False
1002 stream.close()
1003 self.report_finish()
1004 if data_len is not None and byte_counter != data_len:
1005 raise ContentTooShortError(byte_counter, long(data_len))
1006 self.try_rename(tmpfilename, filename)
1007
1008 # Update file modification time
1009 if self.params.get('updatetime', True):
1010 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1011
1012 return True
1013
1014
class InfoExtractor(object):
    """Base class of every information extractor.

    An information extractor receives a URL and pulls out the information
    about the video (or videos) that URL refers to: the real video URL,
    the title and simplified title, the author and so on. The information
    is stored in a dictionary that is handed to the FileDownloader, which
    processes it, possibly downloading the video to the file system,
    among other possible outcomes. Each dictionary must carry these
    fields:

    id:         Video identifier.
    url:        Final video URL.
    uploader:   Nickname of the video uploader.
    title:      Literal title.
    stitle:     Simplified title.
    ext:        Video filename extension.
    format:     Video format.
    player_url: SWF Player URL (may be None).

    The fields below are optional. They mainly exist so that youtube-dl
    can act as the backend of a video search function, such as the one
    in youtube2mp3, and they are only used when their respective forced
    printing functions are called:

    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses should override _real_initialize() and _real_extract()
    and define a _VALID_URL regexp. They should probably be added to the
    list of extractors as well.
    """

    # Class-level defaults; __init__ sets both again on every instance.
    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Build the extractor, optionally attaching a downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Return True when url matches this IE's _VALID_URL, else False."""
        matched = re.match(self._VALID_URL, url)
        return matched is not None

    def initialize(self):
        """Run the real initialization (authentication, etc) only once."""
        if self._ready:
            return
        self._real_initialize()
        # Only flagged after _real_initialize() returned normally.
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then return what _real_extract() yields."""
        self.initialize()
        extracted = self._real_extract(url)
        return extracted

    def set_downloader(self, downloader):
        """Attach the downloader this IE reports to."""
        self._downloader = downloader

    def _real_initialize(self):
        """Hook for the actual initialization. Meant to be overridden."""

    def _real_extract(self, url):
        """Hook for the actual extraction. Meant to be overridden."""
1083
1084
1085 class YoutubeIE(InfoExtractor):
1086 """Information extractor for youtube.com."""
1087
1088 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1089 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1090 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1091 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1092 _NETRC_MACHINE = 'youtube'
1093 # Listed in order of quality
1094 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1095 _video_extensions = {
1096 '13': '3gp',
1097 '17': 'mp4',
1098 '18': 'mp4',
1099 '22': 'mp4',
1100 '37': 'mp4',
1101 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1102 '43': 'webm',
1103 '44': 'webm',
1104 '45': 'webm',
1105 }
1106 IE_NAME = u'youtube'
1107
1108 def report_lang(self):
1109 """Report attempt to set language."""
1110 self._downloader.to_screen(u'[youtube] Setting language')
1111
1112 def report_login(self):
1113 """Report attempt to log in."""
1114 self._downloader.to_screen(u'[youtube] Logging in')
1115
1116 def report_age_confirmation(self):
1117 """Report attempt to confirm age."""
1118 self._downloader.to_screen(u'[youtube] Confirming age')
1119
1120 def report_video_webpage_download(self, video_id):
1121 """Report attempt to download video webpage."""
1122 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1123
1124 def report_video_info_webpage_download(self, video_id):
1125 """Report attempt to download video info webpage."""
1126 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1127
1128 def report_information_extraction(self, video_id):
1129 """Report attempt to extract video information."""
1130 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1131
1132 def report_unavailable_format(self, video_id, format):
1133 """Report extracted video URL."""
1134 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1135
1136 def report_rtmp_download(self):
1137 """Indicate the download will use the RTMP protocol."""
1138 self._downloader.to_screen(u'[youtube] RTMP download detected')
1139
1140 def _real_initialize(self):
1141 if self._downloader is None:
1142 return
1143
1144 username = None
1145 password = None
1146 downloader_params = self._downloader.params
1147
1148 # Attempt to use provided username and password or .netrc data
1149 if downloader_params.get('username', None) is not None:
1150 username = downloader_params['username']
1151 password = downloader_params['password']
1152 elif downloader_params.get('usenetrc', False):
1153 try:
1154 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1155 if info is not None:
1156 username = info[0]
1157 password = info[2]
1158 else:
1159 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1160 except (IOError, netrc.NetrcParseError), err:
1161 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1162 return
1163
1164 # Set language
1165 request = urllib2.Request(self._LANG_URL)
1166 try:
1167 self.report_lang()
1168 urllib2.urlopen(request).read()
1169 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1170 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1171 return
1172
1173 # No authentication to be performed
1174 if username is None:
1175 return
1176
1177 # Log in
1178 login_form = {
1179 'current_form': 'loginForm',
1180 'next': '/',
1181 'action_login': 'Log In',
1182 'username': username,
1183 'password': password,
1184 }
1185 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1186 try:
1187 self.report_login()
1188 login_results = urllib2.urlopen(request).read()
1189 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1190 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1191 return
1192 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1193 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1194 return
1195
1196 # Confirm age
1197 age_form = {
1198 'next_url': '/',
1199 'action_confirm': 'Confirm',
1200 }
1201 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1202 try:
1203 self.report_age_confirmation()
1204 age_results = urllib2.urlopen(request).read()
1205 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1206 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1207 return
1208
1209 def _real_extract(self, url):
1210 # Extract video id from URL
1211 mobj = re.match(self._VALID_URL, url)
1212 if mobj is None:
1213 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1214 return
1215 video_id = mobj.group(2)
1216
1217 # Get video webpage
1218 self.report_video_webpage_download(video_id)
1219 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&amp;has_verified=1' % video_id)
1220 try:
1221 video_webpage = urllib2.urlopen(request).read()
1222 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1223 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1224 return
1225
1226 # Attempt to extract SWF player URL
1227 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1228 if mobj is not None:
1229 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1230 else:
1231 player_url = None
1232
1233 # Get video info
1234 self.report_video_info_webpage_download(video_id)
1235 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1236 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1237 % (video_id, el_type))
1238 request = urllib2.Request(video_info_url)
1239 try:
1240 video_info_webpage = urllib2.urlopen(request).read()
1241 video_info = parse_qs(video_info_webpage)
1242 if 'token' in video_info:
1243 break
1244 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1245 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1246 return
1247 if 'token' not in video_info:
1248 if 'reason' in video_info:
1249 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1250 else:
1251 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1252 return
1253
1254 # Start extracting information
1255 self.report_information_extraction(video_id)
1256
1257 # uploader
1258 if 'author' not in video_info:
1259 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1260 return
1261 video_uploader = urllib.unquote_plus(video_info['author'][0])
1262
1263 # title
1264 if 'title' not in video_info:
1265 self._downloader.trouble(u'ERROR: unable to extract video title')
1266 return
1267 video_title = urllib.unquote_plus(video_info['title'][0])
1268 video_title = video_title.decode('utf-8')
1269 video_title = sanitize_title(video_title)
1270
1271 # simplified title
1272 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1273 simple_title = simple_title.strip(ur'_')
1274
1275 # thumbnail image
1276 if 'thumbnail_url' not in video_info:
1277 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1278 video_thumbnail = ''
1279 else: # don't panic if we can't find it
1280 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1281
1282 # upload date
1283 upload_date = u'NA'
1284 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1285 if mobj is not None:
1286 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1287 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1288 for expression in format_expressions:
1289 try:
1290 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1291 except:
1292 pass
1293
1294 # description
1295 try:
1296 lxml.etree
1297 except NameError:
1298 video_description = u'No description available.'
1299 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1300 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1301 if mobj is not None:
1302 video_description = mobj.group(1).decode('utf-8')
1303 else:
1304 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1305 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1306 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1307 # TODO use another parser
1308
1309 # token
1310 video_token = urllib.unquote_plus(video_info['token'][0])
1311
1312 # Decide which formats to download
1313 req_format = self._downloader.params.get('format', None)
1314
1315 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1316 self.report_rtmp_download()
1317 video_url_list = [(None, video_info['conn'][0])]
1318 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1319 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1320 url_data = [parse_qs(uds) for uds in url_data_strs]
1321 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1322 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1323
1324 format_limit = self._downloader.params.get('format_limit', None)
1325 if format_limit is not None and format_limit in self._available_formats:
1326 format_list = self._available_formats[self._available_formats.index(format_limit):]
1327 else:
1328 format_list = self._available_formats
1329 existing_formats = [x for x in format_list if x in url_map]
1330 if len(existing_formats) == 0:
1331 self._downloader.trouble(u'ERROR: no known formats available for video')
1332 return
1333 if req_format is None or req_format == 'best':
1334 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1335 elif req_format == 'worst':
1336 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1337 elif req_format in ('-1', 'all'):
1338 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1339 else:
1340 # Specific formats. We pick the first in a slash-delimeted sequence.
1341 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1342 req_formats = req_format.split('/')
1343 video_url_list = None
1344 for rf in req_formats:
1345 if rf in url_map:
1346 video_url_list = [(rf, url_map[rf])]
1347 break
1348 if video_url_list is None:
1349 self._downloader.trouble(u'ERROR: requested format not available')
1350 return
1351 else:
1352 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1353 return
1354
1355 for format_param, video_real_url in video_url_list:
1356 # At this point we have a new video
1357 self._downloader.increment_downloads()
1358
1359 # Extension
1360 video_extension = self._video_extensions.get(format_param, 'flv')
1361
1362 try:
1363 # Process video information
1364 self._downloader.process_info({
1365 'id': video_id.decode('utf-8'),
1366 'url': video_real_url.decode('utf-8'),
1367 'uploader': video_uploader.decode('utf-8'),
1368 'upload_date': upload_date,
1369 'title': video_title,
1370 'stitle': simple_title,
1371 'ext': video_extension.decode('utf-8'),
1372 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1373 'thumbnail': video_thumbnail.decode('utf-8'),
1374 'description': video_description,
1375 'player_url': player_url,
1376 })
1377 except UnavailableVideoError, err:
1378 self._downloader.trouble(u'\nERROR: unable to download video')
1379
1380
1381 class MetacafeIE(InfoExtractor):
1382 """Information Extractor for metacafe.com."""
1383
1384 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1385 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1386 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1387 _youtube_ie = None
1388 IE_NAME = u'metacafe'
1389
1390 def __init__(self, youtube_ie, downloader=None):
1391 InfoExtractor.__init__(self, downloader)
1392 self._youtube_ie = youtube_ie
1393
1394 def report_disclaimer(self):
1395 """Report disclaimer retrieval."""
1396 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1397
1398 def report_age_confirmation(self):
1399 """Report attempt to confirm age."""
1400 self._downloader.to_screen(u'[metacafe] Confirming age')
1401
1402 def report_download_webpage(self, video_id):
1403 """Report webpage download."""
1404 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1405
1406 def report_extraction(self, video_id):
1407 """Report information extraction."""
1408 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1409
1410 def _real_initialize(self):
1411 # Retrieve disclaimer
1412 request = urllib2.Request(self._DISCLAIMER)
1413 try:
1414 self.report_disclaimer()
1415 disclaimer = urllib2.urlopen(request).read()
1416 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1417 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1418 return
1419
1420 # Confirm age
1421 disclaimer_form = {
1422 'filters': '0',
1423 'submit': "Continue - I'm over 18",
1424 }
1425 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1426 try:
1427 self.report_age_confirmation()
1428 disclaimer = urllib2.urlopen(request).read()
1429 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1430 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1431 return
1432
1433 def _real_extract(self, url):
1434 # Extract id and simplified title from URL
1435 mobj = re.match(self._VALID_URL, url)
1436 if mobj is None:
1437 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1438 return
1439
1440 video_id = mobj.group(1)
1441
1442 # Check if video comes from YouTube
1443 mobj2 = re.match(r'^yt-(.*)$', video_id)
1444 if mobj2 is not None:
1445 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1446 return
1447
1448 # At this point we have a new video
1449 self._downloader.increment_downloads()
1450
1451 simple_title = mobj.group(2).decode('utf-8')
1452
1453 # Retrieve video webpage to extract further information
1454 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1455 try:
1456 self.report_download_webpage(video_id)
1457 webpage = urllib2.urlopen(request).read()
1458 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1459 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1460 return
1461
1462 # Extract URL, uploader and title from webpage
1463 self.report_extraction(video_id)
1464 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1465 if mobj is not None:
1466 mediaURL = urllib.unquote(mobj.group(1))
1467 video_extension = mediaURL[-3:]
1468
1469 # Extract gdaKey if available
1470 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1471 if mobj is None:
1472 video_url = mediaURL
1473 else:
1474 gdaKey = mobj.group(1)
1475 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1476 else:
1477 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1478 if mobj is None:
1479 self._downloader.trouble(u'ERROR: unable to extract media URL')
1480 return
1481 vardict = parse_qs(mobj.group(1))
1482 if 'mediaData' not in vardict:
1483 self._downloader.trouble(u'ERROR: unable to extract media URL')
1484 return
1485 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1486 if mobj is None:
1487 self._downloader.trouble(u'ERROR: unable to extract media URL')
1488 return
1489 mediaURL = mobj.group(1).replace('\\/', '/')
1490 video_extension = mediaURL[-3:]
1491 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1492
1493 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1494 if mobj is None:
1495 self._downloader.trouble(u'ERROR: unable to extract title')
1496 return
1497 video_title = mobj.group(1).decode('utf-8')
1498 video_title = sanitize_title(video_title)
1499
1500 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1501 if mobj is None:
1502 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1503 return
1504 video_uploader = mobj.group(1)
1505
1506 try:
1507 # Process video information
1508 self._downloader.process_info({
1509 'id': video_id.decode('utf-8'),
1510 'url': video_url.decode('utf-8'),
1511 'uploader': video_uploader.decode('utf-8'),
1512 'upload_date': u'NA',
1513 'title': video_title,
1514 'stitle': simple_title,
1515 'ext': video_extension.decode('utf-8'),
1516 'format': u'NA',
1517 'player_url': None,
1518 })
1519 except UnavailableVideoError:
1520 self._downloader.trouble(u'\nERROR: unable to download video')
1521
1522
1523 class DailymotionIE(InfoExtractor):
1524 """Information Extractor for Dailymotion"""
1525
1526 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1527 IE_NAME = u'dailymotion'
1528
1529 def __init__(self, downloader=None):
1530 InfoExtractor.__init__(self, downloader)
1531
1532 def report_download_webpage(self, video_id):
1533 """Report webpage download."""
1534 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1535
1536 def report_extraction(self, video_id):
1537 """Report information extraction."""
1538 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1539
1540 def _real_initialize(self):
1541 return
1542
1543 def _real_extract(self, url):
1544 # Extract id and simplified title from URL
1545 mobj = re.match(self._VALID_URL, url)
1546 if mobj is None:
1547 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1548 return
1549
1550 # At this point we have a new video
1551 self._downloader.increment_downloads()
1552 video_id = mobj.group(1)
1553
1554 simple_title = mobj.group(2).decode('utf-8')
1555 video_extension = 'flv'
1556
1557 # Retrieve video webpage to extract further information
1558 request = urllib2.Request(url)
1559 request.add_header('Cookie', 'family_filter=off')
1560 try:
1561 self.report_download_webpage(video_id)
1562 webpage = urllib2.urlopen(request).read()
1563 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1564 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1565 return
1566
1567 # Extract URL, uploader and title from webpage
1568 self.report_extraction(video_id)
1569 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1570 if mobj is None:
1571 self._downloader.trouble(u'ERROR: unable to extract media URL')
1572 return
1573 sequence = urllib.unquote(mobj.group(1))
1574 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1575 if mobj is None:
1576 self._downloader.trouble(u'ERROR: unable to extract media URL')
1577 return
1578 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1579
1580 # if needed add http://www.dailymotion.com/ if relative URL
1581
1582 video_url = mediaURL
1583
1584 mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
1585 if mobj is None:
1586 self._downloader.trouble(u'ERROR: unable to extract title')
1587 return
1588 video_title = mobj.group(1).decode('utf-8')
1589 video_title = sanitize_title(video_title)
1590
1591 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1592 if mobj is None:
1593 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1594 return
1595 video_uploader = mobj.group(1)
1596
1597 try:
1598 # Process video information
1599 self._downloader.process_info({
1600 'id': video_id.decode('utf-8'),
1601 'url': video_url.decode('utf-8'),
1602 'uploader': video_uploader.decode('utf-8'),
1603 'upload_date': u'NA',
1604 'title': video_title,
1605 'stitle': simple_title,
1606 'ext': video_extension.decode('utf-8'),
1607 'format': u'NA',
1608 'player_url': None,
1609 })
1610 except UnavailableVideoError:
1611 self._downloader.trouble(u'\nERROR: unable to download video')
1612
1613
1614 class GoogleIE(InfoExtractor):
1615 """Information extractor for video.google.com."""
1616
1617 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1618 IE_NAME = u'video.google'
1619
1620 def __init__(self, downloader=None):
1621 InfoExtractor.__init__(self, downloader)
1622
1623 def report_download_webpage(self, video_id):
1624 """Report webpage download."""
1625 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1626
1627 def report_extraction(self, video_id):
1628 """Report information extraction."""
1629 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1630
1631 def _real_initialize(self):
1632 return
1633
1634 def _real_extract(self, url):
1635 # Extract id from URL
1636 mobj = re.match(self._VALID_URL, url)
1637 if mobj is None:
1638 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1639 return
1640
1641 # At this point we have a new video
1642 self._downloader.increment_downloads()
1643 video_id = mobj.group(1)
1644
1645 video_extension = 'mp4'
1646
1647 # Retrieve video webpage to extract further information
1648 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1649 try:
1650 self.report_download_webpage(video_id)
1651 webpage = urllib2.urlopen(request).read()
1652 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1653 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1654 return
1655
1656 # Extract URL, uploader, and title from webpage
1657 self.report_extraction(video_id)
1658 mobj = re.search(r"download_url:'([^']+)'", webpage)
1659 if mobj is None:
1660 video_extension = 'flv'
1661 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1662 if mobj is None:
1663 self._downloader.trouble(u'ERROR: unable to extract media URL')
1664 return
1665 mediaURL = urllib.unquote(mobj.group(1))
1666 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1667 mediaURL = mediaURL.replace('\\x26', '\x26')
1668
1669 video_url = mediaURL
1670
1671 mobj = re.search(r'<title>(.*)</title>', webpage)
1672 if mobj is None:
1673 self._downloader.trouble(u'ERROR: unable to extract title')
1674 return
1675 video_title = mobj.group(1).decode('utf-8')
1676 video_title = sanitize_title(video_title)
1677 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1678
1679 # Extract video description
1680 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1681 if mobj is None:
1682 self._downloader.trouble(u'ERROR: unable to extract video description')
1683 return
1684 video_description = mobj.group(1).decode('utf-8')
1685 if not video_description:
1686 video_description = 'No description available.'
1687
1688 # Extract video thumbnail
1689 if self._downloader.params.get('forcethumbnail', False):
1690 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1691 try:
1692 webpage = urllib2.urlopen(request).read()
1693 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1694 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1695 return
1696 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1697 if mobj is None:
1698 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1699 return
1700 video_thumbnail = mobj.group(1)
1701 else: # we need something to pass to process_info
1702 video_thumbnail = ''
1703
1704 try:
1705 # Process video information
1706 self._downloader.process_info({
1707 'id': video_id.decode('utf-8'),
1708 'url': video_url.decode('utf-8'),
1709 'uploader': u'NA',
1710 'upload_date': u'NA',
1711 'title': video_title,
1712 'stitle': simple_title,
1713 'ext': video_extension.decode('utf-8'),
1714 'format': u'NA',
1715 'player_url': None,
1716 })
1717 except UnavailableVideoError:
1718 self._downloader.trouble(u'\nERROR: unable to download video')
1719
1720
1721 class PhotobucketIE(InfoExtractor):
1722 """Information extractor for photobucket.com."""
1723
1724 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1725 IE_NAME = u'photobucket'
1726
1727 def __init__(self, downloader=None):
1728 InfoExtractor.__init__(self, downloader)
1729
1730 def report_download_webpage(self, video_id):
1731 """Report webpage download."""
1732 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1733
1734 def report_extraction(self, video_id):
1735 """Report information extraction."""
1736 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1737
1738 def _real_initialize(self):
1739 return
1740
1741 def _real_extract(self, url):
1742 # Extract id from URL
1743 mobj = re.match(self._VALID_URL, url)
1744 if mobj is None:
1745 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1746 return
1747
1748 # At this point we have a new video
1749 self._downloader.increment_downloads()
1750 video_id = mobj.group(1)
1751
1752 video_extension = 'flv'
1753
1754 # Retrieve video webpage to extract further information
1755 request = urllib2.Request(url)
1756 try:
1757 self.report_download_webpage(video_id)
1758 webpage = urllib2.urlopen(request).read()
1759 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1760 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1761 return
1762
1763 # Extract URL, uploader, and title from webpage
1764 self.report_extraction(video_id)
1765 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1766 if mobj is None:
1767 self._downloader.trouble(u'ERROR: unable to extract media URL')
1768 return
1769 mediaURL = urllib.unquote(mobj.group(1))
1770
1771 video_url = mediaURL
1772
1773 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1774 if mobj is None:
1775 self._downloader.trouble(u'ERROR: unable to extract title')
1776 return
1777 video_title = mobj.group(1).decode('utf-8')
1778 video_title = sanitize_title(video_title)
1779 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1780
1781 video_uploader = mobj.group(2).decode('utf-8')
1782
1783 try:
1784 # Process video information
1785 self._downloader.process_info({
1786 'id': video_id.decode('utf-8'),
1787 'url': video_url.decode('utf-8'),
1788 'uploader': video_uploader,
1789 'upload_date': u'NA',
1790 'title': video_title,
1791 'stitle': simple_title,
1792 'ext': video_extension.decode('utf-8'),
1793 'format': u'NA',
1794 'player_url': None,
1795 })
1796 except UnavailableVideoError:
1797 self._downloader.trouble(u'\nERROR: unable to download video')
1798
1799
1800 class YahooIE(InfoExtractor):
1801 """Information extractor for video.yahoo.com."""
1802
1803 # _VALID_URL matches all Yahoo! Video URLs
1804 # _VPAGE_URL matches only the extractable '/watch/' URLs
1805 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1806 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1807 IE_NAME = u'video.yahoo'
1808
1809 def __init__(self, downloader=None):
1810 InfoExtractor.__init__(self, downloader)
1811
1812 def report_download_webpage(self, video_id):
1813 """Report webpage download."""
1814 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1815
1816 def report_extraction(self, video_id):
1817 """Report information extraction."""
1818 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1819
1820 def _real_initialize(self):
1821 return
1822
1823 def _real_extract(self, url, new_video=True):
1824 # Extract ID from URL
1825 mobj = re.match(self._VALID_URL, url)
1826 if mobj is None:
1827 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1828 return
1829
1830 # At this point we have a new video
1831 self._downloader.increment_downloads()
1832 video_id = mobj.group(2)
1833 video_extension = 'flv'
1834
1835 # Rewrite valid but non-extractable URLs as
1836 # extractable English language /watch/ URLs
1837 if re.match(self._VPAGE_URL, url) is None:
1838 request = urllib2.Request(url)
1839 try:
1840 webpage = urllib2.urlopen(request).read()
1841 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1842 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1843 return
1844
1845 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1846 if mobj is None:
1847 self._downloader.trouble(u'ERROR: Unable to extract id field')
1848 return
1849 yahoo_id = mobj.group(1)
1850
1851 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1852 if mobj is None:
1853 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1854 return
1855 yahoo_vid = mobj.group(1)
1856
1857 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1858 return self._real_extract(url, new_video=False)
1859
1860 # Retrieve video webpage to extract further information
1861 request = urllib2.Request(url)
1862 try:
1863 self.report_download_webpage(video_id)
1864 webpage = urllib2.urlopen(request).read()
1865 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1866 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1867 return
1868
1869 # Extract uploader and title from webpage
1870 self.report_extraction(video_id)
1871 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1872 if mobj is None:
1873 self._downloader.trouble(u'ERROR: unable to extract video title')
1874 return
1875 video_title = mobj.group(1).decode('utf-8')
1876 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1877
1878 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1879 if mobj is None:
1880 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1881 return
1882 video_uploader = mobj.group(1).decode('utf-8')
1883
1884 # Extract video thumbnail
1885 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1886 if mobj is None:
1887 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1888 return
1889 video_thumbnail = mobj.group(1).decode('utf-8')
1890
1891 # Extract video description
1892 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1893 if mobj is None:
1894 self._downloader.trouble(u'ERROR: unable to extract video description')
1895 return
1896 video_description = mobj.group(1).decode('utf-8')
1897 if not video_description:
1898 video_description = 'No description available.'
1899
1900 # Extract video height and width
1901 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1902 if mobj is None:
1903 self._downloader.trouble(u'ERROR: unable to extract video height')
1904 return
1905 yv_video_height = mobj.group(1)
1906
1907 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1908 if mobj is None:
1909 self._downloader.trouble(u'ERROR: unable to extract video width')
1910 return
1911 yv_video_width = mobj.group(1)
1912
1913 # Retrieve video playlist to extract media URL
1914 # I'm not completely sure what all these options are, but we
1915 # seem to need most of them, otherwise the server sends a 401.
1916 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1917 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1918 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1919 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1920 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1921 try:
1922 self.report_download_webpage(video_id)
1923 webpage = urllib2.urlopen(request).read()
1924 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1925 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1926 return
1927
1928 # Extract media URL from playlist XML
1929 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1930 if mobj is None:
1931 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1932 return
1933 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1934 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1935
1936 try:
1937 # Process video information
1938 self._downloader.process_info({
1939 'id': video_id.decode('utf-8'),
1940 'url': video_url,
1941 'uploader': video_uploader,
1942 'upload_date': u'NA',
1943 'title': video_title,
1944 'stitle': simple_title,
1945 'ext': video_extension.decode('utf-8'),
1946 'thumbnail': video_thumbnail.decode('utf-8'),
1947 'description': video_description,
1948 'thumbnail': video_thumbnail,
1949 'player_url': None,
1950 })
1951 except UnavailableVideoError:
1952 self._downloader.trouble(u'\nERROR: unable to download video')
1953
1954
1955 class VimeoIE(InfoExtractor):
1956 """Information extractor for vimeo.com."""
1957
1958 # _VALID_URL matches Vimeo URLs
1959 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1960 IE_NAME = u'vimeo'
1961
1962 def __init__(self, downloader=None):
1963 InfoExtractor.__init__(self, downloader)
1964
1965 def report_download_webpage(self, video_id):
1966 """Report webpage download."""
1967 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1968
1969 def report_extraction(self, video_id):
1970 """Report information extraction."""
1971 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1972
1973 def _real_initialize(self):
1974 return
1975
1976 def _real_extract(self, url, new_video=True):
1977 # Extract ID from URL
1978 mobj = re.match(self._VALID_URL, url)
1979 if mobj is None:
1980 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1981 return
1982
1983 # At this point we have a new video
1984 self._downloader.increment_downloads()
1985 video_id = mobj.group(1)
1986
1987 # Retrieve video webpage to extract further information
1988 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
1989 try:
1990 self.report_download_webpage(video_id)
1991 webpage = urllib2.urlopen(request).read()
1992 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1993 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1994 return
1995
1996 # Now we begin extracting as much information as we can from what we
1997 # retrieved. First we extract the information common to all extractors,
1998 # and latter we extract those that are Vimeo specific.
1999 self.report_extraction(video_id)
2000
2001 # Extract title
2002 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2003 if mobj is None:
2004 self._downloader.trouble(u'ERROR: unable to extract video title')
2005 return
2006 video_title = mobj.group(1).decode('utf-8')
2007 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2008
2009 # Extract uploader
2010 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2011 if mobj is None:
2012 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2013 return
2014 video_uploader = mobj.group(1).decode('utf-8')
2015
2016 # Extract video thumbnail
2017 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2018 if mobj is None:
2019 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2020 return
2021 video_thumbnail = mobj.group(1).decode('utf-8')
2022
2023 # # Extract video description
2024 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2025 # if mobj is None:
2026 # self._downloader.trouble(u'ERROR: unable to extract video description')
2027 # return
2028 # video_description = mobj.group(1).decode('utf-8')
2029 # if not video_description: video_description = 'No description available.'
2030 video_description = 'Foo.'
2031
2032 # Vimeo specific: extract request signature
2033 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2034 if mobj is None:
2035 self._downloader.trouble(u'ERROR: unable to extract request signature')
2036 return
2037 sig = mobj.group(1).decode('utf-8')
2038
2039 # Vimeo specific: Extract request signature expiration
2040 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2041 if mobj is None:
2042 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2043 return
2044 sig_exp = mobj.group(1).decode('utf-8')
2045
2046 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id, sig, sig_exp)
2047
2048 try:
2049 # Process video information
2050 self._downloader.process_info({
2051 'id': video_id.decode('utf-8'),
2052 'url': video_url,
2053 'uploader': video_uploader,
2054 'upload_date': u'NA',
2055 'title': video_title,
2056 'stitle': simple_title,
2057 'ext': u'mp4',
2058 'thumbnail': video_thumbnail.decode('utf-8'),
2059 'description': video_description,
2060 'thumbnail': video_thumbnail,
2061 'description': video_description,
2062 'player_url': None,
2063 })
2064 except UnavailableVideoError:
2065 self._downloader.trouble(u'ERROR: unable to download video')
2066
2067
2068 class GenericIE(InfoExtractor):
2069 """Generic last-resort information extractor."""
2070
2071 _VALID_URL = r'.*'
2072 IE_NAME = u'generic'
2073
2074 def __init__(self, downloader=None):
2075 InfoExtractor.__init__(self, downloader)
2076
2077 def report_download_webpage(self, video_id):
2078 """Report webpage download."""
2079 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2080 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2081
2082 def report_extraction(self, video_id):
2083 """Report information extraction."""
2084 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2085
2086 def _real_initialize(self):
2087 return
2088
2089 def _real_extract(self, url):
2090 # At this point we have a new video
2091 self._downloader.increment_downloads()
2092
2093 video_id = url.split('/')[-1]
2094 request = urllib2.Request(url)
2095 try:
2096 self.report_download_webpage(video_id)
2097 webpage = urllib2.urlopen(request).read()
2098 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2099 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2100 return
2101 except ValueError, err:
2102 # since this is the last-resort InfoExtractor, if
2103 # this error is thrown, it'll be thrown here
2104 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2105 return
2106
2107 self.report_extraction(video_id)
2108 # Start with something easy: JW Player in SWFObject
2109 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2110 if mobj is None:
2111 # Broaden the search a little bit
2112 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2113 if mobj is None:
2114 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2115 return
2116
2117 # It's possible that one of the regexes
2118 # matched, but returned an empty group:
2119 if mobj.group(1) is None:
2120 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2121 return
2122
2123 video_url = urllib.unquote(mobj.group(1))
2124 video_id = os.path.basename(video_url)
2125
2126 # here's a fun little line of code for you:
2127 video_extension = os.path.splitext(video_id)[1][1:]
2128 video_id = os.path.splitext(video_id)[0]
2129
2130 # it's tempting to parse this further, but you would
2131 # have to take into account all the variations like
2132 # Video Title - Site Name
2133 # Site Name | Video Title
2134 # Video Title - Tagline | Site Name
2135 # and so on and so forth; it's just not practical
2136 mobj = re.search(r'<title>(.*)</title>', webpage)
2137 if mobj is None:
2138 self._downloader.trouble(u'ERROR: unable to extract title')
2139 return
2140 video_title = mobj.group(1).decode('utf-8')
2141 video_title = sanitize_title(video_title)
2142 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2143
2144 # video uploader is domain name
2145 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2146 if mobj is None:
2147 self._downloader.trouble(u'ERROR: unable to extract title')
2148 return
2149 video_uploader = mobj.group(1).decode('utf-8')
2150
2151 try:
2152 # Process video information
2153 self._downloader.process_info({
2154 'id': video_id.decode('utf-8'),
2155 'url': video_url.decode('utf-8'),
2156 'uploader': video_uploader,
2157 'upload_date': u'NA',
2158 'title': video_title,
2159 'stitle': simple_title,
2160 'ext': video_extension.decode('utf-8'),
2161 'format': u'NA',
2162 'player_url': None,
2163 })
2164 except UnavailableVideoError, err:
2165 self._downloader.trouble(u'\nERROR: unable to download video')
2166
2167
2168 class YoutubeSearchIE(InfoExtractor):
2169 """Information Extractor for YouTube search queries."""
2170 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2171 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2172 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2173 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2174 _youtube_ie = None
2175 _max_youtube_results = 1000
2176 IE_NAME = u'youtube:search'
2177
2178 def __init__(self, youtube_ie, downloader=None):
2179 InfoExtractor.__init__(self, downloader)
2180 self._youtube_ie = youtube_ie
2181
2182 def report_download_page(self, query, pagenum):
2183 """Report attempt to download playlist page with given number."""
2184 query = query.decode(preferredencoding())
2185 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2186
2187 def _real_initialize(self):
2188 self._youtube_ie.initialize()
2189
2190 def _real_extract(self, query):
2191 mobj = re.match(self._VALID_URL, query)
2192 if mobj is None:
2193 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2194 return
2195
2196 prefix, query = query.split(':')
2197 prefix = prefix[8:]
2198 query = query.encode('utf-8')
2199 if prefix == '':
2200 self._download_n_results(query, 1)
2201 return
2202 elif prefix == 'all':
2203 self._download_n_results(query, self._max_youtube_results)
2204 return
2205 else:
2206 try:
2207 n = long(prefix)
2208 if n <= 0:
2209 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2210 return
2211 elif n > self._max_youtube_results:
2212 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2213 n = self._max_youtube_results
2214 self._download_n_results(query, n)
2215 return
2216 except ValueError: # parsing prefix as integer fails
2217 self._download_n_results(query, 1)
2218 return
2219
2220 def _download_n_results(self, query, n):
2221 """Downloads a specified number of results for a query"""
2222
2223 video_ids = []
2224 already_seen = set()
2225 pagenum = 1
2226
2227 while True:
2228 self.report_download_page(query, pagenum)
2229 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2230 request = urllib2.Request(result_url)
2231 try:
2232 page = urllib2.urlopen(request).read()
2233 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2234 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2235 return
2236
2237 # Extract video identifiers
2238 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2239 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2240 if video_id not in already_seen:
2241 video_ids.append(video_id)
2242 already_seen.add(video_id)
2243 if len(video_ids) == n:
2244 # Specified n videos reached
2245 for id in video_ids:
2246 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2247 return
2248
2249 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2250 for id in video_ids:
2251 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2252 return
2253
2254 pagenum = pagenum + 1
2255
2256
2257 class GoogleSearchIE(InfoExtractor):
2258 """Information Extractor for Google Video search queries."""
2259 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2260 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2261 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2262 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2263 _google_ie = None
2264 _max_google_results = 1000
2265 IE_NAME = u'video.google:search'
2266
2267 def __init__(self, google_ie, downloader=None):
2268 InfoExtractor.__init__(self, downloader)
2269 self._google_ie = google_ie
2270
2271 def report_download_page(self, query, pagenum):
2272 """Report attempt to download playlist page with given number."""
2273 query = query.decode(preferredencoding())
2274 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2275
2276 def _real_initialize(self):
2277 self._google_ie.initialize()
2278
2279 def _real_extract(self, query):
2280 mobj = re.match(self._VALID_URL, query)
2281 if mobj is None:
2282 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2283 return
2284
2285 prefix, query = query.split(':')
2286 prefix = prefix[8:]
2287 query = query.encode('utf-8')
2288 if prefix == '':
2289 self._download_n_results(query, 1)
2290 return
2291 elif prefix == 'all':
2292 self._download_n_results(query, self._max_google_results)
2293 return
2294 else:
2295 try:
2296 n = long(prefix)
2297 if n <= 0:
2298 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2299 return
2300 elif n > self._max_google_results:
2301 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2302 n = self._max_google_results
2303 self._download_n_results(query, n)
2304 return
2305 except ValueError: # parsing prefix as integer fails
2306 self._download_n_results(query, 1)
2307 return
2308
2309 def _download_n_results(self, query, n):
2310 """Downloads a specified number of results for a query"""
2311
2312 video_ids = []
2313 already_seen = set()
2314 pagenum = 1
2315
2316 while True:
2317 self.report_download_page(query, pagenum)
2318 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2319 request = urllib2.Request(result_url)
2320 try:
2321 page = urllib2.urlopen(request).read()
2322 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2323 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2324 return
2325
2326 # Extract video identifiers
2327 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2328 video_id = mobj.group(1)
2329 if video_id not in already_seen:
2330 video_ids.append(video_id)
2331 already_seen.add(video_id)
2332 if len(video_ids) == n:
2333 # Specified n videos reached
2334 for id in video_ids:
2335 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2336 return
2337
2338 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2339 for id in video_ids:
2340 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2341 return
2342
2343 pagenum = pagenum + 1
2344
2345
2346 class YahooSearchIE(InfoExtractor):
2347 """Information Extractor for Yahoo! Video search queries."""
2348 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2349 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2350 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2351 _MORE_PAGES_INDICATOR = r'\s*Next'
2352 _yahoo_ie = None
2353 _max_yahoo_results = 1000
2354 IE_NAME = u'video.yahoo:search'
2355
2356 def __init__(self, yahoo_ie, downloader=None):
2357 InfoExtractor.__init__(self, downloader)
2358 self._yahoo_ie = yahoo_ie
2359
2360 def report_download_page(self, query, pagenum):
2361 """Report attempt to download playlist page with given number."""
2362 query = query.decode(preferredencoding())
2363 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2364
2365 def _real_initialize(self):
2366 self._yahoo_ie.initialize()
2367
2368 def _real_extract(self, query):
2369 mobj = re.match(self._VALID_URL, query)
2370 if mobj is None:
2371 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2372 return
2373
2374 prefix, query = query.split(':')
2375 prefix = prefix[8:]
2376 query = query.encode('utf-8')
2377 if prefix == '':
2378 self._download_n_results(query, 1)
2379 return
2380 elif prefix == 'all':
2381 self._download_n_results(query, self._max_yahoo_results)
2382 return
2383 else:
2384 try:
2385 n = long(prefix)
2386 if n <= 0:
2387 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2388 return
2389 elif n > self._max_yahoo_results:
2390 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2391 n = self._max_yahoo_results
2392 self._download_n_results(query, n)
2393 return
2394 except ValueError: # parsing prefix as integer fails
2395 self._download_n_results(query, 1)
2396 return
2397
2398 def _download_n_results(self, query, n):
2399 """Downloads a specified number of results for a query"""
2400
2401 video_ids = []
2402 already_seen = set()
2403 pagenum = 1
2404
2405 while True:
2406 self.report_download_page(query, pagenum)
2407 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2408 request = urllib2.Request(result_url)
2409 try:
2410 page = urllib2.urlopen(request).read()
2411 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2412 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2413 return
2414
2415 # Extract video identifiers
2416 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2417 video_id = mobj.group(1)
2418 if video_id not in already_seen:
2419 video_ids.append(video_id)
2420 already_seen.add(video_id)
2421 if len(video_ids) == n:
2422 # Specified n videos reached
2423 for id in video_ids:
2424 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2425 return
2426
2427 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2428 for id in video_ids:
2429 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2430 return
2431
2432 pagenum = pagenum + 1
2433
2434
2435 class YoutubePlaylistIE(InfoExtractor):
2436 """Information Extractor for YouTube playlists."""
2437
2438 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2439 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2440 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2441 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2442 _youtube_ie = None
2443 IE_NAME = u'youtube:playlist'
2444
2445 def __init__(self, youtube_ie, downloader=None):
2446 InfoExtractor.__init__(self, downloader)
2447 self._youtube_ie = youtube_ie
2448
2449 def report_download_page(self, playlist_id, pagenum):
2450 """Report attempt to download playlist page with given number."""
2451 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2452
2453 def _real_initialize(self):
2454 self._youtube_ie.initialize()
2455
2456 def _real_extract(self, url):
2457 # Extract playlist id
2458 mobj = re.match(self._VALID_URL, url)
2459 if mobj is None:
2460 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2461 return
2462
2463 # Single video case
2464 if mobj.group(3) is not None:
2465 self._youtube_ie.extract(mobj.group(3))
2466 return
2467
2468 # Download playlist pages
2469 # prefix is 'p' as default for playlists but there are other types that need extra care
2470 playlist_prefix = mobj.group(1)
2471 if playlist_prefix == 'a':
2472 playlist_access = 'artist'
2473 else:
2474 playlist_prefix = 'p'
2475 playlist_access = 'view_play_list'
2476 playlist_id = mobj.group(2)
2477 video_ids = []
2478 pagenum = 1
2479
2480 while True:
2481 self.report_download_page(playlist_id, pagenum)
2482 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2483 try:
2484 page = urllib2.urlopen(request).read()
2485 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2486 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2487 return
2488
2489 # Extract video identifiers
2490 ids_in_page = []
2491 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2492 if mobj.group(1) not in ids_in_page:
2493 ids_in_page.append(mobj.group(1))
2494 video_ids.extend(ids_in_page)
2495
2496 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2497 break
2498 pagenum = pagenum + 1
2499
2500 playliststart = self._downloader.params.get('playliststart', 1) - 1
2501 playlistend = self._downloader.params.get('playlistend', -1)
2502 video_ids = video_ids[playliststart:playlistend]
2503
2504 for id in video_ids:
2505 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2506 return
2507
2508
2509 class YoutubeUserIE(InfoExtractor):
2510 """Information Extractor for YouTube users."""
2511
2512 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2513 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2514 _GDATA_PAGE_SIZE = 50
2515 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2516 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2517 _youtube_ie = None
2518 IE_NAME = u'youtube:user'
2519
2520 def __init__(self, youtube_ie, downloader=None):
2521 InfoExtractor.__init__(self, downloader)
2522 self._youtube_ie = youtube_ie
2523
2524 def report_download_page(self, username, start_index):
2525 """Report attempt to download user page."""
2526 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2527 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2528
2529 def _real_initialize(self):
2530 self._youtube_ie.initialize()
2531
2532 def _real_extract(self, url):
2533 # Extract username
2534 mobj = re.match(self._VALID_URL, url)
2535 if mobj is None:
2536 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2537 return
2538
2539 username = mobj.group(1)
2540
2541 # Download video ids using YouTube Data API. Result size per
2542 # query is limited (currently to 50 videos) so we need to query
2543 # page by page until there are no video ids - it means we got
2544 # all of them.
2545
2546 video_ids = []
2547 pagenum = 0
2548
2549 while True:
2550 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2551 self.report_download_page(username, start_index)
2552
2553 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2554
2555 try:
2556 page = urllib2.urlopen(request).read()
2557 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2558 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2559 return
2560
2561 # Extract video identifiers
2562 ids_in_page = []
2563
2564 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2565 if mobj.group(1) not in ids_in_page:
2566 ids_in_page.append(mobj.group(1))
2567
2568 video_ids.extend(ids_in_page)
2569
2570 # A little optimization - if current page is not
2571 # "full", ie. does not contain PAGE_SIZE video ids then
2572 # we can assume that this page is the last one - there
2573 # are no more ids on further pages - no need to query
2574 # again.
2575
2576 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2577 break
2578
2579 pagenum += 1
2580
2581 all_ids_count = len(video_ids)
2582 playliststart = self._downloader.params.get('playliststart', 1) - 1
2583 playlistend = self._downloader.params.get('playlistend', -1)
2584
2585 if playlistend == -1:
2586 video_ids = video_ids[playliststart:]
2587 else:
2588 video_ids = video_ids[playliststart:playlistend]
2589
2590 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2591 (username, all_ids_count, len(video_ids)))
2592
2593 for video_id in video_ids:
2594 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2595
2596
2597 class DepositFilesIE(InfoExtractor):
2598 """Information extractor for depositfiles.com"""
2599
2600 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2601 IE_NAME = u'DepositFiles'
2602
2603 def __init__(self, downloader=None):
2604 InfoExtractor.__init__(self, downloader)
2605
2606 def report_download_webpage(self, file_id):
2607 """Report webpage download."""
2608 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2609
2610 def report_extraction(self, file_id):
2611 """Report information extraction."""
2612 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2613
2614 def _real_initialize(self):
2615 return
2616
2617 def _real_extract(self, url):
2618 # At this point we have a new file
2619 self._downloader.increment_downloads()
2620
2621 file_id = url.split('/')[-1]
2622 # Rebuild url in english locale
2623 url = 'http://depositfiles.com/en/files/' + file_id
2624
2625 # Retrieve file webpage with 'Free download' button pressed
2626 free_download_indication = { 'gateway_result' : '1' }
2627 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2628 try:
2629 self.report_download_webpage(file_id)
2630 webpage = urllib2.urlopen(request).read()
2631 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2632 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2633 return
2634
2635 # Search for the real file URL
2636 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2637 if (mobj is None) or (mobj.group(1) is None):
2638 # Try to figure out reason of the error.
2639 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2640 if (mobj is not None) and (mobj.group(1) is not None):
2641 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2642 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2643 else:
2644 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2645 return
2646
2647 file_url = mobj.group(1)
2648 file_extension = os.path.splitext(file_url)[1][1:]
2649
2650 # Search for file title
2651 mobj = re.search(r'<b title="(.*?)">', webpage)
2652 if mobj is None:
2653 self._downloader.trouble(u'ERROR: unable to extract title')
2654 return
2655 file_title = mobj.group(1).decode('utf-8')
2656
2657 try:
2658 # Process file information
2659 self._downloader.process_info({
2660 'id': file_id.decode('utf-8'),
2661 'url': file_url.decode('utf-8'),
2662 'uploader': u'NA',
2663 'upload_date': u'NA',
2664 'title': file_title,
2665 'stitle': file_title,
2666 'ext': file_extension.decode('utf-8'),
2667 'format': u'NA',
2668 'player_url': None,
2669 })
2670 except UnavailableVideoError, err:
2671 self._downloader.trouble(u'ERROR: unable to download file')
2672
2673
2674 class FacebookIE(InfoExtractor):
2675 """Information Extractor for Facebook"""
2676
2677 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/video/video\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2678 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2679 _NETRC_MACHINE = 'facebook'
2680 _available_formats = ['highqual', 'lowqual']
2681 _video_extensions = {
2682 'highqual': 'mp4',
2683 'lowqual': 'mp4',
2684 }
2685 IE_NAME = u'facebook'
2686
2687 def __init__(self, downloader=None):
2688 InfoExtractor.__init__(self, downloader)
2689
2690 def _reporter(self, message):
2691 """Add header and report message."""
2692 self._downloader.to_screen(u'[facebook] %s' % message)
2693
2694 def report_login(self):
2695 """Report attempt to log in."""
2696 self._reporter(u'Logging in')
2697
2698 def report_video_webpage_download(self, video_id):
2699 """Report attempt to download video webpage."""
2700 self._reporter(u'%s: Downloading video webpage' % video_id)
2701
2702 def report_information_extraction(self, video_id):
2703 """Report attempt to extract video information."""
2704 self._reporter(u'%s: Extracting video information' % video_id)
2705
2706 def _parse_page(self, video_webpage):
2707 """Extract video information from page"""
2708 # General data
2709 data = {'title': r'class="video_title datawrap">(.*?)</',
2710 'description': r'<div class="datawrap">(.*?)</div>',
2711 'owner': r'\("video_owner_name", "(.*?)"\)',
2712 'upload_date': r'data-date="(.*?)"',
2713 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2714 }
2715 video_info = {}
2716 for piece in data.keys():
2717 mobj = re.search(data[piece], video_webpage)
2718 if mobj is not None:
2719 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2720
2721 # Video urls
2722 video_urls = {}
2723 for fmt in self._available_formats:
2724 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2725 if mobj is not None:
2726 # URL is in a Javascript segment inside an escaped Unicode format within
2727 # the generally utf-8 page
2728 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2729 video_info['video_urls'] = video_urls
2730
2731 return video_info
2732
2733 def _real_initialize(self):
2734 if self._downloader is None:
2735 return
2736
2737 useremail = None
2738 password = None
2739 downloader_params = self._downloader.params
2740
2741 # Attempt to use provided username and password or .netrc data
2742 if downloader_params.get('username', None) is not None:
2743 useremail = downloader_params['username']
2744 password = downloader_params['password']
2745 elif downloader_params.get('usenetrc', False):
2746 try:
2747 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2748 if info is not None:
2749 useremail = info[0]
2750 password = info[2]
2751 else:
2752 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2753 except (IOError, netrc.NetrcParseError), err:
2754 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2755 return
2756
2757 if useremail is None:
2758 return
2759
2760 # Log in
2761 login_form = {
2762 'email': useremail,
2763 'pass': password,
2764 'login': 'Log+In'
2765 }
2766 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2767 try:
2768 self.report_login()
2769 login_results = urllib2.urlopen(request).read()
2770 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2771 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2772 return
2773 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2774 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2775 return
2776
2777 def _real_extract(self, url):
2778 mobj = re.match(self._VALID_URL, url)
2779 if mobj is None:
2780 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2781 return
2782 video_id = mobj.group('ID')
2783
2784 # Get video webpage
2785 self.report_video_webpage_download(video_id)
2786 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2787 try:
2788 page = urllib2.urlopen(request)
2789 video_webpage = page.read()
2790 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2791 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2792 return
2793
2794 # Start extracting information
2795 self.report_information_extraction(video_id)
2796
2797 # Extract information
2798 video_info = self._parse_page(video_webpage)
2799
2800 # uploader
2801 if 'owner' not in video_info:
2802 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2803 return
2804 video_uploader = video_info['owner']
2805
2806 # title
2807 if 'title' not in video_info:
2808 self._downloader.trouble(u'ERROR: unable to extract video title')
2809 return
2810 video_title = video_info['title']
2811 video_title = video_title.decode('utf-8')
2812 video_title = sanitize_title(video_title)
2813
2814 # simplified title
2815 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2816 simple_title = simple_title.strip(ur'_')
2817
2818 # thumbnail image
2819 if 'thumbnail' not in video_info:
2820 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2821 video_thumbnail = ''
2822 else:
2823 video_thumbnail = video_info['thumbnail']
2824
2825 # upload date
2826 upload_date = u'NA'
2827 if 'upload_date' in video_info:
2828 upload_time = video_info['upload_date']
2829 timetuple = email.utils.parsedate_tz(upload_time)
2830 if timetuple is not None:
2831 try:
2832 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2833 except:
2834 pass
2835
2836 # description
2837 video_description = video_info.get('description', 'No description available.')
2838
2839 url_map = video_info['video_urls']
2840 if len(url_map.keys()) > 0:
2841 # Decide which formats to download
2842 req_format = self._downloader.params.get('format', None)
2843 format_limit = self._downloader.params.get('format_limit', None)
2844
2845 if format_limit is not None and format_limit in self._available_formats:
2846 format_list = self._available_formats[self._available_formats.index(format_limit):]
2847 else:
2848 format_list = self._available_formats
2849 existing_formats = [x for x in format_list if x in url_map]
2850 if len(existing_formats) == 0:
2851 self._downloader.trouble(u'ERROR: no known formats available for video')
2852 return
2853 if req_format is None:
2854 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2855 elif req_format == 'worst':
2856 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2857 elif req_format == '-1':
2858 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2859 else:
2860 # Specific format
2861 if req_format not in url_map:
2862 self._downloader.trouble(u'ERROR: requested format not available')
2863 return
2864 video_url_list = [(req_format, url_map[req_format])] # Specific format
2865
2866 for format_param, video_real_url in video_url_list:
2867
2868 # At this point we have a new video
2869 self._downloader.increment_downloads()
2870
2871 # Extension
2872 video_extension = self._video_extensions.get(format_param, 'mp4')
2873
2874 try:
2875 # Process video information
2876 self._downloader.process_info({
2877 'id': video_id.decode('utf-8'),
2878 'url': video_real_url.decode('utf-8'),
2879 'uploader': video_uploader.decode('utf-8'),
2880 'upload_date': upload_date,
2881 'title': video_title,
2882 'stitle': simple_title,
2883 'ext': video_extension.decode('utf-8'),
2884 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2885 'thumbnail': video_thumbnail.decode('utf-8'),
2886 'description': video_description.decode('utf-8'),
2887 'player_url': None,
2888 })
2889 except UnavailableVideoError, err:
2890 self._downloader.trouble(u'\nERROR: unable to download video')
2891
2892 class BlipTVIE(InfoExtractor):
2893 """Information extractor for blip.tv"""
2894
2895 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2896 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2897 IE_NAME = u'blip.tv'
2898
2899 def report_extraction(self, file_id):
2900 """Report information extraction."""
2901 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2902
2903 def report_direct_download(self, title):
2904 """Report information extraction."""
2905 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2906
2907 def _simplify_title(self, title):
2908 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
2909 res = res.strip(ur'_')
2910 return res
2911
2912 def _real_extract(self, url):
2913 mobj = re.match(self._VALID_URL, url)
2914 if mobj is None:
2915 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2916 return
2917
2918 if '?' in url:
2919 cchar = '&'
2920 else:
2921 cchar = '?'
2922 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2923 request = urllib2.Request(json_url)
2924 self.report_extraction(mobj.group(1))
2925 info = None
2926 try:
2927 urlh = urllib2.urlopen(request)
2928 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2929 basename = url.split('/')[-1]
2930 title,ext = os.path.splitext(basename)
2931 ext = ext.replace('.', '')
2932 self.report_direct_download(title)
2933 info = {
2934 'id': title,
2935 'url': url,
2936 'title': title,
2937 'stitle': self._simplify_title(title),
2938 'ext': ext,
2939 'urlhandle': urlh
2940 }
2941 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2942 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2943 return
2944 if info is None: # Regular URL
2945 try:
2946 json_code = urlh.read()
2947 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2948 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2949 return
2950
2951 try:
2952 json_data = json.loads(json_code)
2953 if 'Post' in json_data:
2954 data = json_data['Post']
2955 else:
2956 data = json_data
2957
2958 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2959 video_url = data['media']['url']
2960 umobj = re.match(self._URL_EXT, video_url)
2961 if umobj is None:
2962 raise ValueError('Can not determine filename extension')
2963 ext = umobj.group(1)
2964
2965 info = {
2966 'id': data['item_id'],
2967 'url': video_url,
2968 'uploader': data['display_name'],
2969 'upload_date': upload_date,
2970 'title': data['title'],
2971 'stitle': self._simplify_title(data['title']),
2972 'ext': ext,
2973 'format': data['media']['mimeType'],
2974 'thumbnail': data['thumbnailUrl'],
2975 'description': data['description'],
2976 'player_url': data['embedUrl']
2977 }
2978 except (ValueError,KeyError), err:
2979 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2980 return
2981
2982 self._downloader.increment_downloads()
2983
2984 try:
2985 self._downloader.process_info(info)
2986 except UnavailableVideoError, err:
2987 self._downloader.trouble(u'\nERROR: unable to download video')
2988
2989
2990 class MyVideoIE(InfoExtractor):
2991 """Information Extractor for myvideo.de."""
2992
2993 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2994 IE_NAME = u'myvideo'
2995
2996 def __init__(self, downloader=None):
2997 InfoExtractor.__init__(self, downloader)
2998
2999 def report_download_webpage(self, video_id):
3000 """Report webpage download."""
3001 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3002
3003 def report_extraction(self, video_id):
3004 """Report information extraction."""
3005 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3006
3007 def _real_initialize(self):
3008 return
3009
3010 def _real_extract(self,url):
3011 mobj = re.match(self._VALID_URL, url)
3012 if mobj is None:
3013 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3014 return
3015
3016 video_id = mobj.group(1)
3017 simple_title = mobj.group(2).decode('utf-8')
3018 # should actually not be necessary
3019 simple_title = sanitize_title(simple_title)
3020 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', simple_title)
3021
3022 # Get video webpage
3023 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3024 try:
3025 self.report_download_webpage(video_id)
3026 webpage = urllib2.urlopen(request).read()
3027 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3028 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3029 return
3030
3031 self.report_extraction(video_id)
3032 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3033 webpage)
3034 if mobj is None:
3035 self._downloader.trouble(u'ERROR: unable to extract media URL')
3036 return
3037 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3038
3039 mobj = re.search('<title>([^<]+)</title>', webpage)
3040 if mobj is None:
3041 self._downloader.trouble(u'ERROR: unable to extract title')
3042 return
3043
3044 video_title = mobj.group(1)
3045 video_title = sanitize_title(video_title)
3046
3047 try:
3048 self._downloader.process_info({
3049 'id': video_id,
3050 'url': video_url,
3051 'uploader': u'NA',
3052 'upload_date': u'NA',
3053 'title': video_title,
3054 'stitle': simple_title,
3055 'ext': u'flv',
3056 'format': u'NA',
3057 'player_url': None,
3058 })
3059 except UnavailableVideoError:
3060 self._downloader.trouble(u'\nERROR: Unable to download video')
3061
3062 class ComedyCentralIE(InfoExtractor):
3063 """Information extractor for The Daily Show and Colbert Report """
3064
3065 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3066 IE_NAME = u'comedycentral'
3067
3068 def report_extraction(self, episode_id):
3069 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3070
3071 def report_config_download(self, episode_id):
3072 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3073
3074 def report_index_download(self, episode_id):
3075 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3076
3077 def report_player_url(self, episode_id):
3078 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3079
3080 def _simplify_title(self, title):
3081 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3082 res = res.strip(ur'_')
3083 return res
3084
3085 def _real_extract(self, url):
3086 mobj = re.match(self._VALID_URL, url)
3087 if mobj is None:
3088 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3089 return
3090
3091 if mobj.group('shortname'):
3092 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3093 url = 'http://www.thedailyshow.com/full-episodes/'
3094 else:
3095 url = 'http://www.colbertnation.com/full-episodes/'
3096 mobj = re.match(self._VALID_URL, url)
3097 assert mobj is not None
3098
3099 dlNewest = not mobj.group('episode')
3100 if dlNewest:
3101 epTitle = mobj.group('showname')
3102 else:
3103 epTitle = mobj.group('episode')
3104
3105 req = urllib2.Request(url)
3106 self.report_extraction(epTitle)
3107 try:
3108 htmlHandle = urllib2.urlopen(req)
3109 html = htmlHandle.read()
3110 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3111 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3112 return
3113 if dlNewest:
3114 url = htmlHandle.geturl()
3115 mobj = re.match(self._VALID_URL, url)
3116 if mobj is None:
3117 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3118 return
3119 if mobj.group('episode') == '':
3120 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3121 return
3122 epTitle = mobj.group('episode')
3123
3124 mMovieParams = re.findall('<param name="movie" value="(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"/>', html)
3125 if len(mMovieParams) == 0:
3126 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3127 return
3128
3129 playerUrl_raw = mMovieParams[0][0]
3130 self.report_player_url(epTitle)
3131 try:
3132 urlHandle = urllib2.urlopen(playerUrl_raw)
3133 playerUrl = urlHandle.geturl()
3134 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3135 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3136 return
3137
3138 uri = mMovieParams[0][1]
3139 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3140 self.report_index_download(epTitle)
3141 try:
3142 indexXml = urllib2.urlopen(indexUrl).read()
3143 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3144 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3145 return
3146
3147 idoc = xml.etree.ElementTree.fromstring(indexXml)
3148 itemEls = idoc.findall('.//item')
3149 for itemEl in itemEls:
3150 mediaId = itemEl.findall('./guid')[0].text
3151 shortMediaId = mediaId.split(':')[-1]
3152 showId = mediaId.split(':')[-2].replace('.com', '')
3153 officialTitle = itemEl.findall('./title')[0].text
3154 officialDate = itemEl.findall('./pubDate')[0].text
3155
3156 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3157 urllib.urlencode({'uri': mediaId}))
3158 configReq = urllib2.Request(configUrl)
3159 self.report_config_download(epTitle)
3160 try:
3161 configXml = urllib2.urlopen(configReq).read()
3162 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3163 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3164 return
3165
3166 cdoc = xml.etree.ElementTree.fromstring(configXml)
3167 turls = []
3168 for rendition in cdoc.findall('.//rendition'):
3169 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3170 turls.append(finfo)
3171
3172 if len(turls) == 0:
3173 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3174 continue
3175
3176 # For now, just pick the highest bitrate
3177 format,video_url = turls[-1]
3178
3179 self._downloader.increment_downloads()
3180
3181 effTitle = showId + '-' + epTitle
3182 info = {
3183 'id': shortMediaId,
3184 'url': video_url,
3185 'uploader': showId,
3186 'upload_date': officialDate,
3187 'title': effTitle,
3188 'stitle': self._simplify_title(effTitle),
3189 'ext': 'mp4',
3190 'format': format,
3191 'thumbnail': None,
3192 'description': officialTitle,
3193 'player_url': playerUrl
3194 }
3195
3196 try:
3197 self._downloader.process_info(info)
3198 except UnavailableVideoError, err:
3199 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
3200 continue
3201
3202
3203 class EscapistIE(InfoExtractor):
3204 """Information extractor for The Escapist """
3205
3206 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3207 IE_NAME = u'escapist'
3208
3209 def report_extraction(self, showName):
3210 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3211
3212 def report_config_download(self, showName):
3213 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3214
3215 def _simplify_title(self, title):
3216 res = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', title)
3217 res = res.strip(ur'_')
3218 return res
3219
3220 def _real_extract(self, url):
3221 htmlParser = HTMLParser.HTMLParser()
3222
3223 mobj = re.match(self._VALID_URL, url)
3224 if mobj is None:
3225 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3226 return
3227 showName = mobj.group('showname')
3228 videoId = mobj.group('episode')
3229
3230 self.report_extraction(showName)
3231 try:
3232 webPage = urllib2.urlopen(url).read()
3233 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3234 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3235 return
3236
3237 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3238 description = htmlParser.unescape(descMatch.group(1))
3239 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3240 imgUrl = htmlParser.unescape(imgMatch.group(1))
3241 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3242 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3243 configUrlMatch = re.search('config=(.*)$', playerUrl)
3244 configUrl = urllib2.unquote(configUrlMatch.group(1))
3245
3246 self.report_config_download(showName)
3247 try:
3248 configJSON = urllib2.urlopen(configUrl).read()
3249 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3250 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3251 return
3252
3253 # Technically, it's JavaScript, not JSON
3254 configJSON = configJSON.replace("'", '"')
3255
3256 try:
3257 config = json.loads(configJSON)
3258 except (ValueError,), err:
3259 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3260 return
3261
3262 playlist = config['playlist']
3263 videoUrl = playlist[1]['url']
3264
3265 self._downloader.increment_downloads()
3266 info = {
3267 'id': videoId,
3268 'url': videoUrl,
3269 'uploader': showName,
3270 'upload_date': None,
3271 'title': showName,
3272 'stitle': self._simplify_title(showName),
3273 'ext': 'flv',
3274 'format': 'flv',
3275 'thumbnail': imgUrl,
3276 'description': description,
3277 'player_url': playerUrl,
3278 }
3279
3280 try:
3281 self._downloader.process_info(info)
3282 except UnavailableVideoError, err:
3283 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3284
3285
3286
class PostProcessor(object):
    """Base class for post-processing steps.

    A downloader keeps a chain of PostProcessor objects, registered through
    its add_post_processor() method. After a successful download it calls
    run() on each of them in turn: the first one receives the initial
    information and each following one receives whatever the previous one
    returned.

    Processing ends when the chain is exhausted or as soon as one run()
    returns None.

    Like InfoExtractor objects, PostProcessor objects and their downloader
    register with each other ("mutual registration").
    """

    # Downloader this post processor is attached to, if any.
    _downloader = None

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Attach this post processor to the given downloader."""
        self._downloader = downloader

    def run(self, information):
        """Process one downloaded file.

        "information" is a dictionary such as InfoExtractors compose, with
        one additional "filepath" field naming the downloaded file.

        Return None to stop the post-processing chain, or an information
        dictionary (possibly the received one with some fields changed) to
        be handed to the next post processor.

        A PostProcessingError may be raised; the calling downloader takes
        it into account.
        """
        # The base implementation passes the data through untouched.
        return information
3332
3333
class FFmpegExtractAudioPP(PostProcessor):
    """Post processor that extracts the audio track of a download.

    ffprobe is used to find out the audio codec of the file and ffmpeg to
    write the audio to a new file next to it. The audio is copied as-is when
    the preferred codec is 'best' or already matches, and re-encoded
    otherwise. Unless keepvideo is set the original file is removed.
    """

    def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
        PostProcessor.__init__(self, downloader)
        if preferredcodec is None:
            preferredcodec = 'best'
        self._preferredcodec = preferredcodec
        self._preferredquality = preferredquality
        self._keepvideo = keepvideo

    @staticmethod
    def get_audio_codec(path):
        """Return the audio codec name ffprobe reports for path, or None.

        None is returned when ffprobe cannot be run, exits with an error or
        reports no audio stream.
        """
        try:
            # ffprobe's stderr is discarded; the handle is closed again
            # instead of being left to the garbage collector.
            devnull = open(os.path.devnull, 'w')
            try:
                cmd = ['ffprobe', '-show_streams', '--', path]
                handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
                output = handle.communicate()[0]
                if handle.wait() != 0:
                    return None
            finally:
                devnull.close()
        except (IOError, OSError):
            return None
        audio_codec = None
        for line in output.split('\n'):
            # Remember the most recent codec name and return it once the
            # stream it belongs to turns out to be an audio stream.
            if line.startswith('codec_name='):
                audio_codec = line.split('=')[1].strip()
            elif line.strip() == 'codec_type=audio' and audio_codec is not None:
                return audio_codec
        return None

    @staticmethod
    def run_ffmpeg(path, out_path, codec, more_opts):
        """Run ffmpeg to write the audio of path to out_path.

        codec is passed to -acodec and more_opts is a list of additional
        command line arguments. Returns True when ffmpeg exits with status 0
        and False when it fails or cannot be started.
        """
        try:
            # All ffmpeg output is discarded; close the handle afterwards.
            devnull = open(os.path.devnull, 'w')
            try:
                cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
                ret = subprocess.call(cmd, stdout=devnull, stderr=subprocess.STDOUT)
            finally:
                devnull.close()
            return (ret == 0)
        except (IOError, OSError):
            return False

    def run(self, information):
        """Extract the audio of information['filepath'].

        Returns the information dictionary with 'filepath' pointing at the
        audio file, or None (stopping the post-processing chain) when the
        codec cannot be determined, ffmpeg fails or the original file
        cannot be removed.
        """
        path = information['filepath']

        filecodec = self.get_audio_codec(path)
        if filecodec is None:
            self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
            return None

        more_opts = []
        if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
            if filecodec in ['aac', 'mp3', 'vorbis']:
                # Lossless if possible
                acodec = 'copy'
                extension = filecodec
                if filecodec == 'aac':
                    more_opts = ['-f', 'adts']
                if filecodec == 'vorbis':
                    extension = 'ogg'
            else:
                # MP3 otherwise.
                acodec = 'libmp3lame'
                extension = 'mp3'
                if self._preferredquality is not None:
                    more_opts += ['-ab', self._preferredquality]
        else:
            # We convert the audio (lossy)
            acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'vorbis': 'libvorbis'}[self._preferredcodec]
            extension = self._preferredcodec
            if self._preferredquality is not None:
                more_opts += ['-ab', self._preferredquality]
            if self._preferredcodec == 'aac':
                more_opts += ['-f', 'adts']
            if self._preferredcodec == 'vorbis':
                extension = 'ogg'

        (prefix, ext) = os.path.splitext(path)
        new_path = prefix + '.' + extension
        self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
        status = self.run_ffmpeg(path, new_path, acodec, more_opts)

        if not status:
            self._downloader.to_stderr(u'WARNING: error running ffmpeg')
            return None

        # Try to update the date time for extracted audio file.
        if information.get('filetime') is not None:
            try:
                os.utime(new_path, (time.time(), information['filetime']))
            except Exception:
                # Best effort only; a failure here must not lose the audio.
                self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

        if not self._keepvideo:
            try:
                os.remove(path)
            except (IOError, OSError):
                self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
                return None

        information['filepath'] = new_path
        return information
3433
3434
def updateSelf(downloader, filename):
    ''' Update the program file with the latest version from the repository

    downloader is only used for reporting progress through to_screen();
    filename is the path of the program file to overwrite. The process is
    terminated through sys.exit() with an error message when the file is not
    writable, the download fails, the downloaded data does not carry a
    version line, or the file cannot be written.
    '''
    # Note: downloader only used for options
    if not os.access(filename, os.W_OK):
        sys.exit('ERROR: no write permissions on %s' % filename)

    downloader.to_screen('Updating to latest version...')

    try:
        # Open outside the try/finally: if the connection fails there is no
        # handle to close, and the error must reach the handler below.
        urlh = urllib.urlopen(UPDATE_URL)
        try:
            newcontent = urlh.read()
        finally:
            urlh.close()
    except (IOError, OSError):
        sys.exit('ERROR: unable to download latest version')

    vmatch = re.search("__version__ = '([^']+)'", newcontent)
    if vmatch is None:
        # Without a version line this is not the program (e.g. an error
        # page or an empty reply); never overwrite the program with it.
        sys.exit('ERROR: downloaded data does not look like youtube-dl, not updating')
    if vmatch.group(1) == __version__:
        downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
        return

    try:
        outf = open(filename, 'wb')
        try:
            outf.write(newcontent)
        finally:
            outf.close()
    except (IOError, OSError):
        sys.exit('ERROR: unable to overwrite current version')

    downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
3467
3468 def parseOpts():
3469 # Deferred imports
3470 import getpass
3471 import optparse
3472
3473 def _format_option_string(option):
3474 ''' ('-o', '--option') -> -o, --format METAVAR'''
3475
3476 opts = []
3477
3478 if option._short_opts: opts.append(option._short_opts[0])
3479 if option._long_opts: opts.append(option._long_opts[0])
3480 if len(opts) > 1: opts.insert(1, ', ')
3481
3482 if option.takes_value(): opts.append(' %s' % option.metavar)
3483
3484 return "".join(opts)
3485
3486 def _find_term_columns():
3487 columns = os.environ.get('COLUMNS', None)
3488 if columns:
3489 return int(columns)
3490
3491 try:
3492 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
3493 out,err = sp.communicate()
3494 return int(out.split()[1])
3495 except:
3496 pass
3497 return None
3498
3499 max_width = 80
3500 max_help_position = 80
3501
3502 # No need to wrap help messages if we're on a wide console
3503 columns = _find_term_columns()
3504 if columns: max_width = columns
3505
3506 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
3507 fmt.format_option_strings = _format_option_string
3508
3509 kw = {
3510 'version' : __version__,
3511 'formatter' : fmt,
3512 'usage' : '%prog [options] url [url...]',
3513 'conflict_handler' : 'resolve',
3514 }
3515
3516 parser = optparse.OptionParser(**kw)
3517
3518 # option groups
3519 general = optparse.OptionGroup(parser, 'General Options')
3520 selection = optparse.OptionGroup(parser, 'Video Selection')
3521 authentication = optparse.OptionGroup(parser, 'Authentication Options')
3522 video_format = optparse.OptionGroup(parser, 'Video Format Options')
3523 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
3524 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
3525 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
3526
3527 general.add_option('-h', '--help',
3528 action='help', help='print this help text and exit')
3529 general.add_option('-v', '--version',
3530 action='version', help='print program version and exit')
3531 general.add_option('-U', '--update',
3532 action='store_true', dest='update_self', help='update this program to latest version')
3533 general.add_option('-i', '--ignore-errors',
3534 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
3535 general.add_option('-r', '--rate-limit',
3536 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
3537 general.add_option('-R', '--retries',
3538 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
3539 general.add_option('--dump-user-agent',
3540 action='store_true', dest='dump_user_agent',
3541 help='display the current browser identification', default=False)
3542 general.add_option('--list-extractors',
3543 action='store_true', dest='list_extractors',
3544 help='List all supported extractors and the URLs they would handle', default=False)
3545
3546 selection.add_option('--playlist-start',
3547 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
3548 selection.add_option('--playlist-end',
3549 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
3550 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
3551 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
3552
3553 authentication.add_option('-u', '--username',
3554 dest='username', metavar='USERNAME', help='account username')
3555 authentication.add_option('-p', '--password',
3556 dest='password', metavar='PASSWORD', help='account password')
3557 authentication.add_option('-n', '--netrc',
3558 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
3559
3560
3561 video_format.add_option('-f', '--format',
3562 action='store', dest='format', metavar='FORMAT', help='video format code')
3563 video_format.add_option('--all-formats',
3564 action='store_const', dest='format', help='download all available video formats', const='all')
3565 video_format.add_option('--max-quality',
3566 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
3567
3568
3569 verbosity.add_option('-q', '--quiet',
3570 action='store_true', dest='quiet', help='activates quiet mode', default=False)
3571 verbosity.add_option('-s', '--simulate',
3572 action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
3573 verbosity.add_option('--skip-download',
3574 action='store_true', dest='skip_download', help='do not download the video', default=False)
3575 verbosity.add_option('-g', '--get-url',
3576 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
3577 verbosity.add_option('-e', '--get-title',
3578 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
3579 verbosity.add_option('--get-thumbnail',
3580 action='store_true', dest='getthumbnail',
3581 help='simulate, quiet but print thumbnail URL', default=False)
3582 verbosity.add_option('--get-description',
3583 action='store_true', dest='getdescription',
3584 help='simulate, quiet but print video description', default=False)
3585 verbosity.add_option('--get-filename',
3586 action='store_true', dest='getfilename',
3587 help='simulate, quiet but print output filename', default=False)
3588 verbosity.add_option('--get-format',
3589 action='store_true', dest='getformat',
3590 help='simulate, quiet but print output format', default=False)
3591 verbosity.add_option('--no-progress',
3592 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
3593 verbosity.add_option('--console-title',
3594 action='store_true', dest='consoletitle',
3595 help='display progress in console titlebar', default=False)
3596
3597
3598 filesystem.add_option('-t', '--title',
3599 action='store_true', dest='usetitle', help='use title in file name', default=False)
3600 filesystem.add_option('-l', '--literal',
3601 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
3602 filesystem.add_option('-A', '--auto-number',
3603 action='store_true', dest='autonumber',
3604 help='number downloaded files starting from 00000', default=False)
3605 filesystem.add_option('-o', '--output',
3606 dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, and %% for a literal percent')
3607 filesystem.add_option('-a', '--batch-file',
3608 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
3609 filesystem.add_option('-w', '--no-overwrites',
3610 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
3611 filesystem.add_option('-c', '--continue',
3612 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
3613 filesystem.add_option('--no-continue',
3614 action='store_false', dest='continue_dl',
3615 help='do not resume partially downloaded files (restart from beginning)')
3616 filesystem.add_option('--cookies',
3617 dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
3618 filesystem.add_option('--no-part',
3619 action='store_true', dest='nopart', help='do not use .part files', default=False)
3620 filesystem.add_option('--no-mtime',
3621 action='store_false', dest='updatetime',
3622 help='do not use the Last-modified header to set the file modification time', default=True)
3623 filesystem.add_option('--write-description',
3624 action='store_true', dest='writedescription',
3625 help='write video description to a .description file', default=False)
3626 filesystem.add_option('--write-info-json',
3627 action='store_true', dest='writeinfojson',
3628 help='write video metadata to a .info.json file', default=False)
3629
3630
3631 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
3632 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
3633 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
3634 help='"best", "aac", "vorbis" or "mp3"; best by default')
3635 postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
3636 help='ffmpeg audio bitrate specification, 128k by default')
3637 postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
3638 help='keeps the video file on disk after the post-processing; the video is erased by default')
3639
3640
3641 parser.add_option_group(general)
3642 parser.add_option_group(selection)
3643 parser.add_option_group(filesystem)
3644 parser.add_option_group(verbosity)
3645 parser.add_option_group(video_format)
3646 parser.add_option_group(authentication)
3647 parser.add_option_group(postproc)
3648
3649 opts, args = parser.parse_args()
3650
3651 return parser, opts, args
3652
def gen_extractors():
	""" Build and return a list holding one instance of every supported extractor.

	The position in the list is significant: the first extractor that
	matches a URL is the one that handles it. GenericIE is kept last.
	"""
	# Extractors that are shared: each one is both listed on its own and
	# passed to the constructors of other extractors below.
	yt_extractor = YoutubeIE()
	google_extractor = GoogleIE()
	yahoo_extractor = YahooIE()

	ordered_extractors = [
		YoutubePlaylistIE(yt_extractor),
		YoutubeUserIE(yt_extractor),
		YoutubeSearchIE(yt_extractor),
		yt_extractor,
		MetacafeIE(yt_extractor),
		DailymotionIE(),
		google_extractor,
		GoogleSearchIE(google_extractor),
		PhotobucketIE(),
		yahoo_extractor,
		YahooSearchIE(yahoo_extractor),
		DepositFilesIE(),
		FacebookIE(),
		BlipTVIE(),
		VimeoIE(),
		MyVideoIE(),
		ComedyCentralIE(),
		EscapistIE(),

		GenericIE()
	]
	return ordered_extractors
3682
def main():
	"""Command-line entry point: parse options, set up and run the downloader.

	Steps, in order:
	  1. Parse the command line with parseOpts().
	  2. Create the cookie jar (loading --cookies FILE if it is readable).
	  3. Handle --dump-user-agent (print and exit).
	  4. Collect URLs from the batch file (or stdin for '-') and from args.
	  5. Install the global urllib2 opener and the default socket timeout.
	  6. Handle --list-extractors (print and exit).
	  7. Validate and normalize options; report problems via parser.error().
	  8. Build the FileDownloader, register extractors and post-processors.
	  9. Optionally self-update, download every URL, save the cookie jar.

	This function does not return normally: every path ends in sys.exit()
	(or parser.error()). On the normal path the value returned by
	fd.download() is passed to sys.exit().
	"""
	# Deferred import: getpass is not in the module-level import list, but it
	# is needed below to prompt for a password.
	import getpass

	parser, opts, args = parseOpts()

	# Open appropriate CookieJar
	if opts.cookiefile is None:
		jar = cookielib.CookieJar()
	else:
		try:
			jar = cookielib.MozillaCookieJar(opts.cookiefile)
			# Only load when the file exists and is readable; otherwise the
			# jar starts empty and is written out at the end of the run.
			if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
				jar.load()
		except (IOError, OSError):
			sys.exit(u'ERROR: unable to open cookie file')

	# Dump user agent
	if opts.dump_user_agent:
		print(std_headers['User-Agent'])
		sys.exit(0)

	# Batch file verification
	batchurls = []
	if opts.batchfile is not None:
		try:
			if opts.batchfile == '-':
				batchfd = sys.stdin
			else:
				batchfd = open(opts.batchfile, 'r')
			try:
				batchurls = [x.strip() for x in batchfd.readlines()]
			finally:
				# Release the handle we opened; never close stdin.
				if batchfd is not sys.stdin:
					batchfd.close()
			# Drop blank lines and lines starting with '#', '/' or ';'
			batchurls = [x for x in batchurls if x and not re.search(r'^[#/;]', x)]
		except IOError:
			sys.exit(u'ERROR: batch file could not be read')
	all_urls = batchurls + args

	# General configuration
	cookie_processor = urllib2.HTTPCookieProcessor(jar)
	opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
	urllib2.install_opener(opener)
	socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

	extractors = gen_extractors()

	if opts.list_extractors:
		# Print every extractor name followed by the given URLs it would
		# handle. A URL is listed only under the first extractor matching it.
		for ie in extractors:
			print(ie.IE_NAME)
			matchedUrls = [url for url in all_urls if ie.suitable(url)]
			all_urls = [url for url in all_urls if url not in matchedUrls]
			for mu in matchedUrls:
				print(u' ' + mu)
		sys.exit(0)

	# Conflicting, missing and erroneous options
	if opts.usenetrc and (opts.username is not None or opts.password is not None):
		parser.error(u'using .netrc conflicts with giving username/password')
	if opts.password is not None and opts.username is None:
		parser.error(u'account username missing')
	if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
		parser.error(u'using output template conflicts with using title, literal title or auto number')
	if opts.usetitle and opts.useliteral:
		parser.error(u'using title conflicts with using literal title')
	if opts.username is not None and opts.password is None:
		# Username without password: ask for the password interactively
		opts.password = getpass.getpass(u'Type account password and press return:')
	if opts.ratelimit is not None:
		numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
		if numeric_limit is None:
			parser.error(u'invalid rate limit specified')
		opts.ratelimit = numeric_limit
	if opts.retries is not None:
		try:
			opts.retries = long(opts.retries)
		except (TypeError, ValueError):
			parser.error(u'invalid retry count specified')
	try:
		opts.playliststart = int(opts.playliststart)
		if opts.playliststart <= 0:
			raise ValueError(u'Playlist start must be positive')
	except (TypeError, ValueError):
		parser.error(u'invalid playlist start number specified')
	try:
		opts.playlistend = int(opts.playlistend)
		# -1 is the option's default value (help text: "default is last")
		if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
			raise ValueError(u'Playlist end must be greater than playlist start')
	except (TypeError, ValueError):
		parser.error(u'invalid playlist end number specified')
	if opts.extractaudio:
		if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis']:
			parser.error(u'invalid audio format specified')

	# Any of the --get-* options implies both quiet mode and skipping the
	# download, so compute that once.
	info_only = (opts.geturl or opts.gettitle or opts.getthumbnail
		or opts.getdescription or opts.getfilename or opts.getformat)

	# File downloader
	fd = FileDownloader({
		'usenetrc': opts.usenetrc,
		'username': opts.username,
		'password': opts.password,
		'quiet': (opts.quiet or info_only),
		'forceurl': opts.geturl,
		'forcetitle': opts.gettitle,
		'forcethumbnail': opts.getthumbnail,
		'forcedescription': opts.getdescription,
		'forcefilename': opts.getfilename,
		'forceformat': opts.getformat,
		'simulate': opts.simulate,
		'skip_download': (opts.skip_download or opts.simulate or info_only),
		'format': opts.format,
		'format_limit': opts.format_limit,
		# Output template: an explicit -o template wins; otherwise the first
		# alternative whose option combination is set is used, with
		# '%(id)s.%(ext)s' as the final fallback.
		'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
			or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
			or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
			or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
			or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
			or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
			or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
			or u'%(id)s.%(ext)s'),
		'ignoreerrors': opts.ignoreerrors,
		'ratelimit': opts.ratelimit,
		'nooverwrites': opts.nooverwrites,
		'retries': opts.retries,
		'continuedl': opts.continue_dl,
		'noprogress': opts.noprogress,
		'playliststart': opts.playliststart,
		'playlistend': opts.playlistend,
		'logtostderr': opts.outtmpl == '-',
		'consoletitle': opts.consoletitle,
		'nopart': opts.nopart,
		'updatetime': opts.updatetime,
		'writedescription': opts.writedescription,
		'writeinfojson': opts.writeinfojson,
		'matchtitle': opts.matchtitle,
		'rejecttitle': opts.rejecttitle,
		})
	for extractor in extractors:
		fd.add_info_extractor(extractor)

	# PostProcessors
	if opts.extractaudio:
		fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

	# Update version
	if opts.update_self:
		updateSelf(fd, sys.argv[0])

	# Maybe do nothing
	if not all_urls:
		# No URLs is only acceptable when the user just asked for an update
		if not opts.update_self:
			parser.error(u'you must provide at least one URL')
		else:
			sys.exit()
	retcode = fd.download(all_urls)

	# Dump cookie jar if requested
	if opts.cookiefile is not None:
		try:
			jar.save()
		except (IOError, OSError):
			sys.exit(u'ERROR: unable to save cookie jar')

	sys.exit(retcode)
3841
3842
if __name__ == '__main__':
	# Script entry point: run main() and turn the exceptions handled here
	# into a process exit.
	try:
		main()
	except DownloadError:
		# Exit with status 1 and no additional message
		sys.exit(1)
	except SameFileError:
		# A string argument makes sys.exit() print it to stderr and exit with status 1
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		# Leading newline puts the message on a line of its own
		sys.exit(u'\nERROR: Interrupted by user')
3852
3853 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: