]> Raphaël G. Git Repositories - youtubedl/blob - youtube-dl
debian/gbp.conf: Add defaults for the git-buildpackage suite.
[youtubedl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 __authors__ = (
5 'Ricardo Garcia Gonzalez',
6 'Danny Colligan',
7 'Benjamin Johnson',
8 'Vasyl\' Vavrychuk',
9 'Witold Baryluk',
10 'Paweł Paprota',
11 'Gergely Imreh',
12 'Rogério Brito',
13 'Philipp Hagemeister',
14 'Sören Schulze',
15 'Kevin Ngo',
16 'Ori Avtalion',
17 'shizeeg',
18 )
19
20 __license__ = 'Public Domain'
21 __version__ = '2012.01.05'
22
23 UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
24
25 import cookielib
26 import datetime
27 import gzip
28 import htmlentitydefs
29 import HTMLParser
30 import httplib
31 import locale
32 import math
33 import netrc
34 import os
35 import os.path
36 import re
37 import socket
38 import string
39 import subprocess
40 import sys
41 import time
42 import urllib
43 import urllib2
44 import warnings
45 import zlib
46
47 if os.name == 'nt':
48 import ctypes
49
50 try:
51 import email.utils
52 except ImportError: # Python 2.4
53 import email.Utils
54 try:
55 import cStringIO as StringIO
56 except ImportError:
57 import StringIO
58
59 # parse_qs was moved from the cgi module to the urlparse module recently.
60 try:
61 from urlparse import parse_qs
62 except ImportError:
63 from cgi import parse_qs
64
65 try:
66 import lxml.etree
67 except ImportError:
68 pass # Handled below
69
70 try:
71 import xml.etree.ElementTree
72 except ImportError: # Python<2.5: Not officially supported, but let it slip
73 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
74
75 std_headers = {
76 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
77 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
78 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
79 'Accept-Encoding': 'gzip, deflate',
80 'Accept-Language': 'en-us,en;q=0.5',
81 }
82
try:
    import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
    import re
    class json(object):
        """Tiny stand-in for the standard json module; only loads() is implemented."""

        @staticmethod
        def loads(s):
            """Decode the UTF-8 byte string s as JSON and return the resulting object.

            Raises ValueError when the document is malformed or is followed by
            extra data.
            """
            s = s.decode('UTF-8')
            def raiseError(msg, i):
                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
            def skipSpace(i, expectMore=True):
                # Advance past whitespace; optionally insist that input remains.
                while i < len(s) and s[i] in ' \t\r\n':
                    i += 1
                if expectMore:
                    if i >= len(s):
                        raiseError('Premature end', i)
                return i
            def decodeEscape(match):
                # Translate one backslash escape (without the backslash) to text.
                esc = match.group(1)
                _STATIC = {
                    '"': '"',
                    '\\': '\\',
                    '/': '/',
                    'b': unichr(0x8),
                    'f': unichr(0xc),
                    'n': '\n',
                    'r': '\r',
                    't': '\t',
                }
                if esc in _STATIC:
                    return _STATIC[esc]
                # esc[:1] instead of esc[0]: the pattern may match an empty escape
                if esc[:1] == 'u':
                    if len(esc) == 1+4:
                        return unichr(int(esc[1:5], 16))
                    if len(esc) == 5+6 and esc[5:7] == '\\u':
                        hi = int(esc[1:5], 16)
                        low = int(esc[7:11], 16)
                        try:
                            return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                        except ValueError:
                            # Narrow Python builds cannot represent code points
                            # above 0xffff in one unichr(); keep the pair.
                            return unichr(hi) + unichr(low)
                raise ValueError('Unknown escape ' + str(esc))
            def parseString(i):
                i += 1
                e = i
                while True:
                    e = s.index('"', e)
                    # A quote preceded by an odd number of backslashes is escaped
                    bslashes = 0
                    while s[e-bslashes-1] == '\\':
                        bslashes += 1
                    if bslashes % 2 == 1:
                        e += 1
                        continue
                    break
                # The pair alternative requires a real low surrogate (dc00-dfff);
                # anything else is decoded as two independent \uXXXX escapes.
                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[dD][c-fC-F][0-9a-fA-F]{2}|u[0-9a-fA-F]{4}|.|$)')
                stri = rexp.sub(decodeEscape, s[i:e])
                return (e+1,stri)
            def parseObj(i):
                i += 1
                res = {}
                i = skipSpace(i)
                if s[i] == '}': # Empty dictionary
                    return (i+1,res)
                while True:
                    if s[i] != '"':
                        raiseError('Expected a string object key', i)
                    i,key = parseString(i)
                    i = skipSpace(i)
                    if i >= len(s) or s[i] != ':':
                        raiseError('Expected a colon', i)
                    i,val = parse(i+1)
                    res[key] = val
                    i = skipSpace(i)
                    if s[i] == '}':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected comma or closing curly brace', i)
                    i = skipSpace(i+1)
            def parseArray(i):
                res = []
                i = skipSpace(i+1)
                if s[i] == ']': # Empty array
                    return (i+1,res)
                while True:
                    i,val = parse(i)
                    res.append(val)
                    i = skipSpace(i) # Raise exception if premature end
                    if s[i] == ']':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected a comma or closing bracket', i)
                    i = skipSpace(i+1)
            def parseDiscrete(i):
                for k,v in {'true': True, 'false': False, 'null': None}.items():
                    if s.startswith(k, i):
                        return (i+len(k), v)
                raiseError('Not a boolean (or null)', i)
            def parseNumber(i):
                mobj = re.match(r'^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                if mobj is None:
                    raiseError('Not a number', i)
                nums = mobj.group(1)
                if '.' in nums or 'e' in nums or 'E' in nums:
                    return (i+len(nums), float(nums))
                return (i+len(nums), int(nums))
            # Dispatch on the first character of a value; numbers are the default
            CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
            def parse(i):
                i = skipSpace(i)
                i,res = CHARMAP.get(s[i], parseNumber)(i)
                i = skipSpace(i, False)
                return (i,res)
            i,res = parse(0)
            if i < len(s):
                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
            return res
195
def preferredencoding():
    """Get preferred encoding.

    Returns the encoding reported by locale.getpreferredencoding() when it
    can actually be used to encode text, and 'UTF-8' otherwise.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec: the locale may name an encoding Python cannot use.
        u'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'
    return pref
211
212
213 def htmlentity_transform(matchobj):
214 """Transforms an HTML entity to a Unicode character.
215
216 This function receives a match object and is intended to be used with
217 the re.sub() function.
218 """
219 entity = matchobj.group(1)
220
221 # Known non-numeric HTML entity
222 if entity in htmlentitydefs.name2codepoint:
223 return unichr(htmlentitydefs.name2codepoint[entity])
224
225 # Unicode character
226 mobj = re.match(ur'(?u)#(x?\d+)', entity)
227 if mobj is not None:
228 numstr = mobj.group(1)
229 if numstr.startswith(u'x'):
230 base = 16
231 numstr = u'0%s' % numstr
232 else:
233 base = 10
234 return unichr(long(numstr, base))
235
236 # Unknown entity in name, return its literal representation
237 return (u'&%s;' % entity)
238
239
240 def sanitize_title(utitle):
241 """Sanitizes a video title so it could be used as part of a filename."""
242 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
243 return utitle.replace(unicode(os.sep), u'%')
244
245
246 def sanitize_open(filename, open_mode):
247 """Try to open the given filename, and slightly tweak it if this fails.
248
249 Attempts to open the given filename. If this fails, it tries to change
250 the filename slightly, step by step, until it's either able to open it
251 or it fails and raises a final exception, like the standard open()
252 function.
253
254 It returns the tuple (stream, definitive_file_name).
255 """
256 try:
257 if filename == u'-':
258 if sys.platform == 'win32':
259 import msvcrt
260 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
261 return (sys.stdout, filename)
262 stream = open(filename, open_mode)
263 return (stream, filename)
264 except (IOError, OSError), err:
265 # In case of error, try to remove win32 forbidden chars
266 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
267
268 # An exception here should be caught in the caller
269 stream = open(filename, open_mode)
270 return (stream, filename)
271
272
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
280
281 def _simplify_title(title):
282 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
283 return expr.sub(u'_', title).strip(u'_')
284
285 def _orderedSet(iterable):
286 """ Remove all duplicates from the input iterable """
287 res = []
288 for el in iterable:
289 if el not in res:
290 res.append(el)
291 return res
292
293 def _unescapeHTML(s):
294 """
295 @param s a string (of type unicode)
296 """
297 assert type(s) == type(u'')
298
299 htmlParser = HTMLParser.HTMLParser()
300 return htmlParser.unescape(s)
301
class DownloadError(Exception):
    """Download Error exception.

    Raised by FileDownloader objects when a problem is reported and they
    are not configured to ignore download errors. The exception carries
    the corresponding error message.
    """
310
311
class SameFileError(Exception):
    """Same File exception.

    Raised by FileDownloader objects when they detect that several
    downloads would end up in the same file on disk.
    """
319
320
class PostProcessingError(Exception):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this exception to signal
    that the postprocessing task failed.
    """
328
class MaxDownloadsReached(Exception):
    """ Signals that the --max-downloads limit has been reached. """
332
333
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Raised when a video is requested in a format that is not available
    for that video.
    """
341
342
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.

    Attributes:
    downloaded: Number of bytes actually received.
    expected:   Number of bytes the server announced.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Initialise the base class so str(err) carries a usable message
        Exception.__init__(self, 'content too short (expected %s bytes and served %s)' % (expected, downloaded))
        self.downloaded = downloaded
        self.expected = expected
357
358
359 class YoutubeDLHandler(urllib2.HTTPHandler):
360 """Handler for HTTP requests and responses.
361
362 This class, when installed with an OpenerDirector, automatically adds
363 the standard headers to every HTTP request and handles gzipped and
364 deflated responses from web servers. If compression is to be avoided in
365 a particular request, the original request in the program code only has
366 to include the HTTP header "Youtubedl-No-Compression", which will be
367 removed before making the real request.
368
369 Part of this code was copied from:
370
371 http://techknack.net/python-urllib2-handlers/
372
373 Andrew Rowls, the author of that code, agreed to release it to the
374 public domain.
375 """
376
377 @staticmethod
378 def deflate(data):
379 try:
380 return zlib.decompress(data, -zlib.MAX_WBITS)
381 except zlib.error:
382 return zlib.decompress(data)
383
384 @staticmethod
385 def addinfourl_wrapper(stream, headers, url, code):
386 if hasattr(urllib2.addinfourl, 'getcode'):
387 return urllib2.addinfourl(stream, headers, url, code)
388 ret = urllib2.addinfourl(stream, headers, url)
389 ret.code = code
390 return ret
391
392 def http_request(self, req):
393 for h in std_headers:
394 if h in req.headers:
395 del req.headers[h]
396 req.add_header(h, std_headers[h])
397 if 'Youtubedl-no-compression' in req.headers:
398 if 'Accept-encoding' in req.headers:
399 del req.headers['Accept-encoding']
400 del req.headers['Youtubedl-no-compression']
401 return req
402
403 def http_response(self, req, resp):
404 old_resp = resp
405 # gzip
406 if resp.headers.get('Content-encoding', '') == 'gzip':
407 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
408 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
409 resp.msg = old_resp.msg
410 # deflate
411 if resp.headers.get('Content-encoding', '') == 'deflate':
412 gz = StringIO.StringIO(self.deflate(resp.read()))
413 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
414 resp.msg = old_resp.msg
415 return resp
416
417
418 class FileDownloader(object):
419 """File Downloader class.
420
421 File downloader objects are the ones responsible of downloading the
422 actual video file and writing it to disk if the user has requested
423 it, among some other tasks. In most cases there should be one per
424 program. As, given a video URL, the downloader doesn't know how to
425 extract all the needed information, task that InfoExtractors do, it
426 has to pass the URL to one of them.
427
428 For this, file downloader objects have a method that allows
429 InfoExtractors to be registered in a given order. When it is passed
430 a URL, the file downloader handles it to the first InfoExtractor it
431 finds that reports being able to handle it. The InfoExtractor extracts
432 all the information about the video or videos the URL refers to, and
433 asks the FileDownloader to process the video information, possibly
434 downloading the video.
435
436 File downloaders accept a lot of parameters. In order not to saturate
437 the object constructor with arguments, it receives a dictionary of
438 options instead. These options are available through the params
439 attribute for the InfoExtractors to use. The FileDownloader also
440 registers itself as the downloader in charge for the InfoExtractors
441 that are added to it, so this is a "mutual registration".
442
443 Available options:
444
445 username: Username for authentication purposes.
446 password: Password for authentication purposes.
447 usenetrc: Use netrc for authentication instead.
448 quiet: Do not print messages to stdout.
449 forceurl: Force printing final URL.
450 forcetitle: Force printing title.
451 forcethumbnail: Force printing thumbnail URL.
452 forcedescription: Force printing description.
453 forcefilename: Force printing final filename.
454 simulate: Do not download the video files.
455 format: Video format code.
456 format_limit: Highest quality format to try.
457 outtmpl: Template for output names.
458 ignoreerrors: Do not stop on download errors.
459 ratelimit: Download speed limit, in bytes/sec.
460 nooverwrites: Prevent overwriting files.
461 retries: Number of times to retry for HTTP error 5xx
462 continuedl: Try to continue downloads if possible.
463 noprogress: Do not print the progress bar.
464 playliststart: Playlist item to start at.
465 playlistend: Playlist item to end at.
466 matchtitle: Download only matching titles.
467 rejecttitle: Reject downloads for matching titles.
468 logtostderr: Log messages to stderr instead of stdout.
469 consoletitle: Display progress in console window's titlebar.
470 nopart: Do not use temporary .part files.
471 updatetime: Use the Last-modified header to set output file timestamps.
472 writedescription: Write the video description to a .description file
473 writeinfojson: Write the video description to a .info.json file
474 """
475
476 params = None
477 _ies = []
478 _pps = []
479 _download_retcode = None
480 _num_downloads = None
481 _screen_file = None
482
483 def __init__(self, params):
484 """Create a FileDownloader object with the given options."""
485 self._ies = []
486 self._pps = []
487 self._download_retcode = 0
488 self._num_downloads = 0
489 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
490 self.params = params
491
492 @staticmethod
493 def format_bytes(bytes):
494 if bytes is None:
495 return 'N/A'
496 if type(bytes) is str:
497 bytes = float(bytes)
498 if bytes == 0.0:
499 exponent = 0
500 else:
501 exponent = long(math.log(bytes, 1024.0))
502 suffix = 'bkMGTPEZY'[exponent]
503 converted = float(bytes) / float(1024 ** exponent)
504 return '%.2f%s' % (converted, suffix)
505
506 @staticmethod
507 def calc_percent(byte_counter, data_len):
508 if data_len is None:
509 return '---.-%'
510 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
511
512 @staticmethod
513 def calc_eta(start, now, total, current):
514 if total is None:
515 return '--:--'
516 dif = now - start
517 if current == 0 or dif < 0.001: # One millisecond
518 return '--:--'
519 rate = float(current) / dif
520 eta = long((float(total) - float(current)) / rate)
521 (eta_mins, eta_secs) = divmod(eta, 60)
522 if eta_mins > 99:
523 return '--:--'
524 return '%02d:%02d' % (eta_mins, eta_secs)
525
526 @staticmethod
527 def calc_speed(start, now, bytes):
528 dif = now - start
529 if bytes == 0 or dif < 0.001: # One millisecond
530 return '%10s' % '---b/s'
531 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
532
533 @staticmethod
534 def best_block_size(elapsed_time, bytes):
535 new_min = max(bytes / 2.0, 1.0)
536 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
537 if elapsed_time < 0.001:
538 return long(new_max)
539 rate = bytes / elapsed_time
540 if rate > new_max:
541 return long(new_max)
542 if rate < new_min:
543 return long(new_min)
544 return long(rate)
545
546 @staticmethod
547 def parse_bytes(bytestr):
548 """Parse a string indicating a byte quantity into a long integer."""
549 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
550 if matchobj is None:
551 return None
552 number = float(matchobj.group(1))
553 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
554 return long(round(number * multiplier))
555
556 def add_info_extractor(self, ie):
557 """Add an InfoExtractor object to the end of the list."""
558 self._ies.append(ie)
559 ie.set_downloader(self)
560
561 def add_post_processor(self, pp):
562 """Add a PostProcessor object to the end of the chain."""
563 self._pps.append(pp)
564 pp.set_downloader(self)
565
def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
    """Print message to the screen file if not in quiet mode.

    skip_eol suppresses the trailing newline. With ignore_encoding_errors
    set, a message that cannot be encoded is silently dropped instead of
    raising UnicodeEncodeError.
    """
    try:
        if self.params.get('quiet', False):
            return
        terminator = [u'\n', u''][skip_eol]
        output = (u'%s%s' % (message, terminator)).encode(preferredencoding())
        # The trailing comma keeps print from adding its own newline
        print >>self._screen_file, output,
        self._screen_file.flush()
    except UnicodeEncodeError:
        if not ignore_encoding_errors:
            raise
576
def to_stderr(self, message):
    """Print message, encoded with the preferred encoding, to stderr."""
    encoded = message.encode(preferredencoding())
    print >>sys.stderr, encoded
580
def to_cons_title(self, message):
    """Set console/terminal window title to message.

    Does nothing unless the 'consoletitle' option is enabled.
    """
    if self.params.get('consoletitle', False):
        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
            # c_wchar_p() might not be necessary if `message` is
            # already of type unicode()
            title = ctypes.c_wchar_p(message)
            ctypes.windll.kernel32.SetConsoleTitleW(title)
        elif 'TERM' in os.environ:
            # Escape sequence written to the terminal to set its title
            sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
591
592 def fixed_template(self):
593 """Checks if the output template is fixed."""
594 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
595
596 def trouble(self, message=None):
597 """Determine action to take when a download problem appears.
598
599 Depending on if the downloader has been configured to ignore
600 download errors or not, this method may throw an exception or
601 not when errors are found, after printing the message.
602 """
603 if message is not None:
604 self.to_stderr(message)
605 if not self.params.get('ignoreerrors', False):
606 raise DownloadError(message)
607 self._download_retcode = 1
608
609 def slow_down(self, start_time, byte_counter):
610 """Sleep if the download speed is over the rate limit."""
611 rate_limit = self.params.get('ratelimit', None)
612 if rate_limit is None or byte_counter == 0:
613 return
614 now = time.time()
615 elapsed = now - start_time
616 if elapsed <= 0.0:
617 return
618 speed = float(byte_counter) / elapsed
619 if speed > rate_limit:
620 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
621
622 def temp_name(self, filename):
623 """Returns a temporary filename for the given filename."""
624 if self.params.get('nopart', False) or filename == u'-' or \
625 (os.path.exists(filename) and not os.path.isfile(filename)):
626 return filename
627 return filename + u'.part'
628
def undo_temp_name(self, filename):
    """Strip a trailing u'.part' from filename, if present."""
    suffix = u'.part'
    if not filename.endswith(suffix):
        return filename
    return filename[:-len(suffix)]
633
634 def try_rename(self, old_filename, new_filename):
635 try:
636 if old_filename == new_filename:
637 return
638 os.rename(old_filename, new_filename)
639 except (IOError, OSError), err:
640 self.trouble(u'ERROR: unable to rename file')
641
642 def try_utime(self, filename, last_modified_hdr):
643 """Try to set the last-modified time of the given file."""
644 if last_modified_hdr is None:
645 return
646 if not os.path.isfile(filename):
647 return
648 timestr = last_modified_hdr
649 if timestr is None:
650 return
651 filetime = timeconvert(timestr)
652 if filetime is None:
653 return filetime
654 try:
655 os.utime(filename, (time.time(), filetime))
656 except:
657 pass
658 return filetime
659
def report_writedescription(self, descfn):
    """ Report that the description file is being written """
    message = u'[info] Writing video description to: %s' % descfn
    self.to_screen(message, ignore_encoding_errors=True)
663
def report_writeinfojson(self, infofn):
    """ Report the name of the JSON metadata file """
    message = u'[info] Video description metadata as JSON to: %s' % infofn
    self.to_screen(message, ignore_encoding_errors=True)
667
def report_destination(self, filename):
    """Report destination filename."""
    message = u'[download] Destination: %s' % filename
    self.to_screen(message, ignore_encoding_errors=True)
671
def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
    """Report download progress on the screen and in the console title.

    Nothing is reported when the 'noprogress' option is set.
    """
    if self.params.get('noprogress', False):
        return
    fields = (percent_str, data_len_str, speed_str, eta_str)
    # The leading \r rewrites the same screen line on every update
    self.to_screen(u'\r[download] %s of %s at %s ETA %s' % fields, skip_eol=True)
    stripped = (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip())
    self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' % stripped)
680
def report_resuming_byte(self, resume_len):
    """Report attempt to resume at given byte."""
    message = u'[download] Resuming download at byte %s' % resume_len
    self.to_screen(message)
684
def report_retry(self, count, retries):
    """Report retry in case of HTTP error 5xx"""
    attempt = (count, retries)
    self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % attempt)
688
def report_file_already_downloaded(self, file_name):
    """Report file has already been fully downloaded.

    Falls back to a message without the file name when the name cannot
    be encoded for output.
    """
    try:
        self.to_screen(u'[download] %s has already been downloaded' % file_name)
    except UnicodeEncodeError:
        self.to_screen(u'[download] The file has already been downloaded')
695
def report_unable_to_resume(self):
    """Report it was impossible to resume download."""
    message = u'[download] Unable to resume'
    self.to_screen(message)
699
def report_finish(self):
    """Report download finished.

    With the progress bar disabled a completion message is printed;
    otherwise an empty message is printed.
    """
    if self.params.get('noprogress', False):
        message = u'[download] Download completed'
    else:
        message = u''
    self.to_screen(message)
706
707 def increment_downloads(self):
708 """Increment the ordinal that assigns a number to each file."""
709 self._num_downloads += 1
710
711 def prepare_filename(self, info_dict):
712 """Generate the output filename."""
713 try:
714 template_dict = dict(info_dict)
715 template_dict['epoch'] = unicode(long(time.time()))
716 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
717 filename = self.params['outtmpl'] % template_dict
718 return filename
719 except (ValueError, KeyError), err:
720 self.trouble(u'ERROR: invalid system charset or erroneous output template')
721 return None
722
723 def _match_entry(self, info_dict):
724 """ Returns None iff the file should be downloaded """
725
726 title = info_dict['title']
727 matchtitle = self.params.get('matchtitle', False)
728 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
729 return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
730 rejecttitle = self.params.get('rejecttitle', False)
731 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
732 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
733 return None
734
def process_info(self, info_dict):
    """Process a single dictionary returned by an InfoExtractor.

    Steps, in order: filter by title, enforce the 'max_downloads' limit,
    build the output filename, do the forced printings, and stop there in
    simulate mode. Otherwise create the target directory, optionally
    write the .description and .info.json files, download the video
    (unless 'skip_download' is set) and run the postprocessing chain.

    Raises MaxDownloadsReached when the limit has been exceeded and
    UnavailableVideoError when the download fails with OSError/IOError.
    Other problems are reported through self.trouble(), which may raise
    DownloadError.
    """

    # Skip titles filtered out by matchtitle / rejecttitle
    reason = self._match_entry(info_dict)
    if reason is not None:
        self.to_screen(u'[download] ' + reason)
        return

    max_downloads = self.params.get('max_downloads')
    if max_downloads is not None:
        if self._num_downloads > int(max_downloads):
            raise MaxDownloadsReached()

    # May be None if the template could not be filled
    filename = self.prepare_filename(info_dict)

    # Forced printings
    if self.params.get('forcetitle', False):
        print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
    if self.params.get('forceurl', False):
        print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
    if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
        print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
    if self.params.get('forcedescription', False) and 'description' in info_dict:
        print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
    if self.params.get('forcefilename', False) and filename is not None:
        print filename.encode(preferredencoding(), 'xmlcharrefreplace')
    if self.params.get('forceformat', False):
        print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

    # Do nothing else if in simulate mode
    if self.params.get('simulate', False):
        return

    if filename is None:
        return

    # Make sure the directory of the output file exists
    try:
        dn = os.path.dirname(filename)
        if dn != '' and not os.path.exists(dn):
            os.makedirs(dn)
    except (OSError, IOError), err:
        self.trouble(u'ERROR: unable to create directory ' + unicode(err))
        return

    if self.params.get('writedescription', False):
        try:
            descfn = filename + '.description'
            self.report_writedescription(descfn)
            descfile = open(descfn, 'wb')
            try:
                # NOTE(review): raises KeyError if the extractor supplied no
                # 'description'; that is not caught here - confirm intended
                descfile.write(info_dict['description'].encode('utf-8'))
            finally:
                descfile.close()
        except (OSError, IOError):
            self.trouble(u'ERROR: Cannot write description file ' + descfn)
            return

    if self.params.get('writeinfojson', False):
        infofn = filename + '.info.json'
        self.report_writeinfojson(infofn)
        # Probe for an encoder: the fallback json class only offers loads()
        try:
            json.dump
        except (NameError,AttributeError):
            self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
            return
        try:
            infof = open(infofn, 'wb')
            try:
                # The 'urlhandle' entry is left out of the dumped metadata
                json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
                json.dump(json_info_dict, infof)
            finally:
                infof.close()
        except (OSError, IOError):
            self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
            return

    if not self.params.get('skip_download', False):
        if self.params.get('nooverwrites', False) and os.path.exists(filename):
            # Existing file is kept and treated as a successful download
            success = True
        else:
            try:
                success = self._do_download(filename, info_dict)
            # NOTE(review): in Python 2 urllib2.URLError derives from IOError,
            # so this first handler presumably also intercepts the URLError
            # listed in the next one - confirm before reordering
            except (OSError, IOError), err:
                raise UnavailableVideoError
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self.trouble(u'ERROR: unable to download video data: %s' % str(err))
                return
            except (ContentTooShortError, ), err:
                self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                return

        if success:
            try:
                self.post_process(filename, info_dict)
            except (PostProcessingError), err:
                self.trouble(u'ERROR: postprocessing: %s' % str(err))
                return
832
833 def download(self, url_list):
834 """Download a given list of URLs."""
835 if len(url_list) > 1 and self.fixed_template():
836 raise SameFileError(self.params['outtmpl'])
837
838 for url in url_list:
839 suitable_found = False
840 for ie in self._ies:
841 # Go to next InfoExtractor if not suitable
842 if not ie.suitable(url):
843 continue
844
845 # Suitable InfoExtractor found
846 suitable_found = True
847
848 # Extract information from URL and process it
849 ie.extract(url)
850
851 # Suitable InfoExtractor had been found; go to next URL
852 break
853
854 if not suitable_found:
855 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
856
857 return self._download_retcode
858
859 def post_process(self, filename, ie_info):
860 """Run the postprocessing chain on the given file."""
861 info = dict(ie_info)
862 info['filepath'] = filename
863 for pp in self._pps:
864 info = pp.run(info)
865 if info is None:
866 break
867
def _download_with_rtmpdump(self, filename, url, player_url):
    """Download url into filename with the external rtmpdump program.

    player_url, when not None, is passed to rtmpdump with -W. Returns
    True once the data has been downloaded and renamed to filename, and
    False after reporting a problem through trouble().
    """
    self.report_destination(filename)
    tmpfilename = self.temp_name(filename)

    # Check for rtmpdump first
    try:
        devnull = open(os.path.devnull, 'w')
        try:
            subprocess.call(['rtmpdump', '-h'], stdout=devnull, stderr=subprocess.STDOUT)
        finally:
            # Do not leak the handle used to silence the probe
            devnull.close()
    except (OSError, IOError):
        self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
        return False

    # Download using rtmpdump. rtmpdump returns exit code 2 when
    # the connection was interrupted and resuming appears to be
    # possible. This is part of rtmpdump's normal usage, AFAIK.
    basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
    retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
    while retval == 2 or retval == 1:
        prevsize = os.path.getsize(tmpfilename)
        self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
        time.sleep(5.0) # This seems to be needed
        retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
        cursize = os.path.getsize(tmpfilename)
        # No progress after an exit code of 1: give up
        if prevsize == cursize and retval == 1:
            break
        # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
        if prevsize == cursize and retval == 2 and cursize > 1024:
            self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
            retval = 0
            break
    if retval == 0:
        self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename))
        self.try_rename(tmpfilename, filename)
        return True
    else:
        self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
        return False
904
905 def _do_download(self, filename, info_dict):
906 url = info_dict['url']
907 player_url = info_dict.get('player_url', None)
908
909 # Check file already present
910 if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False):
911 self.report_file_already_downloaded(filename)
912 return True
913
914 # Attempt to download using rtmpdump
915 if url.startswith('rtmp'):
916 return self._download_with_rtmpdump(filename, url, player_url)
917
918 tmpfilename = self.temp_name(filename)
919 stream = None
920
921 # Do not include the Accept-Encoding header
922 headers = {'Youtubedl-no-compression': 'True'}
923 basic_request = urllib2.Request(url, None, headers)
924 request = urllib2.Request(url, None, headers)
925
926 # Establish possible resume length
927 if os.path.isfile(tmpfilename):
928 resume_len = os.path.getsize(tmpfilename)
929 else:
930 resume_len = 0
931
932 open_mode = 'wb'
933 if resume_len != 0:
934 if self.params.get('continuedl', False):
935 self.report_resuming_byte(resume_len)
936 request.add_header('Range','bytes=%d-' % resume_len)
937 open_mode = 'ab'
938 else:
939 resume_len = 0
940
941 count = 0
942 retries = self.params.get('retries', 0)
943 while count <= retries:
944 # Establish connection
945 try:
946 if count == 0 and 'urlhandle' in info_dict:
947 data = info_dict['urlhandle']
948 data = urllib2.urlopen(request)
949 break
950 except (urllib2.HTTPError, ), err:
951 if (err.code < 500 or err.code >= 600) and err.code != 416:
952 # Unexpected HTTP error
953 raise
954 elif err.code == 416:
955 # Unable to resume (requested range not satisfiable)
956 try:
957 # Open the connection again without the range header
958 data = urllib2.urlopen(basic_request)
959 content_length = data.info()['Content-Length']
960 except (urllib2.HTTPError, ), err:
961 if err.code < 500 or err.code >= 600:
962 raise
963 else:
964 # Examine the reported length
965 if (content_length is not None and
966 (resume_len - 100 < long(content_length) < resume_len + 100)):
967 # The file had already been fully downloaded.
968 # Explanation to the above condition: in issue #175 it was revealed that
969 # YouTube sometimes adds or removes a few bytes from the end of the file,
970 # changing the file size slightly and causing problems for some users. So
971 # I decided to implement a suggested change and consider the file
972 # completely downloaded if the file size differs less than 100 bytes from
973 # the one in the hard drive.
974 self.report_file_already_downloaded(filename)
975 self.try_rename(tmpfilename, filename)
976 return True
977 else:
978 # The length does not match, we start the download over
979 self.report_unable_to_resume()
980 open_mode = 'wb'
981 break
982 # Retry
983 count += 1
984 if count <= retries:
985 self.report_retry(count, retries)
986
987 if count > retries:
988 self.trouble(u'ERROR: giving up after %s retries' % retries)
989 return False
990
991 data_len = data.info().get('Content-length', None)
992 if data_len is not None:
993 data_len = long(data_len) + resume_len
994 data_len_str = self.format_bytes(data_len)
995 byte_counter = 0 + resume_len
996 block_size = 1024
997 start = time.time()
998 while True:
999 # Download and write
1000 before = time.time()
1001 data_block = data.read(block_size)
1002 after = time.time()
1003 if len(data_block) == 0:
1004 break
1005 byte_counter += len(data_block)
1006
1007 # Open file just in time
1008 if stream is None:
1009 try:
1010 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1011 assert stream is not None
1012 filename = self.undo_temp_name(tmpfilename)
1013 self.report_destination(filename)
1014 except (OSError, IOError), err:
1015 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1016 return False
1017 try:
1018 stream.write(data_block)
1019 except (IOError, OSError), err:
1020 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1021 return False
1022 block_size = self.best_block_size(after - before, len(data_block))
1023
1024 # Progress message
1025 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1026 if data_len is None:
1027 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1028 else:
1029 percent_str = self.calc_percent(byte_counter, data_len)
1030 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1031 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1032
1033 # Apply rate limit
1034 self.slow_down(start, byte_counter - resume_len)
1035
1036 if stream is None:
1037 self.trouble(u'\nERROR: Did not get any data blocks')
1038 return False
1039 stream.close()
1040 self.report_finish()
1041 if data_len is not None and byte_counter != data_len:
1042 raise ContentTooShortError(byte_counter, long(data_len))
1043 self.try_rename(tmpfilename, filename)
1044
1045 # Update file modification time
1046 if self.params.get('updatetime', True):
1047 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1048
1049 return True
1050
1051
class InfoExtractor(object):
    """Base class of all information extractors (IEs).

    An information extractor takes a URL and works out the data of the
    video (or videos) behind it: the real media URL, the title and its
    simplified form, the uploader and so on. That data is packed into a
    dictionary and handed to the FileDownloader, which acts on it, for
    instance by saving the video to disk. Every such dictionary has to
    provide these fields:

    id:         Video identifier.
    url:        Final video URL.
    uploader:   Nickname of the video uploader.
    title:      Literal title.
    stitle:     Simplified title.
    ext:        Video filename extension.
    format:     Video format.
    player_url: SWF Player URL (may be None).

    The fields below are optional. They mainly exist so youtube-dl can be
    the backend of a video search function such as the one in youtube2mp3,
    and are only read when the matching forced printing functions are
    called:

    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.

    A subclass is expected to override _real_initialize() and
    _real_extract() and to provide a _VALID_URL regexp. It most likely
    also has to be added to the list of extractors.
    """

    # Class-level defaults; __init__() sets both on every instance.
    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Create the extractor, optionally attaching a downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Return True when url matches this IE's _VALID_URL pattern."""
        matched = re.match(self._VALID_URL, url)
        return matched is not None

    def initialize(self):
        """Run the one-time setup (authentication, etc) if still pending."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Make sure the IE is initialized, then extract the URL's info."""
        self.initialize()
        extracted = self._real_extract(url)
        return extracted

    def set_downloader(self, downloader):
        """Attach the downloader this IE reports to."""
        self._downloader = downloader

    def _real_initialize(self):
        """Actual setup work. Meant to be overridden by subclasses."""
        pass

    def _real_extract(self, url):
        """Actual extraction work. Meant to be overridden by subclasses."""
        pass
1120
1121
1122 class YoutubeIE(InfoExtractor):
1123 """Information extractor for youtube.com."""
1124
1125 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1126 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1127 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1128 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1129 _NETRC_MACHINE = 'youtube'
1130 # Listed in order of quality
1131 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1132 _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
1133 _video_extensions = {
1134 '13': '3gp',
1135 '17': 'mp4',
1136 '18': 'mp4',
1137 '22': 'mp4',
1138 '37': 'mp4',
1139 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1140 '43': 'webm',
1141 '44': 'webm',
1142 '45': 'webm',
1143 }
1144 _video_dimensions = {
1145 '5': '240x400',
1146 '6': '???',
1147 '13': '???',
1148 '17': '144x176',
1149 '18': '360x640',
1150 '22': '720x1280',
1151 '34': '360x640',
1152 '35': '480x854',
1153 '37': '1080x1920',
1154 '38': '3072x4096',
1155 '43': '360x640',
1156 '44': '480x854',
1157 '45': '720x1280',
1158 }
1159 IE_NAME = u'youtube'
1160
1161 def report_lang(self):
1162 """Report attempt to set language."""
1163 self._downloader.to_screen(u'[youtube] Setting language')
1164
1165 def report_login(self):
1166 """Report attempt to log in."""
1167 self._downloader.to_screen(u'[youtube] Logging in')
1168
1169 def report_age_confirmation(self):
1170 """Report attempt to confirm age."""
1171 self._downloader.to_screen(u'[youtube] Confirming age')
1172
1173 def report_video_webpage_download(self, video_id):
1174 """Report attempt to download video webpage."""
1175 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1176
1177 def report_video_info_webpage_download(self, video_id):
1178 """Report attempt to download video info webpage."""
1179 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1180
1181 def report_information_extraction(self, video_id):
1182 """Report attempt to extract video information."""
1183 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1184
1185 def report_unavailable_format(self, video_id, format):
1186 """Report extracted video URL."""
1187 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1188
1189 def report_rtmp_download(self):
1190 """Indicate the download will use the RTMP protocol."""
1191 self._downloader.to_screen(u'[youtube] RTMP download detected')
1192
1193 def _print_formats(self, formats):
1194 print 'Available formats:'
1195 for x in formats:
1196 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1197
1198 def _real_initialize(self):
1199 if self._downloader is None:
1200 return
1201
1202 username = None
1203 password = None
1204 downloader_params = self._downloader.params
1205
1206 # Attempt to use provided username and password or .netrc data
1207 if downloader_params.get('username', None) is not None:
1208 username = downloader_params['username']
1209 password = downloader_params['password']
1210 elif downloader_params.get('usenetrc', False):
1211 try:
1212 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1213 if info is not None:
1214 username = info[0]
1215 password = info[2]
1216 else:
1217 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1218 except (IOError, netrc.NetrcParseError), err:
1219 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1220 return
1221
1222 # Set language
1223 request = urllib2.Request(self._LANG_URL)
1224 try:
1225 self.report_lang()
1226 urllib2.urlopen(request).read()
1227 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1228 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1229 return
1230
1231 # No authentication to be performed
1232 if username is None:
1233 return
1234
1235 # Log in
1236 login_form = {
1237 'current_form': 'loginForm',
1238 'next': '/',
1239 'action_login': 'Log In',
1240 'username': username,
1241 'password': password,
1242 }
1243 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1244 try:
1245 self.report_login()
1246 login_results = urllib2.urlopen(request).read()
1247 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1248 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1249 return
1250 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1251 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1252 return
1253
1254 # Confirm age
1255 age_form = {
1256 'next_url': '/',
1257 'action_confirm': 'Confirm',
1258 }
1259 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1260 try:
1261 self.report_age_confirmation()
1262 age_results = urllib2.urlopen(request).read()
1263 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1264 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1265 return
1266
1267 def _real_extract(self, url):
1268 # Extract video id from URL
1269 mobj = re.match(self._VALID_URL, url)
1270 if mobj is None:
1271 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1272 return
1273 video_id = mobj.group(2)
1274
1275 # Get video webpage
1276 self.report_video_webpage_download(video_id)
1277 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1278 try:
1279 video_webpage = urllib2.urlopen(request).read()
1280 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1281 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1282 return
1283
1284 # Attempt to extract SWF player URL
1285 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1286 if mobj is not None:
1287 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1288 else:
1289 player_url = None
1290
1291 # Get video info
1292 self.report_video_info_webpage_download(video_id)
1293 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1294 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1295 % (video_id, el_type))
1296 request = urllib2.Request(video_info_url)
1297 try:
1298 video_info_webpage = urllib2.urlopen(request).read()
1299 video_info = parse_qs(video_info_webpage)
1300 if 'token' in video_info:
1301 break
1302 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1303 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1304 return
1305 if 'token' not in video_info:
1306 if 'reason' in video_info:
1307 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1308 else:
1309 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1310 return
1311
1312 # Start extracting information
1313 self.report_information_extraction(video_id)
1314
1315 # uploader
1316 if 'author' not in video_info:
1317 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1318 return
1319 video_uploader = urllib.unquote_plus(video_info['author'][0])
1320
1321 # title
1322 if 'title' not in video_info:
1323 self._downloader.trouble(u'ERROR: unable to extract video title')
1324 return
1325 video_title = urllib.unquote_plus(video_info['title'][0])
1326 video_title = video_title.decode('utf-8')
1327 video_title = sanitize_title(video_title)
1328
1329 # simplified title
1330 simple_title = _simplify_title(video_title)
1331
1332 # thumbnail image
1333 if 'thumbnail_url' not in video_info:
1334 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1335 video_thumbnail = ''
1336 else: # don't panic if we can't find it
1337 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1338
1339 # upload date
1340 upload_date = u'NA'
1341 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1342 if mobj is not None:
1343 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1344 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1345 for expression in format_expressions:
1346 try:
1347 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1348 except:
1349 pass
1350
1351 # description
1352 try:
1353 lxml.etree
1354 except NameError:
1355 video_description = u'No description available.'
1356 if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
1357 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
1358 if mobj is not None:
1359 video_description = mobj.group(1).decode('utf-8')
1360 else:
1361 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1362 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1363 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1364 # TODO use another parser
1365
1366 # token
1367 video_token = urllib.unquote_plus(video_info['token'][0])
1368
1369 # Decide which formats to download
1370 req_format = self._downloader.params.get('format', None)
1371
1372 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1373 self.report_rtmp_download()
1374 video_url_list = [(None, video_info['conn'][0])]
1375 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1376 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1377 url_data = [parse_qs(uds) for uds in url_data_strs]
1378 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1379 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1380
1381 format_limit = self._downloader.params.get('format_limit', None)
1382 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1383 if format_limit is not None and format_limit in available_formats:
1384 format_list = available_formats[available_formats.index(format_limit):]
1385 else:
1386 format_list = available_formats
1387 existing_formats = [x for x in format_list if x in url_map]
1388 if len(existing_formats) == 0:
1389 self._downloader.trouble(u'ERROR: no known formats available for video')
1390 return
1391 if self._downloader.params.get('listformats', None):
1392 self._print_formats(existing_formats)
1393 return
1394 if req_format is None or req_format == 'best':
1395 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1396 elif req_format == 'worst':
1397 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1398 elif req_format in ('-1', 'all'):
1399 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1400 else:
1401 # Specific formats. We pick the first in a slash-delimeted sequence.
1402 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1403 req_formats = req_format.split('/')
1404 video_url_list = None
1405 for rf in req_formats:
1406 if rf in url_map:
1407 video_url_list = [(rf, url_map[rf])]
1408 break
1409 if video_url_list is None:
1410 self._downloader.trouble(u'ERROR: requested format not available')
1411 return
1412 else:
1413 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1414 return
1415
1416 for format_param, video_real_url in video_url_list:
1417 # At this point we have a new video
1418 self._downloader.increment_downloads()
1419
1420 # Extension
1421 video_extension = self._video_extensions.get(format_param, 'flv')
1422
1423 try:
1424 # Process video information
1425 self._downloader.process_info({
1426 'id': video_id.decode('utf-8'),
1427 'url': video_real_url.decode('utf-8'),
1428 'uploader': video_uploader.decode('utf-8'),
1429 'upload_date': upload_date,
1430 'title': video_title,
1431 'stitle': simple_title,
1432 'ext': video_extension.decode('utf-8'),
1433 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1434 'thumbnail': video_thumbnail.decode('utf-8'),
1435 'description': video_description,
1436 'player_url': player_url,
1437 })
1438 except UnavailableVideoError, err:
1439 self._downloader.trouble(u'\nERROR: unable to download video')
1440
1441
1442 class MetacafeIE(InfoExtractor):
1443 """Information Extractor for metacafe.com."""
1444
1445 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1446 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1447 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1448 _youtube_ie = None
1449 IE_NAME = u'metacafe'
1450
1451 def __init__(self, youtube_ie, downloader=None):
1452 InfoExtractor.__init__(self, downloader)
1453 self._youtube_ie = youtube_ie
1454
1455 def report_disclaimer(self):
1456 """Report disclaimer retrieval."""
1457 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1458
1459 def report_age_confirmation(self):
1460 """Report attempt to confirm age."""
1461 self._downloader.to_screen(u'[metacafe] Confirming age')
1462
1463 def report_download_webpage(self, video_id):
1464 """Report webpage download."""
1465 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1466
1467 def report_extraction(self, video_id):
1468 """Report information extraction."""
1469 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1470
1471 def _real_initialize(self):
1472 # Retrieve disclaimer
1473 request = urllib2.Request(self._DISCLAIMER)
1474 try:
1475 self.report_disclaimer()
1476 disclaimer = urllib2.urlopen(request).read()
1477 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1478 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1479 return
1480
1481 # Confirm age
1482 disclaimer_form = {
1483 'filters': '0',
1484 'submit': "Continue - I'm over 18",
1485 }
1486 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1487 try:
1488 self.report_age_confirmation()
1489 disclaimer = urllib2.urlopen(request).read()
1490 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1491 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1492 return
1493
1494 def _real_extract(self, url):
1495 # Extract id and simplified title from URL
1496 mobj = re.match(self._VALID_URL, url)
1497 if mobj is None:
1498 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1499 return
1500
1501 video_id = mobj.group(1)
1502
1503 # Check if video comes from YouTube
1504 mobj2 = re.match(r'^yt-(.*)$', video_id)
1505 if mobj2 is not None:
1506 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1507 return
1508
1509 # At this point we have a new video
1510 self._downloader.increment_downloads()
1511
1512 simple_title = mobj.group(2).decode('utf-8')
1513
1514 # Retrieve video webpage to extract further information
1515 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1516 try:
1517 self.report_download_webpage(video_id)
1518 webpage = urllib2.urlopen(request).read()
1519 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1520 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1521 return
1522
1523 # Extract URL, uploader and title from webpage
1524 self.report_extraction(video_id)
1525 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1526 if mobj is not None:
1527 mediaURL = urllib.unquote(mobj.group(1))
1528 video_extension = mediaURL[-3:]
1529
1530 # Extract gdaKey if available
1531 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1532 if mobj is None:
1533 video_url = mediaURL
1534 else:
1535 gdaKey = mobj.group(1)
1536 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1537 else:
1538 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1539 if mobj is None:
1540 self._downloader.trouble(u'ERROR: unable to extract media URL')
1541 return
1542 vardict = parse_qs(mobj.group(1))
1543 if 'mediaData' not in vardict:
1544 self._downloader.trouble(u'ERROR: unable to extract media URL')
1545 return
1546 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1547 if mobj is None:
1548 self._downloader.trouble(u'ERROR: unable to extract media URL')
1549 return
1550 mediaURL = mobj.group(1).replace('\\/', '/')
1551 video_extension = mediaURL[-3:]
1552 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1553
1554 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1555 if mobj is None:
1556 self._downloader.trouble(u'ERROR: unable to extract title')
1557 return
1558 video_title = mobj.group(1).decode('utf-8')
1559 video_title = sanitize_title(video_title)
1560
1561 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1562 if mobj is None:
1563 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1564 return
1565 video_uploader = mobj.group(1)
1566
1567 try:
1568 # Process video information
1569 self._downloader.process_info({
1570 'id': video_id.decode('utf-8'),
1571 'url': video_url.decode('utf-8'),
1572 'uploader': video_uploader.decode('utf-8'),
1573 'upload_date': u'NA',
1574 'title': video_title,
1575 'stitle': simple_title,
1576 'ext': video_extension.decode('utf-8'),
1577 'format': u'NA',
1578 'player_url': None,
1579 })
1580 except UnavailableVideoError:
1581 self._downloader.trouble(u'\nERROR: unable to download video')
1582
1583
1584 class DailymotionIE(InfoExtractor):
1585 """Information Extractor for Dailymotion"""
1586
1587 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1588 IE_NAME = u'dailymotion'
1589
1590 def __init__(self, downloader=None):
1591 InfoExtractor.__init__(self, downloader)
1592
1593 def report_download_webpage(self, video_id):
1594 """Report webpage download."""
1595 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1596
1597 def report_extraction(self, video_id):
1598 """Report information extraction."""
1599 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1600
1601 def _real_extract(self, url):
1602 # Extract id and simplified title from URL
1603 mobj = re.match(self._VALID_URL, url)
1604 if mobj is None:
1605 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1606 return
1607
1608 # At this point we have a new video
1609 self._downloader.increment_downloads()
1610 video_id = mobj.group(1)
1611
1612 video_extension = 'flv'
1613
1614 # Retrieve video webpage to extract further information
1615 request = urllib2.Request(url)
1616 request.add_header('Cookie', 'family_filter=off')
1617 try:
1618 self.report_download_webpage(video_id)
1619 webpage = urllib2.urlopen(request).read()
1620 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1621 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1622 return
1623
1624 # Extract URL, uploader and title from webpage
1625 self.report_extraction(video_id)
1626 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1627 if mobj is None:
1628 self._downloader.trouble(u'ERROR: unable to extract media URL')
1629 return
1630 sequence = urllib.unquote(mobj.group(1))
1631 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1632 if mobj is None:
1633 self._downloader.trouble(u'ERROR: unable to extract media URL')
1634 return
1635 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1636
1637 # if needed add http://www.dailymotion.com/ if relative URL
1638
1639 video_url = mediaURL
1640
1641 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
1642 if mobj is None:
1643 self._downloader.trouble(u'ERROR: unable to extract title')
1644 return
1645 video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
1646 video_title = sanitize_title(video_title)
1647 simple_title = _simplify_title(video_title)
1648
1649 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1650 if mobj is None:
1651 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1652 return
1653 video_uploader = mobj.group(1)
1654
1655 try:
1656 # Process video information
1657 self._downloader.process_info({
1658 'id': video_id.decode('utf-8'),
1659 'url': video_url.decode('utf-8'),
1660 'uploader': video_uploader.decode('utf-8'),
1661 'upload_date': u'NA',
1662 'title': video_title,
1663 'stitle': simple_title,
1664 'ext': video_extension.decode('utf-8'),
1665 'format': u'NA',
1666 'player_url': None,
1667 })
1668 except UnavailableVideoError:
1669 self._downloader.trouble(u'\nERROR: unable to download video')
1670
1671
1672 class GoogleIE(InfoExtractor):
1673 """Information extractor for video.google.com."""
1674
1675 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1676 IE_NAME = u'video.google'
1677
1678 def __init__(self, downloader=None):
1679 InfoExtractor.__init__(self, downloader)
1680
1681 def report_download_webpage(self, video_id):
1682 """Report webpage download."""
1683 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1684
1685 def report_extraction(self, video_id):
1686 """Report information extraction."""
1687 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1688
1689 def _real_extract(self, url):
1690 # Extract id from URL
1691 mobj = re.match(self._VALID_URL, url)
1692 if mobj is None:
1693 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1694 return
1695
1696 # At this point we have a new video
1697 self._downloader.increment_downloads()
1698 video_id = mobj.group(1)
1699
1700 video_extension = 'mp4'
1701
1702 # Retrieve video webpage to extract further information
1703 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1704 try:
1705 self.report_download_webpage(video_id)
1706 webpage = urllib2.urlopen(request).read()
1707 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1708 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1709 return
1710
1711 # Extract URL, uploader, and title from webpage
1712 self.report_extraction(video_id)
1713 mobj = re.search(r"download_url:'([^']+)'", webpage)
1714 if mobj is None:
1715 video_extension = 'flv'
1716 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1717 if mobj is None:
1718 self._downloader.trouble(u'ERROR: unable to extract media URL')
1719 return
1720 mediaURL = urllib.unquote(mobj.group(1))
1721 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1722 mediaURL = mediaURL.replace('\\x26', '\x26')
1723
1724 video_url = mediaURL
1725
1726 mobj = re.search(r'<title>(.*)</title>', webpage)
1727 if mobj is None:
1728 self._downloader.trouble(u'ERROR: unable to extract title')
1729 return
1730 video_title = mobj.group(1).decode('utf-8')
1731 video_title = sanitize_title(video_title)
1732 simple_title = _simplify_title(video_title)
1733
1734 # Extract video description
1735 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1736 if mobj is None:
1737 self._downloader.trouble(u'ERROR: unable to extract video description')
1738 return
1739 video_description = mobj.group(1).decode('utf-8')
1740 if not video_description:
1741 video_description = 'No description available.'
1742
1743 # Extract video thumbnail
1744 if self._downloader.params.get('forcethumbnail', False):
1745 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1746 try:
1747 webpage = urllib2.urlopen(request).read()
1748 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1749 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1750 return
1751 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1752 if mobj is None:
1753 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1754 return
1755 video_thumbnail = mobj.group(1)
1756 else: # we need something to pass to process_info
1757 video_thumbnail = ''
1758
1759 try:
1760 # Process video information
1761 self._downloader.process_info({
1762 'id': video_id.decode('utf-8'),
1763 'url': video_url.decode('utf-8'),
1764 'uploader': u'NA',
1765 'upload_date': u'NA',
1766 'title': video_title,
1767 'stitle': simple_title,
1768 'ext': video_extension.decode('utf-8'),
1769 'format': u'NA',
1770 'player_url': None,
1771 })
1772 except UnavailableVideoError:
1773 self._downloader.trouble(u'\nERROR: unable to download video')
1774
1775
1776 class PhotobucketIE(InfoExtractor):
1777 """Information extractor for photobucket.com."""
1778
1779 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1780 IE_NAME = u'photobucket'
1781
1782 def __init__(self, downloader=None):
1783 InfoExtractor.__init__(self, downloader)
1784
1785 def report_download_webpage(self, video_id):
1786 """Report webpage download."""
1787 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1788
1789 def report_extraction(self, video_id):
1790 """Report information extraction."""
1791 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1792
1793 def _real_extract(self, url):
1794 # Extract id from URL
1795 mobj = re.match(self._VALID_URL, url)
1796 if mobj is None:
1797 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1798 return
1799
1800 # At this point we have a new video
1801 self._downloader.increment_downloads()
1802 video_id = mobj.group(1)
1803
1804 video_extension = 'flv'
1805
1806 # Retrieve video webpage to extract further information
1807 request = urllib2.Request(url)
1808 try:
1809 self.report_download_webpage(video_id)
1810 webpage = urllib2.urlopen(request).read()
1811 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1812 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1813 return
1814
1815 # Extract URL, uploader, and title from webpage
1816 self.report_extraction(video_id)
1817 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1818 if mobj is None:
1819 self._downloader.trouble(u'ERROR: unable to extract media URL')
1820 return
1821 mediaURL = urllib.unquote(mobj.group(1))
1822
1823 video_url = mediaURL
1824
1825 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1826 if mobj is None:
1827 self._downloader.trouble(u'ERROR: unable to extract title')
1828 return
1829 video_title = mobj.group(1).decode('utf-8')
1830 video_title = sanitize_title(video_title)
1831 simple_title = _simplify_title(vide_title)
1832
1833 video_uploader = mobj.group(2).decode('utf-8')
1834
1835 try:
1836 # Process video information
1837 self._downloader.process_info({
1838 'id': video_id.decode('utf-8'),
1839 'url': video_url.decode('utf-8'),
1840 'uploader': video_uploader,
1841 'upload_date': u'NA',
1842 'title': video_title,
1843 'stitle': simple_title,
1844 'ext': video_extension.decode('utf-8'),
1845 'format': u'NA',
1846 'player_url': None,
1847 })
1848 except UnavailableVideoError:
1849 self._downloader.trouble(u'\nERROR: unable to download video')
1850
1851
1852 class YahooIE(InfoExtractor):
1853 """Information extractor for video.yahoo.com."""
1854
1855 # _VALID_URL matches all Yahoo! Video URLs
1856 # _VPAGE_URL matches only the extractable '/watch/' URLs
1857 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1858 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1859 IE_NAME = u'video.yahoo'
1860
1861 def __init__(self, downloader=None):
1862 InfoExtractor.__init__(self, downloader)
1863
1864 def report_download_webpage(self, video_id):
1865 """Report webpage download."""
1866 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1867
1868 def report_extraction(self, video_id):
1869 """Report information extraction."""
1870 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1871
1872 def _real_extract(self, url, new_video=True):
1873 # Extract ID from URL
1874 mobj = re.match(self._VALID_URL, url)
1875 if mobj is None:
1876 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1877 return
1878
1879 # At this point we have a new video
1880 self._downloader.increment_downloads()
1881 video_id = mobj.group(2)
1882 video_extension = 'flv'
1883
1884 # Rewrite valid but non-extractable URLs as
1885 # extractable English language /watch/ URLs
1886 if re.match(self._VPAGE_URL, url) is None:
1887 request = urllib2.Request(url)
1888 try:
1889 webpage = urllib2.urlopen(request).read()
1890 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1891 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1892 return
1893
1894 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1895 if mobj is None:
1896 self._downloader.trouble(u'ERROR: Unable to extract id field')
1897 return
1898 yahoo_id = mobj.group(1)
1899
1900 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1901 if mobj is None:
1902 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1903 return
1904 yahoo_vid = mobj.group(1)
1905
1906 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1907 return self._real_extract(url, new_video=False)
1908
1909 # Retrieve video webpage to extract further information
1910 request = urllib2.Request(url)
1911 try:
1912 self.report_download_webpage(video_id)
1913 webpage = urllib2.urlopen(request).read()
1914 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1915 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1916 return
1917
1918 # Extract uploader and title from webpage
1919 self.report_extraction(video_id)
1920 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1921 if mobj is None:
1922 self._downloader.trouble(u'ERROR: unable to extract video title')
1923 return
1924 video_title = mobj.group(1).decode('utf-8')
1925 simple_title = _simplify_title(video_title)
1926
1927 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1928 if mobj is None:
1929 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1930 return
1931 video_uploader = mobj.group(1).decode('utf-8')
1932
1933 # Extract video thumbnail
1934 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1935 if mobj is None:
1936 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1937 return
1938 video_thumbnail = mobj.group(1).decode('utf-8')
1939
1940 # Extract video description
1941 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1942 if mobj is None:
1943 self._downloader.trouble(u'ERROR: unable to extract video description')
1944 return
1945 video_description = mobj.group(1).decode('utf-8')
1946 if not video_description:
1947 video_description = 'No description available.'
1948
1949 # Extract video height and width
1950 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1951 if mobj is None:
1952 self._downloader.trouble(u'ERROR: unable to extract video height')
1953 return
1954 yv_video_height = mobj.group(1)
1955
1956 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1957 if mobj is None:
1958 self._downloader.trouble(u'ERROR: unable to extract video width')
1959 return
1960 yv_video_width = mobj.group(1)
1961
1962 # Retrieve video playlist to extract media URL
1963 # I'm not completely sure what all these options are, but we
1964 # seem to need most of them, otherwise the server sends a 401.
1965 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1966 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1967 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1968 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1969 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1970 try:
1971 self.report_download_webpage(video_id)
1972 webpage = urllib2.urlopen(request).read()
1973 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1974 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1975 return
1976
1977 # Extract media URL from playlist XML
1978 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1979 if mobj is None:
1980 self._downloader.trouble(u'ERROR: Unable to extract media URL')
1981 return
1982 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1983 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1984
1985 try:
1986 # Process video information
1987 self._downloader.process_info({
1988 'id': video_id.decode('utf-8'),
1989 'url': video_url,
1990 'uploader': video_uploader,
1991 'upload_date': u'NA',
1992 'title': video_title,
1993 'stitle': simple_title,
1994 'ext': video_extension.decode('utf-8'),
1995 'thumbnail': video_thumbnail.decode('utf-8'),
1996 'description': video_description,
1997 'thumbnail': video_thumbnail,
1998 'player_url': None,
1999 })
2000 except UnavailableVideoError:
2001 self._downloader.trouble(u'\nERROR: unable to download video')
2002
2003
2004 class VimeoIE(InfoExtractor):
2005 """Information extractor for vimeo.com."""
2006
2007 # _VALID_URL matches Vimeo URLs
2008 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2009 IE_NAME = u'vimeo'
2010
2011 def __init__(self, downloader=None):
2012 InfoExtractor.__init__(self, downloader)
2013
2014 def report_download_webpage(self, video_id):
2015 """Report webpage download."""
2016 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2017
2018 def report_extraction(self, video_id):
2019 """Report information extraction."""
2020 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2021
2022 def _real_extract(self, url, new_video=True):
2023 # Extract ID from URL
2024 mobj = re.match(self._VALID_URL, url)
2025 if mobj is None:
2026 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2027 return
2028
2029 # At this point we have a new video
2030 self._downloader.increment_downloads()
2031 video_id = mobj.group(1)
2032
2033 # Retrieve video webpage to extract further information
2034 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2035 try:
2036 self.report_download_webpage(video_id)
2037 webpage = urllib2.urlopen(request).read()
2038 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2039 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2040 return
2041
2042 # Now we begin extracting as much information as we can from what we
2043 # retrieved. First we extract the information common to all extractors,
2044 # and latter we extract those that are Vimeo specific.
2045 self.report_extraction(video_id)
2046
2047 # Extract title
2048 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2049 if mobj is None:
2050 self._downloader.trouble(u'ERROR: unable to extract video title')
2051 return
2052 video_title = mobj.group(1).decode('utf-8')
2053 simple_title = _simplify_title(video_title)
2054
2055 # Extract uploader
2056 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2057 if mobj is None:
2058 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2059 return
2060 video_uploader = mobj.group(1).decode('utf-8')
2061
2062 # Extract video thumbnail
2063 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2064 if mobj is None:
2065 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2066 return
2067 video_thumbnail = mobj.group(1).decode('utf-8')
2068
2069 # # Extract video description
2070 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2071 # if mobj is None:
2072 # self._downloader.trouble(u'ERROR: unable to extract video description')
2073 # return
2074 # video_description = mobj.group(1).decode('utf-8')
2075 # if not video_description: video_description = 'No description available.'
2076 video_description = 'Foo.'
2077
2078 # Vimeo specific: extract request signature
2079 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2080 if mobj is None:
2081 self._downloader.trouble(u'ERROR: unable to extract request signature')
2082 return
2083 sig = mobj.group(1).decode('utf-8')
2084
2085 # Vimeo specific: extract video quality information
2086 mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2087 if mobj is None:
2088 self._downloader.trouble(u'ERROR: unable to extract video quality information')
2089 return
2090 quality = mobj.group(1).decode('utf-8')
2091
2092 if int(quality) == 1:
2093 quality = 'hd'
2094 else:
2095 quality = 'sd'
2096
2097 # Vimeo specific: Extract request signature expiration
2098 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2099 if mobj is None:
2100 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2101 return
2102 sig_exp = mobj.group(1).decode('utf-8')
2103
2104 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2105
2106 try:
2107 # Process video information
2108 self._downloader.process_info({
2109 'id': video_id.decode('utf-8'),
2110 'url': video_url,
2111 'uploader': video_uploader,
2112 'upload_date': u'NA',
2113 'title': video_title,
2114 'stitle': simple_title,
2115 'ext': u'mp4',
2116 'thumbnail': video_thumbnail.decode('utf-8'),
2117 'description': video_description,
2118 'thumbnail': video_thumbnail,
2119 'description': video_description,
2120 'player_url': None,
2121 })
2122 except UnavailableVideoError:
2123 self._downloader.trouble(u'ERROR: unable to download video')
2124
2125
2126 class GenericIE(InfoExtractor):
2127 """Generic last-resort information extractor."""
2128
2129 _VALID_URL = r'.*'
2130 IE_NAME = u'generic'
2131
2132 def __init__(self, downloader=None):
2133 InfoExtractor.__init__(self, downloader)
2134
2135 def report_download_webpage(self, video_id):
2136 """Report webpage download."""
2137 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2138 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2139
2140 def report_extraction(self, video_id):
2141 """Report information extraction."""
2142 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2143
2144 def _real_extract(self, url):
2145 # At this point we have a new video
2146 self._downloader.increment_downloads()
2147
2148 video_id = url.split('/')[-1]
2149 request = urllib2.Request(url)
2150 try:
2151 self.report_download_webpage(video_id)
2152 webpage = urllib2.urlopen(request).read()
2153 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2154 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2155 return
2156 except ValueError, err:
2157 # since this is the last-resort InfoExtractor, if
2158 # this error is thrown, it'll be thrown here
2159 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2160 return
2161
2162 self.report_extraction(video_id)
2163 # Start with something easy: JW Player in SWFObject
2164 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2165 if mobj is None:
2166 # Broaden the search a little bit
2167 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2168 if mobj is None:
2169 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2170 return
2171
2172 # It's possible that one of the regexes
2173 # matched, but returned an empty group:
2174 if mobj.group(1) is None:
2175 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2176 return
2177
2178 video_url = urllib.unquote(mobj.group(1))
2179 video_id = os.path.basename(video_url)
2180
2181 # here's a fun little line of code for you:
2182 video_extension = os.path.splitext(video_id)[1][1:]
2183 video_id = os.path.splitext(video_id)[0]
2184
2185 # it's tempting to parse this further, but you would
2186 # have to take into account all the variations like
2187 # Video Title - Site Name
2188 # Site Name | Video Title
2189 # Video Title - Tagline | Site Name
2190 # and so on and so forth; it's just not practical
2191 mobj = re.search(r'<title>(.*)</title>', webpage)
2192 if mobj is None:
2193 self._downloader.trouble(u'ERROR: unable to extract title')
2194 return
2195 video_title = mobj.group(1).decode('utf-8')
2196 video_title = sanitize_title(video_title)
2197 simple_title = _simplify_title(video_title)
2198
2199 # video uploader is domain name
2200 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2201 if mobj is None:
2202 self._downloader.trouble(u'ERROR: unable to extract title')
2203 return
2204 video_uploader = mobj.group(1).decode('utf-8')
2205
2206 try:
2207 # Process video information
2208 self._downloader.process_info({
2209 'id': video_id.decode('utf-8'),
2210 'url': video_url.decode('utf-8'),
2211 'uploader': video_uploader,
2212 'upload_date': u'NA',
2213 'title': video_title,
2214 'stitle': simple_title,
2215 'ext': video_extension.decode('utf-8'),
2216 'format': u'NA',
2217 'player_url': None,
2218 })
2219 except UnavailableVideoError, err:
2220 self._downloader.trouble(u'\nERROR: unable to download video')
2221
2222
2223 class YoutubeSearchIE(InfoExtractor):
2224 """Information Extractor for YouTube search queries."""
2225 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2226 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2227 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2228 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2229 _youtube_ie = None
2230 _max_youtube_results = 1000
2231 IE_NAME = u'youtube:search'
2232
2233 def __init__(self, youtube_ie, downloader=None):
2234 InfoExtractor.__init__(self, downloader)
2235 self._youtube_ie = youtube_ie
2236
2237 def report_download_page(self, query, pagenum):
2238 """Report attempt to download playlist page with given number."""
2239 query = query.decode(preferredencoding())
2240 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2241
2242 def _real_initialize(self):
2243 self._youtube_ie.initialize()
2244
2245 def _real_extract(self, query):
2246 mobj = re.match(self._VALID_URL, query)
2247 if mobj is None:
2248 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2249 return
2250
2251 prefix, query = query.split(':')
2252 prefix = prefix[8:]
2253 query = query.encode('utf-8')
2254 if prefix == '':
2255 self._download_n_results(query, 1)
2256 return
2257 elif prefix == 'all':
2258 self._download_n_results(query, self._max_youtube_results)
2259 return
2260 else:
2261 try:
2262 n = long(prefix)
2263 if n <= 0:
2264 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2265 return
2266 elif n > self._max_youtube_results:
2267 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2268 n = self._max_youtube_results
2269 self._download_n_results(query, n)
2270 return
2271 except ValueError: # parsing prefix as integer fails
2272 self._download_n_results(query, 1)
2273 return
2274
2275 def _download_n_results(self, query, n):
2276 """Downloads a specified number of results for a query"""
2277
2278 video_ids = []
2279 already_seen = set()
2280 pagenum = 1
2281
2282 while True:
2283 self.report_download_page(query, pagenum)
2284 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2285 request = urllib2.Request(result_url)
2286 try:
2287 page = urllib2.urlopen(request).read()
2288 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2289 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2290 return
2291
2292 # Extract video identifiers
2293 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2294 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2295 if video_id not in already_seen:
2296 video_ids.append(video_id)
2297 already_seen.add(video_id)
2298 if len(video_ids) == n:
2299 # Specified n videos reached
2300 for id in video_ids:
2301 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2302 return
2303
2304 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2305 for id in video_ids:
2306 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2307 return
2308
2309 pagenum = pagenum + 1
2310
2311
2312 class GoogleSearchIE(InfoExtractor):
2313 """Information Extractor for Google Video search queries."""
2314 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2315 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2316 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2317 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2318 _google_ie = None
2319 _max_google_results = 1000
2320 IE_NAME = u'video.google:search'
2321
2322 def __init__(self, google_ie, downloader=None):
2323 InfoExtractor.__init__(self, downloader)
2324 self._google_ie = google_ie
2325
2326 def report_download_page(self, query, pagenum):
2327 """Report attempt to download playlist page with given number."""
2328 query = query.decode(preferredencoding())
2329 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2330
2331 def _real_initialize(self):
2332 self._google_ie.initialize()
2333
2334 def _real_extract(self, query):
2335 mobj = re.match(self._VALID_URL, query)
2336 if mobj is None:
2337 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2338 return
2339
2340 prefix, query = query.split(':')
2341 prefix = prefix[8:]
2342 query = query.encode('utf-8')
2343 if prefix == '':
2344 self._download_n_results(query, 1)
2345 return
2346 elif prefix == 'all':
2347 self._download_n_results(query, self._max_google_results)
2348 return
2349 else:
2350 try:
2351 n = long(prefix)
2352 if n <= 0:
2353 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2354 return
2355 elif n > self._max_google_results:
2356 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2357 n = self._max_google_results
2358 self._download_n_results(query, n)
2359 return
2360 except ValueError: # parsing prefix as integer fails
2361 self._download_n_results(query, 1)
2362 return
2363
2364 def _download_n_results(self, query, n):
2365 """Downloads a specified number of results for a query"""
2366
2367 video_ids = []
2368 already_seen = set()
2369 pagenum = 1
2370
2371 while True:
2372 self.report_download_page(query, pagenum)
2373 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2374 request = urllib2.Request(result_url)
2375 try:
2376 page = urllib2.urlopen(request).read()
2377 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2378 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2379 return
2380
2381 # Extract video identifiers
2382 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2383 video_id = mobj.group(1)
2384 if video_id not in already_seen:
2385 video_ids.append(video_id)
2386 already_seen.add(video_id)
2387 if len(video_ids) == n:
2388 # Specified n videos reached
2389 for id in video_ids:
2390 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2391 return
2392
2393 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2394 for id in video_ids:
2395 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2396 return
2397
2398 pagenum = pagenum + 1
2399
2400
2401 class YahooSearchIE(InfoExtractor):
2402 """Information Extractor for Yahoo! Video search queries."""
2403 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2404 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2405 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2406 _MORE_PAGES_INDICATOR = r'\s*Next'
2407 _yahoo_ie = None
2408 _max_yahoo_results = 1000
2409 IE_NAME = u'video.yahoo:search'
2410
2411 def __init__(self, yahoo_ie, downloader=None):
2412 InfoExtractor.__init__(self, downloader)
2413 self._yahoo_ie = yahoo_ie
2414
2415 def report_download_page(self, query, pagenum):
2416 """Report attempt to download playlist page with given number."""
2417 query = query.decode(preferredencoding())
2418 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2419
2420 def _real_initialize(self):
2421 self._yahoo_ie.initialize()
2422
2423 def _real_extract(self, query):
2424 mobj = re.match(self._VALID_URL, query)
2425 if mobj is None:
2426 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2427 return
2428
2429 prefix, query = query.split(':')
2430 prefix = prefix[8:]
2431 query = query.encode('utf-8')
2432 if prefix == '':
2433 self._download_n_results(query, 1)
2434 return
2435 elif prefix == 'all':
2436 self._download_n_results(query, self._max_yahoo_results)
2437 return
2438 else:
2439 try:
2440 n = long(prefix)
2441 if n <= 0:
2442 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2443 return
2444 elif n > self._max_yahoo_results:
2445 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2446 n = self._max_yahoo_results
2447 self._download_n_results(query, n)
2448 return
2449 except ValueError: # parsing prefix as integer fails
2450 self._download_n_results(query, 1)
2451 return
2452
2453 def _download_n_results(self, query, n):
2454 """Downloads a specified number of results for a query"""
2455
2456 video_ids = []
2457 already_seen = set()
2458 pagenum = 1
2459
2460 while True:
2461 self.report_download_page(query, pagenum)
2462 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2463 request = urllib2.Request(result_url)
2464 try:
2465 page = urllib2.urlopen(request).read()
2466 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2467 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2468 return
2469
2470 # Extract video identifiers
2471 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2472 video_id = mobj.group(1)
2473 if video_id not in already_seen:
2474 video_ids.append(video_id)
2475 already_seen.add(video_id)
2476 if len(video_ids) == n:
2477 # Specified n videos reached
2478 for id in video_ids:
2479 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2480 return
2481
2482 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2483 for id in video_ids:
2484 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2485 return
2486
2487 pagenum = pagenum + 1
2488
2489
2490 class YoutubePlaylistIE(InfoExtractor):
2491 """Information Extractor for YouTube playlists."""
2492
2493 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2494 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2495 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2496 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2497 _youtube_ie = None
2498 IE_NAME = u'youtube:playlist'
2499
2500 def __init__(self, youtube_ie, downloader=None):
2501 InfoExtractor.__init__(self, downloader)
2502 self._youtube_ie = youtube_ie
2503
2504 def report_download_page(self, playlist_id, pagenum):
2505 """Report attempt to download playlist page with given number."""
2506 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2507
2508 def _real_initialize(self):
2509 self._youtube_ie.initialize()
2510
2511 def _real_extract(self, url):
2512 # Extract playlist id
2513 mobj = re.match(self._VALID_URL, url)
2514 if mobj is None:
2515 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2516 return
2517
2518 # Single video case
2519 if mobj.group(3) is not None:
2520 self._youtube_ie.extract(mobj.group(3))
2521 return
2522
2523 # Download playlist pages
2524 # prefix is 'p' as default for playlists but there are other types that need extra care
2525 playlist_prefix = mobj.group(1)
2526 if playlist_prefix == 'a':
2527 playlist_access = 'artist'
2528 else:
2529 playlist_prefix = 'p'
2530 playlist_access = 'view_play_list'
2531 playlist_id = mobj.group(2)
2532 video_ids = []
2533 pagenum = 1
2534
2535 while True:
2536 self.report_download_page(playlist_id, pagenum)
2537 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2538 request = urllib2.Request(url)
2539 try:
2540 page = urllib2.urlopen(request).read()
2541 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2542 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2543 return
2544
2545 # Extract video identifiers
2546 ids_in_page = []
2547 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2548 if mobj.group(1) not in ids_in_page:
2549 ids_in_page.append(mobj.group(1))
2550 video_ids.extend(ids_in_page)
2551
2552 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2553 break
2554 pagenum = pagenum + 1
2555
2556 playliststart = self._downloader.params.get('playliststart', 1) - 1
2557 playlistend = self._downloader.params.get('playlistend', -1)
2558 video_ids = video_ids[playliststart:playlistend]
2559
2560 for id in video_ids:
2561 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2562 return
2563
2564
2565 class YoutubeUserIE(InfoExtractor):
2566 """Information Extractor for YouTube users."""
2567
2568 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2569 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2570 _GDATA_PAGE_SIZE = 50
2571 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2572 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2573 _youtube_ie = None
2574 IE_NAME = u'youtube:user'
2575
2576 def __init__(self, youtube_ie, downloader=None):
2577 InfoExtractor.__init__(self, downloader)
2578 self._youtube_ie = youtube_ie
2579
2580 def report_download_page(self, username, start_index):
2581 """Report attempt to download user page."""
2582 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2583 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2584
2585 def _real_initialize(self):
2586 self._youtube_ie.initialize()
2587
2588 def _real_extract(self, url):
2589 # Extract username
2590 mobj = re.match(self._VALID_URL, url)
2591 if mobj is None:
2592 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2593 return
2594
2595 username = mobj.group(1)
2596
2597 # Download video ids using YouTube Data API. Result size per
2598 # query is limited (currently to 50 videos) so we need to query
2599 # page by page until there are no video ids - it means we got
2600 # all of them.
2601
2602 video_ids = []
2603 pagenum = 0
2604
2605 while True:
2606 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2607 self.report_download_page(username, start_index)
2608
2609 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2610
2611 try:
2612 page = urllib2.urlopen(request).read()
2613 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2614 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2615 return
2616
2617 # Extract video identifiers
2618 ids_in_page = []
2619
2620 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2621 if mobj.group(1) not in ids_in_page:
2622 ids_in_page.append(mobj.group(1))
2623
2624 video_ids.extend(ids_in_page)
2625
2626 # A little optimization - if current page is not
2627 # "full", ie. does not contain PAGE_SIZE video ids then
2628 # we can assume that this page is the last one - there
2629 # are no more ids on further pages - no need to query
2630 # again.
2631
2632 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2633 break
2634
2635 pagenum += 1
2636
2637 all_ids_count = len(video_ids)
2638 playliststart = self._downloader.params.get('playliststart', 1) - 1
2639 playlistend = self._downloader.params.get('playlistend', -1)
2640
2641 if playlistend == -1:
2642 video_ids = video_ids[playliststart:]
2643 else:
2644 video_ids = video_ids[playliststart:playlistend]
2645
2646 self._downloader.to_screen("[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2647 (username, all_ids_count, len(video_ids)))
2648
2649 for video_id in video_ids:
2650 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2651
2652
2653 class DepositFilesIE(InfoExtractor):
2654 """Information extractor for depositfiles.com"""
2655
2656 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2657 IE_NAME = u'DepositFiles'
2658
2659 def __init__(self, downloader=None):
2660 InfoExtractor.__init__(self, downloader)
2661
2662 def report_download_webpage(self, file_id):
2663 """Report webpage download."""
2664 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2665
2666 def report_extraction(self, file_id):
2667 """Report information extraction."""
2668 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2669
2670 def _real_extract(self, url):
2671 # At this point we have a new file
2672 self._downloader.increment_downloads()
2673
2674 file_id = url.split('/')[-1]
2675 # Rebuild url in english locale
2676 url = 'http://depositfiles.com/en/files/' + file_id
2677
2678 # Retrieve file webpage with 'Free download' button pressed
2679 free_download_indication = { 'gateway_result' : '1' }
2680 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2681 try:
2682 self.report_download_webpage(file_id)
2683 webpage = urllib2.urlopen(request).read()
2684 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2685 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2686 return
2687
2688 # Search for the real file URL
2689 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2690 if (mobj is None) or (mobj.group(1) is None):
2691 # Try to figure out reason of the error.
2692 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2693 if (mobj is not None) and (mobj.group(1) is not None):
2694 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2695 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2696 else:
2697 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2698 return
2699
2700 file_url = mobj.group(1)
2701 file_extension = os.path.splitext(file_url)[1][1:]
2702
2703 # Search for file title
2704 mobj = re.search(r'<b title="(.*?)">', webpage)
2705 if mobj is None:
2706 self._downloader.trouble(u'ERROR: unable to extract title')
2707 return
2708 file_title = mobj.group(1).decode('utf-8')
2709
2710 try:
2711 # Process file information
2712 self._downloader.process_info({
2713 'id': file_id.decode('utf-8'),
2714 'url': file_url.decode('utf-8'),
2715 'uploader': u'NA',
2716 'upload_date': u'NA',
2717 'title': file_title,
2718 'stitle': file_title,
2719 'ext': file_extension.decode('utf-8'),
2720 'format': u'NA',
2721 'player_url': None,
2722 })
2723 except UnavailableVideoError, err:
2724 self._downloader.trouble(u'ERROR: unable to download file')
2725
2726
2727 class FacebookIE(InfoExtractor):
2728 """Information Extractor for Facebook"""
2729
2730 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2731 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2732 _NETRC_MACHINE = 'facebook'
2733 _available_formats = ['video', 'highqual', 'lowqual']
2734 _video_extensions = {
2735 'video': 'mp4',
2736 'highqual': 'mp4',
2737 'lowqual': 'mp4',
2738 }
2739 IE_NAME = u'facebook'
2740
2741 def __init__(self, downloader=None):
2742 InfoExtractor.__init__(self, downloader)
2743
2744 def _reporter(self, message):
2745 """Add header and report message."""
2746 self._downloader.to_screen(u'[facebook] %s' % message)
2747
2748 def report_login(self):
2749 """Report attempt to log in."""
2750 self._reporter(u'Logging in')
2751
2752 def report_video_webpage_download(self, video_id):
2753 """Report attempt to download video webpage."""
2754 self._reporter(u'%s: Downloading video webpage' % video_id)
2755
2756 def report_information_extraction(self, video_id):
2757 """Report attempt to extract video information."""
2758 self._reporter(u'%s: Extracting video information' % video_id)
2759
2760 def _parse_page(self, video_webpage):
2761 """Extract video information from page"""
2762 # General data
2763 data = {'title': r'\("video_title", "(.*?)"\)',
2764 'description': r'<div class="datawrap">(.*?)</div>',
2765 'owner': r'\("video_owner_name", "(.*?)"\)',
2766 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2767 }
2768 video_info = {}
2769 for piece in data.keys():
2770 mobj = re.search(data[piece], video_webpage)
2771 if mobj is not None:
2772 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2773
2774 # Video urls
2775 video_urls = {}
2776 for fmt in self._available_formats:
2777 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2778 if mobj is not None:
2779 # URL is in a Javascript segment inside an escaped Unicode format within
2780 # the generally utf-8 page
2781 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2782 video_info['video_urls'] = video_urls
2783
2784 return video_info
2785
2786 def _real_initialize(self):
2787 if self._downloader is None:
2788 return
2789
2790 useremail = None
2791 password = None
2792 downloader_params = self._downloader.params
2793
2794 # Attempt to use provided username and password or .netrc data
2795 if downloader_params.get('username', None) is not None:
2796 useremail = downloader_params['username']
2797 password = downloader_params['password']
2798 elif downloader_params.get('usenetrc', False):
2799 try:
2800 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2801 if info is not None:
2802 useremail = info[0]
2803 password = info[2]
2804 else:
2805 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2806 except (IOError, netrc.NetrcParseError), err:
2807 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2808 return
2809
2810 if useremail is None:
2811 return
2812
2813 # Log in
2814 login_form = {
2815 'email': useremail,
2816 'pass': password,
2817 'login': 'Log+In'
2818 }
2819 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2820 try:
2821 self.report_login()
2822 login_results = urllib2.urlopen(request).read()
2823 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2824 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2825 return
2826 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2827 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2828 return
2829
2830 def _real_extract(self, url):
2831 mobj = re.match(self._VALID_URL, url)
2832 if mobj is None:
2833 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2834 return
2835 video_id = mobj.group('ID')
2836
2837 # Get video webpage
2838 self.report_video_webpage_download(video_id)
2839 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2840 try:
2841 page = urllib2.urlopen(request)
2842 video_webpage = page.read()
2843 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2844 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2845 return
2846
2847 # Start extracting information
2848 self.report_information_extraction(video_id)
2849
2850 # Extract information
2851 video_info = self._parse_page(video_webpage)
2852
2853 # uploader
2854 if 'owner' not in video_info:
2855 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2856 return
2857 video_uploader = video_info['owner']
2858
2859 # title
2860 if 'title' not in video_info:
2861 self._downloader.trouble(u'ERROR: unable to extract video title')
2862 return
2863 video_title = video_info['title']
2864 video_title = video_title.decode('utf-8')
2865 video_title = sanitize_title(video_title)
2866
2867 simple_title = _simplify_title(video_title)
2868
2869 # thumbnail image
2870 if 'thumbnail' not in video_info:
2871 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2872 video_thumbnail = ''
2873 else:
2874 video_thumbnail = video_info['thumbnail']
2875
2876 # upload date
2877 upload_date = u'NA'
2878 if 'upload_date' in video_info:
2879 upload_time = video_info['upload_date']
2880 timetuple = email.utils.parsedate_tz(upload_time)
2881 if timetuple is not None:
2882 try:
2883 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2884 except:
2885 pass
2886
2887 # description
2888 video_description = video_info.get('description', 'No description available.')
2889
2890 url_map = video_info['video_urls']
2891 if len(url_map.keys()) > 0:
2892 # Decide which formats to download
2893 req_format = self._downloader.params.get('format', None)
2894 format_limit = self._downloader.params.get('format_limit', None)
2895
2896 if format_limit is not None and format_limit in self._available_formats:
2897 format_list = self._available_formats[self._available_formats.index(format_limit):]
2898 else:
2899 format_list = self._available_formats
2900 existing_formats = [x for x in format_list if x in url_map]
2901 if len(existing_formats) == 0:
2902 self._downloader.trouble(u'ERROR: no known formats available for video')
2903 return
2904 if req_format is None:
2905 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2906 elif req_format == 'worst':
2907 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2908 elif req_format == '-1':
2909 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2910 else:
2911 # Specific format
2912 if req_format not in url_map:
2913 self._downloader.trouble(u'ERROR: requested format not available')
2914 return
2915 video_url_list = [(req_format, url_map[req_format])] # Specific format
2916
2917 for format_param, video_real_url in video_url_list:
2918
2919 # At this point we have a new video
2920 self._downloader.increment_downloads()
2921
2922 # Extension
2923 video_extension = self._video_extensions.get(format_param, 'mp4')
2924
2925 try:
2926 # Process video information
2927 self._downloader.process_info({
2928 'id': video_id.decode('utf-8'),
2929 'url': video_real_url.decode('utf-8'),
2930 'uploader': video_uploader.decode('utf-8'),
2931 'upload_date': upload_date,
2932 'title': video_title,
2933 'stitle': simple_title,
2934 'ext': video_extension.decode('utf-8'),
2935 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2936 'thumbnail': video_thumbnail.decode('utf-8'),
2937 'description': video_description.decode('utf-8'),
2938 'player_url': None,
2939 })
2940 except UnavailableVideoError, err:
2941 self._downloader.trouble(u'\nERROR: unable to download video')
2942
2943 class BlipTVIE(InfoExtractor):
2944 """Information extractor for blip.tv"""
2945
2946 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2947 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2948 IE_NAME = u'blip.tv'
2949
2950 def report_extraction(self, file_id):
2951 """Report information extraction."""
2952 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2953
2954 def report_direct_download(self, title):
2955 """Report information extraction."""
2956 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2957
2958 def _real_extract(self, url):
2959 mobj = re.match(self._VALID_URL, url)
2960 if mobj is None:
2961 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2962 return
2963
2964 if '?' in url:
2965 cchar = '&'
2966 else:
2967 cchar = '?'
2968 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2969 request = urllib2.Request(json_url)
2970 self.report_extraction(mobj.group(1))
2971 info = None
2972 try:
2973 urlh = urllib2.urlopen(request)
2974 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2975 basename = url.split('/')[-1]
2976 title,ext = os.path.splitext(basename)
2977 title = title.decode('UTF-8')
2978 ext = ext.replace('.', '')
2979 self.report_direct_download(title)
2980 info = {
2981 'id': title,
2982 'url': url,
2983 'title': title,
2984 'stitle': _simplify_title(title),
2985 'ext': ext,
2986 'urlhandle': urlh
2987 }
2988 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2989 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2990 return
2991 if info is None: # Regular URL
2992 try:
2993 json_code = urlh.read()
2994 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2995 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2996 return
2997
2998 try:
2999 json_data = json.loads(json_code)
3000 if 'Post' in json_data:
3001 data = json_data['Post']
3002 else:
3003 data = json_data
3004
3005 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
3006 video_url = data['media']['url']
3007 umobj = re.match(self._URL_EXT, video_url)
3008 if umobj is None:
3009 raise ValueError('Can not determine filename extension')
3010 ext = umobj.group(1)
3011
3012 info = {
3013 'id': data['item_id'],
3014 'url': video_url,
3015 'uploader': data['display_name'],
3016 'upload_date': upload_date,
3017 'title': data['title'],
3018 'stitle': _simplify_title(data['title']),
3019 'ext': ext,
3020 'format': data['media']['mimeType'],
3021 'thumbnail': data['thumbnailUrl'],
3022 'description': data['description'],
3023 'player_url': data['embedUrl']
3024 }
3025 except (ValueError,KeyError), err:
3026 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3027 return
3028
3029 self._downloader.increment_downloads()
3030
3031 try:
3032 self._downloader.process_info(info)
3033 except UnavailableVideoError, err:
3034 self._downloader.trouble(u'\nERROR: unable to download video')
3035
3036
3037 class MyVideoIE(InfoExtractor):
3038 """Information Extractor for myvideo.de."""
3039
3040 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3041 IE_NAME = u'myvideo'
3042
3043 def __init__(self, downloader=None):
3044 InfoExtractor.__init__(self, downloader)
3045
3046 def report_download_webpage(self, video_id):
3047 """Report webpage download."""
3048 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3049
3050 def report_extraction(self, video_id):
3051 """Report information extraction."""
3052 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3053
3054 def _real_extract(self,url):
3055 mobj = re.match(self._VALID_URL, url)
3056 if mobj is None:
3057 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3058 return
3059
3060 video_id = mobj.group(1)
3061
3062 # Get video webpage
3063 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3064 try:
3065 self.report_download_webpage(video_id)
3066 webpage = urllib2.urlopen(request).read()
3067 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3068 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3069 return
3070
3071 self.report_extraction(video_id)
3072 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3073 webpage)
3074 if mobj is None:
3075 self._downloader.trouble(u'ERROR: unable to extract media URL')
3076 return
3077 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3078
3079 mobj = re.search('<title>([^<]+)</title>', webpage)
3080 if mobj is None:
3081 self._downloader.trouble(u'ERROR: unable to extract title')
3082 return
3083
3084 video_title = mobj.group(1)
3085 video_title = sanitize_title(video_title)
3086
3087 simple_title = _simplify_title(video_title)
3088
3089 try:
3090 self._downloader.process_info({
3091 'id': video_id,
3092 'url': video_url,
3093 'uploader': u'NA',
3094 'upload_date': u'NA',
3095 'title': video_title,
3096 'stitle': simple_title,
3097 'ext': u'flv',
3098 'format': u'NA',
3099 'player_url': None,
3100 })
3101 except UnavailableVideoError:
3102 self._downloader.trouble(u'\nERROR: Unable to download video')
3103
3104 class ComedyCentralIE(InfoExtractor):
3105 """Information extractor for The Daily Show and Colbert Report """
3106
3107 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3108 IE_NAME = u'comedycentral'
3109
3110 def report_extraction(self, episode_id):
3111 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3112
3113 def report_config_download(self, episode_id):
3114 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3115
3116 def report_index_download(self, episode_id):
3117 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3118
3119 def report_player_url(self, episode_id):
3120 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3121
3122 def _real_extract(self, url):
3123 mobj = re.match(self._VALID_URL, url)
3124 if mobj is None:
3125 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3126 return
3127
3128 if mobj.group('shortname'):
3129 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3130 url = u'http://www.thedailyshow.com/full-episodes/'
3131 else:
3132 url = u'http://www.colbertnation.com/full-episodes/'
3133 mobj = re.match(self._VALID_URL, url)
3134 assert mobj is not None
3135
3136 dlNewest = not mobj.group('episode')
3137 if dlNewest:
3138 epTitle = mobj.group('showname')
3139 else:
3140 epTitle = mobj.group('episode')
3141
3142 req = urllib2.Request(url)
3143 self.report_extraction(epTitle)
3144 try:
3145 htmlHandle = urllib2.urlopen(req)
3146 html = htmlHandle.read()
3147 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3148 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3149 return
3150 if dlNewest:
3151 url = htmlHandle.geturl()
3152 mobj = re.match(self._VALID_URL, url)
3153 if mobj is None:
3154 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3155 return
3156 if mobj.group('episode') == '':
3157 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3158 return
3159 epTitle = mobj.group('episode')
3160
3161 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
3162 if len(mMovieParams) == 0:
3163 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3164 return
3165
3166 playerUrl_raw = mMovieParams[0][0]
3167 self.report_player_url(epTitle)
3168 try:
3169 urlHandle = urllib2.urlopen(playerUrl_raw)
3170 playerUrl = urlHandle.geturl()
3171 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3172 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3173 return
3174
3175 uri = mMovieParams[0][1]
3176 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3177 self.report_index_download(epTitle)
3178 try:
3179 indexXml = urllib2.urlopen(indexUrl).read()
3180 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3181 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3182 return
3183
3184 idoc = xml.etree.ElementTree.fromstring(indexXml)
3185 itemEls = idoc.findall('.//item')
3186 for itemEl in itemEls:
3187 mediaId = itemEl.findall('./guid')[0].text
3188 shortMediaId = mediaId.split(':')[-1]
3189 showId = mediaId.split(':')[-2].replace('.com', '')
3190 officialTitle = itemEl.findall('./title')[0].text
3191 officialDate = itemEl.findall('./pubDate')[0].text
3192
3193 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3194 urllib.urlencode({'uri': mediaId}))
3195 configReq = urllib2.Request(configUrl)
3196 self.report_config_download(epTitle)
3197 try:
3198 configXml = urllib2.urlopen(configReq).read()
3199 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3200 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3201 return
3202
3203 cdoc = xml.etree.ElementTree.fromstring(configXml)
3204 turls = []
3205 for rendition in cdoc.findall('.//rendition'):
3206 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3207 turls.append(finfo)
3208
3209 if len(turls) == 0:
3210 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3211 continue
3212
3213 # For now, just pick the highest bitrate
3214 format,video_url = turls[-1]
3215
3216 self._downloader.increment_downloads()
3217
3218 effTitle = showId + u'-' + epTitle
3219 info = {
3220 'id': shortMediaId,
3221 'url': video_url,
3222 'uploader': showId,
3223 'upload_date': officialDate,
3224 'title': effTitle,
3225 'stitle': _simplify_title(effTitle),
3226 'ext': 'mp4',
3227 'format': format,
3228 'thumbnail': None,
3229 'description': officialTitle,
3230 'player_url': playerUrl
3231 }
3232
3233 try:
3234 self._downloader.process_info(info)
3235 except UnavailableVideoError, err:
3236 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
3237 continue
3238
3239
3240 class EscapistIE(InfoExtractor):
3241 """Information extractor for The Escapist """
3242
3243 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3244 IE_NAME = u'escapist'
3245
3246 def report_extraction(self, showName):
3247 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3248
3249 def report_config_download(self, showName):
3250 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3251
3252 def _real_extract(self, url):
3253 htmlParser = HTMLParser.HTMLParser()
3254
3255 mobj = re.match(self._VALID_URL, url)
3256 if mobj is None:
3257 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3258 return
3259 showName = mobj.group('showname')
3260 videoId = mobj.group('episode')
3261
3262 self.report_extraction(showName)
3263 try:
3264 webPage = urllib2.urlopen(url).read()
3265 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3266 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3267 return
3268
3269 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3270 description = htmlParser.unescape(descMatch.group(1))
3271 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3272 imgUrl = htmlParser.unescape(imgMatch.group(1))
3273 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3274 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3275 configUrlMatch = re.search('config=(.*)$', playerUrl)
3276 configUrl = urllib2.unquote(configUrlMatch.group(1))
3277
3278 self.report_config_download(showName)
3279 try:
3280 configJSON = urllib2.urlopen(configUrl).read()
3281 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3282 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3283 return
3284
3285 # Technically, it's JavaScript, not JSON
3286 configJSON = configJSON.replace("'", '"')
3287
3288 try:
3289 config = json.loads(configJSON)
3290 except (ValueError,), err:
3291 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3292 return
3293
3294 playlist = config['playlist']
3295 videoUrl = playlist[1]['url']
3296
3297 self._downloader.increment_downloads()
3298 info = {
3299 'id': videoId,
3300 'url': videoUrl,
3301 'uploader': showName,
3302 'upload_date': None,
3303 'title': showName,
3304 'stitle': _simplify_title(showName),
3305 'ext': 'flv',
3306 'format': 'flv',
3307 'thumbnail': imgUrl,
3308 'description': description,
3309 'player_url': playerUrl,
3310 }
3311
3312 try:
3313 self._downloader.process_info(info)
3314 except UnavailableVideoError, err:
3315 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3316
3317
3318 class CollegeHumorIE(InfoExtractor):
3319 """Information extractor for collegehumor.com"""
3320
3321 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3322 IE_NAME = u'collegehumor'
3323
3324 def report_webpage(self, video_id):
3325 """Report information extraction."""
3326 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3327
3328 def report_extraction(self, video_id):
3329 """Report information extraction."""
3330 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3331
3332 def _real_extract(self, url):
3333 htmlParser = HTMLParser.HTMLParser()
3334
3335 mobj = re.match(self._VALID_URL, url)
3336 if mobj is None:
3337 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3338 return
3339 video_id = mobj.group('videoid')
3340
3341 self.report_webpage(video_id)
3342 request = urllib2.Request(url)
3343 try:
3344 webpage = urllib2.urlopen(request).read()
3345 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3346 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3347 return
3348
3349 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3350 if m is None:
3351 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3352 return
3353 internal_video_id = m.group('internalvideoid')
3354
3355 info = {
3356 'id': video_id,
3357 'internal_id': internal_video_id,
3358 }
3359
3360 self.report_extraction(video_id)
3361 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3362 try:
3363 metaXml = urllib2.urlopen(xmlUrl).read()
3364 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3365 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3366 return
3367
3368 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3369 try:
3370 videoNode = mdoc.findall('./video')[0]
3371 info['description'] = videoNode.findall('./description')[0].text
3372 info['title'] = videoNode.findall('./caption')[0].text
3373 info['stitle'] = _simplify_title(info['title'])
3374 info['url'] = videoNode.findall('./file')[0].text
3375 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3376 info['ext'] = info['url'].rpartition('.')[2]
3377 info['format'] = info['ext']
3378 except IndexError:
3379 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3380 return
3381
3382 self._downloader.increment_downloads()
3383
3384 try:
3385 self._downloader.process_info(info)
3386 except UnavailableVideoError, err:
3387 self._downloader.trouble(u'\nERROR: unable to download video')
3388
3389
3390 class XVideosIE(InfoExtractor):
3391 """Information extractor for xvideos.com"""
3392
3393 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3394 IE_NAME = u'xvideos'
3395
3396 def report_webpage(self, video_id):
3397 """Report information extraction."""
3398 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3399
3400 def report_extraction(self, video_id):
3401 """Report information extraction."""
3402 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3403
3404 def _real_extract(self, url):
3405 htmlParser = HTMLParser.HTMLParser()
3406
3407 mobj = re.match(self._VALID_URL, url)
3408 if mobj is None:
3409 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3410 return
3411 video_id = mobj.group(1).decode('utf-8')
3412
3413 self.report_webpage(video_id)
3414
3415 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3416 try:
3417 webpage = urllib2.urlopen(request).read()
3418 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3419 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3420 return
3421
3422 self.report_extraction(video_id)
3423
3424
3425 # Extract video URL
3426 mobj = re.search(r'flv_url=(.+?)&', webpage)
3427 if mobj is None:
3428 self._downloader.trouble(u'ERROR: unable to extract video url')
3429 return
3430 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3431
3432
3433 # Extract title
3434 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3435 if mobj is None:
3436 self._downloader.trouble(u'ERROR: unable to extract video title')
3437 return
3438 video_title = mobj.group(1).decode('utf-8')
3439
3440
3441 # Extract video thumbnail
3442 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3443 if mobj is None:
3444 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3445 return
3446 video_thumbnail = mobj.group(1).decode('utf-8')
3447
3448
3449
3450 self._downloader.increment_downloads()
3451 info = {
3452 'id': video_id,
3453 'url': video_url,
3454 'uploader': None,
3455 'upload_date': None,
3456 'title': video_title,
3457 'stitle': _simplify_title(video_title),
3458 'ext': 'flv',
3459 'format': 'flv',
3460 'thumbnail': video_thumbnail,
3461 'description': None,
3462 'player_url': None,
3463 }
3464
3465 try:
3466 self._downloader.process_info(info)
3467 except UnavailableVideoError, err:
3468 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3469
3470
3471 class SoundcloudIE(InfoExtractor):
3472 """Information extractor for soundcloud.com
3473 To access the media, the uid of the song and a stream token
3474 must be extracted from the page source and the script must make
3475 a request to media.soundcloud.com/crossdomain.xml. Then
3476 the media can be grabbed by requesting from an url composed
3477 of the stream token and uid
3478 """
3479
3480 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3481 IE_NAME = u'soundcloud'
3482
3483 def __init__(self, downloader=None):
3484 InfoExtractor.__init__(self, downloader)
3485
3486 def report_webpage(self, video_id):
3487 """Report information extraction."""
3488 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3489
3490 def report_extraction(self, video_id):
3491 """Report information extraction."""
3492 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3493
3494 def _real_extract(self, url):
3495 htmlParser = HTMLParser.HTMLParser()
3496
3497 mobj = re.match(self._VALID_URL, url)
3498 if mobj is None:
3499 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3500 return
3501
3502 # extract uploader (which is in the url)
3503 uploader = mobj.group(1).decode('utf-8')
3504 # extract simple title (uploader + slug of song title)
3505 slug_title = mobj.group(2).decode('utf-8')
3506 simple_title = uploader + '-' + slug_title
3507
3508 self.report_webpage('%s/%s' % (uploader, slug_title))
3509
3510 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3511 try:
3512 webpage = urllib2.urlopen(request).read()
3513 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3514 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3515 return
3516
3517 self.report_extraction('%s/%s' % (uploader, slug_title))
3518
3519 # extract uid and stream token that soundcloud hands out for access
3520 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3521 if mobj:
3522 video_id = mobj.group(1)
3523 stream_token = mobj.group(2)
3524
3525 # extract unsimplified title
3526 mobj = re.search('"title":"(.*?)",', webpage)
3527 if mobj:
3528 title = mobj.group(1)
3529
3530 # construct media url (with uid/token)
3531 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3532 mediaURL = mediaURL % (video_id, stream_token)
3533
3534 # description
3535 description = u'No description available'
3536 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3537 if mobj:
3538 description = mobj.group(1)
3539
3540 # upload date
3541 upload_date = None
3542 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3543 if mobj:
3544 try:
3545 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3546 except Exception, e:
3547 print str(e)
3548
3549 # for soundcloud, a request to a cross domain is required for cookies
3550 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3551
3552 try:
3553 self._downloader.process_info({
3554 'id': video_id.decode('utf-8'),
3555 'url': mediaURL,
3556 'uploader': uploader.decode('utf-8'),
3557 'upload_date': upload_date,
3558 'title': simple_title.decode('utf-8'),
3559 'stitle': simple_title.decode('utf-8'),
3560 'ext': u'mp3',
3561 'format': u'NA',
3562 'player_url': None,
3563 'description': description.decode('utf-8')
3564 })
3565 except UnavailableVideoError:
3566 self._downloader.trouble(u'\nERROR: unable to download video')
3567
3568
3569 class InfoQIE(InfoExtractor):
3570 """Information extractor for infoq.com"""
3571
3572 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
3573 IE_NAME = u'infoq'
3574
3575 def report_webpage(self, video_id):
3576 """Report information extraction."""
3577 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3578
3579 def report_extraction(self, video_id):
3580 """Report information extraction."""
3581 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3582
3583 def _real_extract(self, url):
3584 htmlParser = HTMLParser.HTMLParser()
3585
3586 mobj = re.match(self._VALID_URL, url)
3587 if mobj is None:
3588 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3589 return
3590
3591 self.report_webpage(url)
3592
3593 request = urllib2.Request(url)
3594 try:
3595 webpage = urllib2.urlopen(request).read()
3596 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3597 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3598 return
3599
3600 self.report_extraction(url)
3601
3602
3603 # Extract video URL
3604 mobj = re.search(r"jsclassref='([^']*)'", webpage)
3605 if mobj is None:
3606 self._downloader.trouble(u'ERROR: unable to extract video url')
3607 return
3608 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
3609
3610
3611 # Extract title
3612 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
3613 if mobj is None:
3614 self._downloader.trouble(u'ERROR: unable to extract video title')
3615 return
3616 video_title = mobj.group(1).decode('utf-8')
3617
3618 # Extract description
3619 video_description = u'No description available.'
3620 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
3621 if mobj is not None:
3622 video_description = mobj.group(1).decode('utf-8')
3623
3624 video_filename = video_url.split('/')[-1]
3625 video_id, extension = video_filename.split('.')
3626
3627 self._downloader.increment_downloads()
3628 info = {
3629 'id': video_id,
3630 'url': video_url,
3631 'uploader': None,
3632 'upload_date': None,
3633 'title': video_title,
3634 'stitle': _simplify_title(video_title),
3635 'ext': extension,
3636 'format': extension, # Extension is always(?) mp4, but seems to be flv
3637 'thumbnail': None,
3638 'description': video_description,
3639 'player_url': None,
3640 }
3641
3642 try:
3643 self._downloader.process_info(info)
3644 except UnavailableVideoError, err:
3645 self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3646
3647 class MixcloudIE(InfoExtractor):
3648 """Information extractor for www.mixcloud.com"""
3649 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
3650 IE_NAME = u'mixcloud'
3651
3652 def __init__(self, downloader=None):
3653 InfoExtractor.__init__(self, downloader)
3654
3655 def report_download_json(self, file_id):
3656 """Report JSON download."""
3657 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
3658
3659 def report_extraction(self, file_id):
3660 """Report information extraction."""
3661 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3662
3663 def get_urls(self, jsonData, fmt, bitrate='best'):
3664 """Get urls from 'audio_formats' section in json"""
3665 file_url = None
3666 try:
3667 bitrate_list = jsonData[fmt]
3668 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
3669 bitrate = max(bitrate_list) # select highest
3670
3671 url_list = jsonData[fmt][bitrate]
3672 except TypeError: # we have no bitrate info.
3673 url_list = jsonData[fmt]
3674
3675 return url_list
3676
3677 def check_urls(self, url_list):
3678 """Returns 1st active url from list"""
3679 for url in url_list:
3680 try:
3681 urllib2.urlopen(url)
3682 return url
3683 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3684 url = None
3685
3686 return None
3687
3688 def _print_formats(self, formats):
3689 print 'Available formats:'
3690 for fmt in formats.keys():
3691 for b in formats[fmt]:
3692 try:
3693 ext = formats[fmt][b][0]
3694 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
3695 except TypeError: # we have no bitrate info
3696 ext = formats[fmt][0]
3697 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
3698 break
3699
3700 def _real_extract(self, url):
3701 mobj = re.match(self._VALID_URL, url)
3702 if mobj is None:
3703 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3704 return
3705 # extract uploader & filename from url
3706 uploader = mobj.group(1).decode('utf-8')
3707 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3708
3709 # construct API request
3710 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3711 # retrieve .json file with links to files
3712 request = urllib2.Request(file_url)
3713 try:
3714 self.report_download_json(file_url)
3715 jsonData = urllib2.urlopen(request).read()
3716 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3717 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
3718 return
3719
3720 # parse JSON
3721 json_data = json.loads(jsonData)
3722 player_url = json_data['player_swf_url']
3723 formats = dict(json_data['audio_formats'])
3724
3725 req_format = self._downloader.params.get('format', None)
3726 bitrate = None
3727
3728 if self._downloader.params.get('listformats', None):
3729 self._print_formats(formats)
3730 return
3731
3732 if req_format is None or req_format == 'best':
3733 for format_param in formats.keys():
3734 url_list = self.get_urls(formats, format_param)
3735 # check urls
3736 file_url = self.check_urls(url_list)
3737 if file_url is not None:
3738 break # got it!
3739 else:
3740 if req_format not in formats.keys():
3741 self._downloader.trouble(u'ERROR: format is not available')
3742 return
3743
3744 url_list = self.get_urls(formats, req_format)
3745 file_url = self.check_urls(url_list)
3746 format_param = req_format
3747
3748 # We have audio
3749 self._downloader.increment_downloads()
3750 try:
3751 # Process file information
3752 self._downloader.process_info({
3753 'id': file_id.decode('utf-8'),
3754 'url': file_url.decode('utf-8'),
3755 'uploader': uploader.decode('utf-8'),
3756 'upload_date': u'NA',
3757 'title': json_data['name'],
3758 'stitle': _simplify_title(json_data['name']),
3759 'ext': file_url.split('.')[-1].decode('utf-8'),
3760 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3761 'thumbnail': json_data['thumbnail_url'],
3762 'description': json_data['description'],
3763 'player_url': player_url.decode('utf-8'),
3764 })
3765 except UnavailableVideoError, err:
3766 self._downloader.trouble(u'ERROR: unable to download file')
3767
3768 class StanfordOpenClassroomIE(InfoExtractor):
3769 """Information extractor for Stanford's Open ClassRoom"""
3770
3771 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3772 IE_NAME = u'stanfordoc'
3773
3774 def report_download_webpage(self, objid):
3775 """Report information extraction."""
3776 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3777
3778 def report_extraction(self, video_id):
3779 """Report information extraction."""
3780 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3781
3782 def _real_extract(self, url):
3783 mobj = re.match(self._VALID_URL, url)
3784 if mobj is None:
3785 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3786 return
3787
3788 if mobj.group('course') and mobj.group('video'): # A specific video
3789 course = mobj.group('course')
3790 video = mobj.group('video')
3791 info = {
3792 'id': _simplify_title(course + '_' + video),
3793 }
3794
3795 self.report_extraction(info['id'])
3796 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3797 xmlUrl = baseUrl + video + '.xml'
3798 try:
3799 metaXml = urllib2.urlopen(xmlUrl).read()
3800 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3801 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
3802 return
3803 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3804 try:
3805 info['title'] = mdoc.findall('./title')[0].text
3806 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3807 except IndexError:
3808 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3809 return
3810 info['stitle'] = _simplify_title(info['title'])
3811 info['ext'] = info['url'].rpartition('.')[2]
3812 info['format'] = info['ext']
3813 self._downloader.increment_downloads()
3814 try:
3815 self._downloader.process_info(info)
3816 except UnavailableVideoError, err:
3817 self._downloader.trouble(u'\nERROR: unable to download video')
3818 elif mobj.group('course'): # A course page
3819 unescapeHTML = HTMLParser.HTMLParser().unescape
3820
3821 course = mobj.group('course')
3822 info = {
3823 'id': _simplify_title(course),
3824 'type': 'playlist',
3825 }
3826
3827 self.report_download_webpage(info['id'])
3828 try:
3829 coursepage = urllib2.urlopen(url).read()
3830 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3831 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3832 return
3833
3834 m = re.search('<h1>([^<]+)</h1>', coursepage)
3835 if m:
3836 info['title'] = unescapeHTML(m.group(1))
3837 else:
3838 info['title'] = info['id']
3839 info['stitle'] = _simplify_title(info['title'])
3840
3841 m = re.search('<description>([^<]+)</description>', coursepage)
3842 if m:
3843 info['description'] = unescapeHTML(m.group(1))
3844
3845 links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3846 info['list'] = [
3847 {
3848 'type': 'reference',
3849 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3850 }
3851 for vpage in links]
3852
3853 for entry in info['list']:
3854 assert entry['type'] == 'reference'
3855 self.extract(entry['url'])
3856 else: # Root page
3857 unescapeHTML = HTMLParser.HTMLParser().unescape
3858
3859 info = {
3860 'id': 'Stanford OpenClassroom',
3861 'type': 'playlist',
3862 }
3863
3864 self.report_download_webpage(info['id'])
3865 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3866 try:
3867 rootpage = urllib2.urlopen(rootURL).read()
3868 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3869 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
3870 return
3871
3872 info['title'] = info['id']
3873 info['stitle'] = _simplify_title(info['title'])
3874
3875 links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3876 info['list'] = [
3877 {
3878 'type': 'reference',
3879 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3880 }
3881 for cpage in links]
3882
3883 for entry in info['list']:
3884 assert entry['type'] == 'reference'
3885 self.extract(entry['url'])
3886
3887
class PostProcessor(object):
    """Base class for post processors.

    A downloader keeps a chain of PostProcessor objects, which are
    registered through its add_post_processor() method. Once a download
    has finished successfully, the downloader calls run() on each of them
    in turn: the first one receives an initial argument and every
    following one receives whatever the previous run() returned.

    The chain ends as soon as a run() call returns None, or when there
    are no post processors left.

    As with InfoExtractor objects, registration is mutual: the post
    processor also keeps a reference to its downloader.
    """

    # Downloader this post processor is attached to (None until one is set).
    _downloader = None

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Attach this post processor to the given downloader."""
        self._downloader = downloader

    def run(self, information):
        """Process one downloaded file.

        "information" is a dictionary like those built by InfoExtractors,
        with one additional field, "filepath", holding the location of
        the downloaded file.

        Return None to stop the postprocessing chain, or an information
        dictionary (possibly the received one with some fields changed)
        to be handed to the next post processor.

        A PostProcessingError may be raised; the calling downloader takes
        it into account.
        """
        # The base implementation leaves everything untouched.
        return information
3933
class AudioConversionError(Exception):
    """Raised when converting a downloaded file to audio fails.

    The human readable reason is available both as the "message"
    attribute and through str().
    """

    def __init__(self, message):
        # Initialize the base class as well, so that str(), repr() and
        # .args carry the message instead of being empty.
        Exception.__init__(self, message)
        self.message = message
3937
class FFmpegExtractAudioPP(PostProcessor):
    """Post processor that turns a downloaded file into an audio-only file.

    ffprobe is used to find out the audio codec of the file and ffmpeg to
    copy or convert the audio stream, depending on the preferred codec.
    """

    def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
        """Store the conversion preferences; a preferredcodec of None means 'best'."""
        PostProcessor.__init__(self, downloader)
        if preferredcodec is None:
            preferredcodec = 'best'
        self._preferredcodec = preferredcodec
        self._preferredquality = preferredquality
        self._keepvideo = keepvideo

    @staticmethod
    def get_audio_codec(path):
        """Return the audio codec name ffprobe reports for path.

        None is returned when ffprobe cannot be run, exits with a non-zero
        status, or reports no audio stream.
        """
        try:
            cmd = ['ffprobe', '-show_streams', '--', path]
            # ffprobe's stderr is discarded; close the handle afterwards
            # instead of leaking it.
            devnull = open(os.path.devnull, 'w')
            try:
                handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
                output = handle.communicate()[0]
            finally:
                devnull.close()
            if handle.wait() != 0:
                return None
        except (IOError, OSError):
            return None
        audio_codec = None
        # Remember the most recent codec_name and return it once the
        # matching stream turns out to be an audio stream.
        for line in output.split('\n'):
            if line.startswith('codec_name='):
                audio_codec = line.split('=')[1].strip()
            elif line.strip() == 'codec_type=audio' and audio_codec is not None:
                return audio_codec
        return None

    @staticmethod
    def run_ffmpeg(path, out_path, codec, more_opts):
        """Run ffmpeg to write the audio of path to out_path.

        codec is passed as -acodec unless it is None; more_opts are extra
        ffmpeg arguments. Raises AudioConversionError when ffmpeg is not
        found (errno 2) or exits with a non-zero status, in which case the
        last line of its stderr is the message. Other IOError/OSError
        exceptions are re-raised unchanged.
        """
        if codec is None:
            acodec_opts = []
        else:
            acodec_opts = ['-acodec', codec]
        cmd = ['ffmpeg', '-y', '-i', path, '-vn'] + acodec_opts + more_opts + ['--', out_path]
        try:
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            stdout,stderr = p.communicate()
        except (IOError, OSError):
            e = sys.exc_info()[1]
            if isinstance(e, OSError) and e.errno == 2:
                raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
            else:
                # Bare raise keeps the original traceback.
                raise
        if p.returncode != 0:
            msg = stderr.strip().split('\n')[-1]
            raise AudioConversionError(msg)

    def run(self, information):
        """Extract the audio of information['filepath'].

        Returns the information dictionary with 'filepath' pointing to the
        audio file, or None (stopping the postprocessing chain) when the
        codec cannot be determined, the conversion fails, or the original
        file cannot be removed.
        """
        path = information['filepath']

        filecodec = self.get_audio_codec(path)
        if filecodec is None:
            self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
            return None

        more_opts = []
        if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
            if self._preferredcodec == 'm4a' and filecodec == 'aac':
                # Lossless, but in another container
                acodec = 'copy'
                extension = self._preferredcodec
                more_opts = ['-absf', 'aac_adtstoasc']
            elif filecodec in ['aac', 'mp3', 'vorbis']:
                # Lossless if possible
                acodec = 'copy'
                extension = filecodec
                if filecodec == 'aac':
                    more_opts = ['-f', 'adts']
                if filecodec == 'vorbis':
                    extension = 'ogg'
            else:
                # MP3 otherwise.
                acodec = 'libmp3lame'
                extension = 'mp3'
                more_opts = []
                if self._preferredquality is not None:
                    more_opts += ['-ab', self._preferredquality]
        else:
            # We convert the audio (lossy)
            acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
            extension = self._preferredcodec
            more_opts = []
            if self._preferredquality is not None:
                more_opts += ['-ab', self._preferredquality]
            if self._preferredcodec == 'aac':
                more_opts += ['-f', 'adts']
            if self._preferredcodec == 'm4a':
                more_opts += ['-absf', 'aac_adtstoasc']
            if self._preferredcodec == 'vorbis':
                extension = 'ogg'
            if self._preferredcodec == 'wav':
                extension = 'wav'
                more_opts += ['-f', 'wav']

        prefix = os.path.splitext(path)[0]
        new_path = prefix + '.' + extension
        if new_path == path:
            # Converting a file onto itself would make ffmpeg overwrite its
            # own input, and the cleanup below would then delete the only
            # copy. Keep the downloaded file as it is.
            self._downloader.to_screen(u'[ffmpeg] File already has the target name, not converting: %s' % path)
            return information

        self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
        try:
            self.run_ffmpeg(path, new_path, acodec, more_opts)
        except (AudioConversionError, Exception):
            # AudioConversionError is named explicitly so that it is caught
            # whatever its base class is, while KeyboardInterrupt and
            # SystemExit are left alone.
            e = sys.exc_info()[1]
            if isinstance(e, AudioConversionError):
                self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
            else:
                self._downloader.to_stderr(u'ERROR: error running ffmpeg')
            return None

        # Try to update the date time for extracted audio file.
        if information.get('filetime') is not None:
            try:
                os.utime(new_path, (time.time(), information['filetime']))
            except Exception:
                # Best effort only: a wrong timestamp is not worth failing for.
                self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

        if not self._keepvideo:
            try:
                os.remove(path)
            except (IOError, OSError):
                self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
                return None

        information['filepath'] = new_path
        return information
4062
4063
def updateSelf(downloader, filename):
    """Update the program file with the latest version from the repository.

    downloader is only used to print progress messages. filename is the
    program file to overwrite with the content found at UPDATE_URL.

    The program exits through sys.exit() with an error message when
    filename is not writable, when the download fails or does not look
    like the program (no __version__ assignment in it), or when filename
    cannot be overwritten. Nothing is written when the downloaded version
    equals the running one.
    """
    # Note: downloader only used for options
    if not os.access(filename, os.W_OK):
        sys.exit('ERROR: no write permissions on %s' % filename)

    downloader.to_screen('Updating to latest version...')

    try:
        # Opened before the try/finally: if opening fails there is no
        # handle to close, and closing an unbound name would replace the
        # IOError handled below with a NameError.
        urlh = urllib.urlopen(UPDATE_URL)
        try:
            newcontent = urlh.read()
        finally:
            urlh.close()
    except (IOError, OSError):
        sys.exit('ERROR: unable to download latest version')

    vmatch = re.search("__version__ = '([^']+)'", newcontent)
    if vmatch is None:
        # urllib.urlopen does not raise on HTTP errors, so the content may
        # be an error page; never overwrite the program with that.
        sys.exit('ERROR: unable to download latest version')
    if vmatch.group(1) == __version__:
        downloader.to_screen('youtube-dl is up-to-date (' + __version__ + ')')
        return

    try:
        outf = open(filename, 'wb')
        try:
            outf.write(newcontent)
        finally:
            outf.close()
    except (IOError, OSError):
        sys.exit('ERROR: unable to overwrite current version')

    downloader.to_screen('Updated youtube-dl. Restart youtube-dl to use the new version.')
4096
def parseOpts():
    """Build the option parser and parse the configuration files and command line.

    Arguments are taken, in this order, from /etc/youtube-dl.conf, from
    the user configuration file (youtube-dl.conf in $XDG_CONFIG_HOME, or
    in ~/.config when that variable is unset or empty) and from
    sys.argv[1:].

    Returns a (parser, opts, args) tuple.
    """
    # Deferred imports
    import getpass
    import optparse
    import shlex

    def _readOptions(filename):
        """Return the shell-split arguments found in filename ([] if it cannot be opened)."""
        try:
            optionf = open(filename)
        except IOError:
            return [] # silently skip if file is not present
        try:
            res = []
            for l in optionf:
                res += shlex.split(l, comments=True)
        finally:
            optionf.close()
        return res

    def _format_option_string(option):
        """Format an option for the help text, e.g. ('-o', '--option') -> -o, --option METAVAR"""

        opts = []

        if option._short_opts:
            opts.append(option._short_opts[0])
        if option._long_opts:
            opts.append(option._long_opts[0])
        if len(opts) > 1:
            opts.insert(1, ', ')

        if option.takes_value():
            opts.append(' %s' % option.metavar)

        return "".join(opts)

    def _find_term_columns():
        """Return the terminal width from $COLUMNS or 'stty size', or None if unknown."""
        columns = os.environ.get('COLUMNS', None)
        if columns:
            try:
                return int(columns)
            except ValueError:
                pass # malformed COLUMNS: ask the terminal instead of crashing

        try:
            sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            out,err = sp.communicate()
            return int(out.split()[1])
        except (IOError, OSError, ValueError, IndexError):
            # stty missing, not a terminal, or unexpected output
            pass
        return None

    max_width = 80
    max_help_position = 80

    # No need to wrap help messages if we're on a wide console
    columns = _find_term_columns()
    if columns:
        max_width = columns

    fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
    fmt.format_option_strings = _format_option_string

    kw = {
        'version' : __version__,
        'formatter' : fmt,
        'usage' : '%prog [options] url [url...]',
        'conflict_handler' : 'resolve',
    }

    parser = optparse.OptionParser(**kw)

    # option groups
    general = optparse.OptionGroup(parser, 'General Options')
    selection = optparse.OptionGroup(parser, 'Video Selection')
    authentication = optparse.OptionGroup(parser, 'Authentication Options')
    video_format = optparse.OptionGroup(parser, 'Video Format Options')
    postproc = optparse.OptionGroup(parser, 'Post-processing Options')
    filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
    verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

    general.add_option('-h', '--help',
            action='help', help='print this help text and exit')
    general.add_option('-v', '--version',
            action='version', help='print program version and exit')
    general.add_option('-U', '--update',
            action='store_true', dest='update_self', help='update this program to latest version')
    general.add_option('-i', '--ignore-errors',
            action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
    general.add_option('-r', '--rate-limit',
            dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
    general.add_option('-R', '--retries',
            dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
    general.add_option('--dump-user-agent',
            action='store_true', dest='dump_user_agent',
            help='display the current browser identification', default=False)
    general.add_option('--list-extractors',
            action='store_true', dest='list_extractors',
            help='List all supported extractors and the URLs they would handle', default=False)

    selection.add_option('--playlist-start',
            dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
    selection.add_option('--playlist-end',
            dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
    selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
    selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
    selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)

    authentication.add_option('-u', '--username',
            dest='username', metavar='USERNAME', help='account username')
    authentication.add_option('-p', '--password',
            dest='password', metavar='PASSWORD', help='account password')
    authentication.add_option('-n', '--netrc',
            action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)


    video_format.add_option('-f', '--format',
            action='store', dest='format', metavar='FORMAT', help='video format code')
    video_format.add_option('--all-formats',
            action='store_const', dest='format', help='download all available video formats', const='all')
    video_format.add_option('--prefer-free-formats',
            action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
    video_format.add_option('--max-quality',
            action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
    video_format.add_option('-F', '--list-formats',
            action='store_true', dest='listformats', help='list all available formats (currently youtube only)')


    verbosity.add_option('-q', '--quiet',
            action='store_true', dest='quiet', help='activates quiet mode', default=False)
    verbosity.add_option('-s', '--simulate',
            action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
    verbosity.add_option('--skip-download',
            action='store_true', dest='skip_download', help='do not download the video', default=False)
    verbosity.add_option('-g', '--get-url',
            action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
    verbosity.add_option('-e', '--get-title',
            action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
    verbosity.add_option('--get-thumbnail',
            action='store_true', dest='getthumbnail',
            help='simulate, quiet but print thumbnail URL', default=False)
    verbosity.add_option('--get-description',
            action='store_true', dest='getdescription',
            help='simulate, quiet but print video description', default=False)
    verbosity.add_option('--get-filename',
            action='store_true', dest='getfilename',
            help='simulate, quiet but print output filename', default=False)
    verbosity.add_option('--get-format',
            action='store_true', dest='getformat',
            help='simulate, quiet but print output format', default=False)
    verbosity.add_option('--no-progress',
            action='store_true', dest='noprogress', help='do not print progress bar', default=False)
    verbosity.add_option('--console-title',
            action='store_true', dest='consoletitle',
            help='display progress in console titlebar', default=False)


    filesystem.add_option('-t', '--title',
            action='store_true', dest='usetitle', help='use title in file name', default=False)
    filesystem.add_option('-l', '--literal',
            action='store_true', dest='useliteral', help='use literal title in file name', default=False)
    filesystem.add_option('-A', '--auto-number',
            action='store_true', dest='autonumber',
            help='number downloaded files starting from 00000', default=False)
    filesystem.add_option('-o', '--output',
            dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
    filesystem.add_option('-a', '--batch-file',
            dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
    filesystem.add_option('-w', '--no-overwrites',
            action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
    filesystem.add_option('-c', '--continue',
            action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
    filesystem.add_option('--no-continue',
            action='store_false', dest='continue_dl',
            help='do not resume partially downloaded files (restart from beginning)')
    filesystem.add_option('--cookies',
            dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
    filesystem.add_option('--no-part',
            action='store_true', dest='nopart', help='do not use .part files', default=False)
    filesystem.add_option('--no-mtime',
            action='store_false', dest='updatetime',
            help='do not use the Last-modified header to set the file modification time', default=True)
    filesystem.add_option('--write-description',
            action='store_true', dest='writedescription',
            help='write video description to a .description file', default=False)
    filesystem.add_option('--write-info-json',
            action='store_true', dest='writeinfojson',
            help='write video metadata to a .info.json file', default=False)


    postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
            help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
    postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
            help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
    postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
            help='ffmpeg audio bitrate specification, 128k by default')
    postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
            help='keeps the video file on disk after the post-processing; the video is erased by default')


    # The groups show up in the help text in the order they are added here.
    parser.add_option_group(general)
    parser.add_option_group(selection)
    parser.add_option_group(filesystem)
    parser.add_option_group(verbosity)
    parser.add_option_group(video_format)
    parser.add_option_group(authentication)
    parser.add_option_group(postproc)

    xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
    if xdg_config_home:
        userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
    else:
        userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
    # Later arguments win, so the command line overrides the user file,
    # which in turn overrides the system-wide file.
    argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
    opts, args = parser.parse_args(argv)

    return parser, opts, args
4306
def gen_extractors():
    """Build the list of supported extractors, one instance of each.

    The position in the list is significant: a URL is handled by the
    first extractor that matches it.
    """
    # These instances are shared with the extractors built on top of them.
    youtube_ie = YoutubeIE()
    google_ie = GoogleIE()
    yahoo_ie = YahooIE()

    extractors = []
    add = extractors.append

    add(YoutubePlaylistIE(youtube_ie))
    add(YoutubeUserIE(youtube_ie))
    add(YoutubeSearchIE(youtube_ie))
    add(youtube_ie)
    add(MetacafeIE(youtube_ie))
    add(DailymotionIE())
    add(google_ie)
    add(GoogleSearchIE(google_ie))
    add(PhotobucketIE())
    add(yahoo_ie)
    add(YahooSearchIE(yahoo_ie))
    add(DepositFilesIE())
    add(FacebookIE())
    add(BlipTVIE())
    add(VimeoIE())
    add(MyVideoIE())
    add(ComedyCentralIE())
    add(EscapistIE())
    add(CollegeHumorIE())
    add(XVideosIE())
    add(SoundcloudIE())
    add(InfoQIE())
    add(MixcloudIE())
    add(StanfordOpenClassroomIE())

    # Last in the list, after all the site-specific extractors.
    add(GenericIE())

    return extractors
4342
def _real_main():
    """Run one command-line invocation of the program.

    Parses the options, opens the cookie jar, installs the global urllib2
    opener, validates option combinations, builds a FileDownloader with
    every extractor (plus the audio post-processor when requested) and
    downloads all URLs taken from the batch file and the command line.

    On the paths visible here the function never returns normally: it ends
    in sys.exit() or parser.error(). The final exit status is whatever
    fd.download() returned, or 101 when MaxDownloadsReached was raised.
    """
    # Imported locally: the import block at the top of the file does not
    # list getpass, so the password prompt below would hit a NameError.
    import getpass

    parser, opts, args = parseOpts()

    # Open appropriate CookieJar
    if opts.cookiefile is None:
        jar = cookielib.CookieJar()
    else:
        try:
            jar = cookielib.MozillaCookieJar(opts.cookiefile)
            # Only load when the file already exists and is readable; a
            # missing file is not an error here.
            if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
                jar.load()
        except (IOError, OSError):
            sys.exit(u'ERROR: unable to open cookie file')

    # Dump user agent
    if opts.dump_user_agent:
        print(std_headers['User-Agent'])
        sys.exit(0)

    # Batch file verification
    batchurls = []
    if opts.batchfile is not None:
        try:
            if opts.batchfile == '-':
                batchfd = sys.stdin
            else:
                batchfd = open(opts.batchfile, 'r')
            try:
                batchurls = batchfd.readlines()
            finally:
                # Release the handle we opened ourselves; never close stdin.
                if batchfd is not sys.stdin:
                    batchfd.close()
            batchurls = [x.strip() for x in batchurls]
            # Drop empty lines and lines starting with '#', '/' or ';'.
            batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
        except IOError:
            sys.exit(u'ERROR: batch file could not be read')
    all_urls = batchurls + args

    # General configuration
    cookie_processor = urllib2.HTTPCookieProcessor(jar)
    opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler())
    urllib2.install_opener(opener)
    socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

    extractors = gen_extractors()

    if opts.list_extractors:
        # Print every extractor name followed by the given URLs it claims.
        # A URL is listed under the first extractor that accepts it only.
        for ie in extractors:
            print(ie.IE_NAME)
            matched_urls = [url for url in all_urls if ie.suitable(url)]
            all_urls = [url for url in all_urls if url not in matched_urls]
            for mu in matched_urls:
                print(u' ' + mu)
        sys.exit(0)

    # Conflicting, missing and erroneous options
    if opts.usenetrc and (opts.username is not None or opts.password is not None):
        parser.error(u'using .netrc conflicts with giving username/password')
    if opts.password is not None and opts.username is None:
        parser.error(u'account username missing')
    if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
        parser.error(u'using output template conflicts with using title, literal title or auto number')
    if opts.usetitle and opts.useliteral:
        parser.error(u'using title conflicts with using literal title')
    if opts.username is not None and opts.password is None:
        opts.password = getpass.getpass(u'Type account password and press return:')
    if opts.ratelimit is not None:
        numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
        if numeric_limit is None:
            parser.error(u'invalid rate limit specified')
        opts.ratelimit = numeric_limit
    if opts.retries is not None:
        try:
            opts.retries = long(opts.retries)
        except (TypeError, ValueError):
            parser.error(u'invalid retry count specified')
    try:
        opts.playliststart = int(opts.playliststart)
        if opts.playliststart <= 0:
            raise ValueError(u'Playlist start must be positive')
    except (TypeError, ValueError):
        parser.error(u'invalid playlist start number specified')
    try:
        opts.playlistend = int(opts.playlistend)
        # -1 is accepted as-is; any other value must be positive and not
        # smaller than the playlist start.
        if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
            raise ValueError(u'Playlist end must be greater than playlist start')
    except (TypeError, ValueError):
        parser.error(u'invalid playlist end number specified')
    if opts.extractaudio:
        if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
            parser.error(u'invalid audio format specified')

    # File downloader
    fd = FileDownloader({
        'usenetrc': opts.usenetrc,
        'username': opts.username,
        'password': opts.password,
        'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
        'forceurl': opts.geturl,
        'forcetitle': opts.gettitle,
        'forcethumbnail': opts.getthumbnail,
        'forcedescription': opts.getdescription,
        'forcefilename': opts.getfilename,
        'forceformat': opts.getformat,
        'simulate': opts.simulate,
        'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
        'format': opts.format,
        'format_limit': opts.format_limit,
        'listformats': opts.listformats,
        # The first truthy alternative wins: an explicit template, then the
        # '-1' format variants, then title/literal/autonumber combinations,
        # and finally the plain id-based default.
        'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
            or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
            or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
            or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
            or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
            or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
            or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
            or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
            or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
            or u'%(id)s.%(ext)s'),
        'ignoreerrors': opts.ignoreerrors,
        'ratelimit': opts.ratelimit,
        'nooverwrites': opts.nooverwrites,
        'retries': opts.retries,
        'continuedl': opts.continue_dl,
        'noprogress': opts.noprogress,
        'playliststart': opts.playliststart,
        'playlistend': opts.playlistend,
        'logtostderr': opts.outtmpl == '-',
        'consoletitle': opts.consoletitle,
        'nopart': opts.nopart,
        'updatetime': opts.updatetime,
        'writedescription': opts.writedescription,
        'writeinfojson': opts.writeinfojson,
        'matchtitle': opts.matchtitle,
        'rejecttitle': opts.rejecttitle,
        'max_downloads': opts.max_downloads,
        'prefer_free_formats': opts.prefer_free_formats,
        })
    for extractor in extractors:
        fd.add_info_extractor(extractor)

    # PostProcessors
    if opts.extractaudio:
        fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

    # Update version
    if opts.update_self:
        updateSelf(fd, sys.argv[0])

    # Maybe do nothing
    if not all_urls:
        if not opts.update_self:
            parser.error(u'you must provide at least one URL')
        else:
            # A bare self-update needs no URLs.
            sys.exit()

    try:
        retcode = fd.download(all_urls)
    except MaxDownloadsReached:
        fd.to_screen(u'--max-download limit reached, aborting.')
        retcode = 101

    # Dump cookie jar if requested
    if opts.cookiefile is not None:
        try:
            jar.save()
        except (IOError, OSError):
            sys.exit(u'ERROR: unable to save cookie jar')

    sys.exit(retcode)
4509
def main():
    """Run _real_main() and turn the known failure exceptions into an exit.

    DownloadError exits with status 1; SameFileError and KeyboardInterrupt
    exit with an error message passed to sys.exit().
    """
    try:
        _real_main()
    except DownloadError:
        exit_arg = 1
    except SameFileError:
        exit_arg = u'ERROR: fixed output name but more than one file to download'
    except KeyboardInterrupt:
        exit_arg = u'\nERROR: Interrupted by user'
    else:
        # Nothing was raised: simply fall out of main().
        return
    sys.exit(exit_arg)
4519
# Only start the program when this file is executed as a script.
if __name__ == '__main__':
    main()
4522
4523 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: