Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/__init__.py
Update the manpage to cite which sites youtube-dl supports.
[youtubedl] / youtube_dl / __init__.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
# Names credited as authors of this module.
__authors__ = (
    'Ricardo Garcia Gonzalez',
    'Danny Colligan',
    'Benjamin Johnson',
    'Vasyl\' Vavrychuk',
    'Witold Baryluk',
    'Paweł Paprota',
    'Gergely Imreh',
    'Rogério Brito',
    'Philipp Hagemeister',
    'Sören Schulze',
    'Kevin Ngo',
    'Ori Avtalion',
    'shizeeg',
    )

__license__ = 'Public Domain'
# Release identifier; appears to be a YYYY.MM.DD date.
__version__ = '2012.02.27'

# Raw URL of the youtube-dl script on GitHub.
# NOTE(review): presumably fetched by a self-update feature -- the consumer
# is not visible in this part of the file.
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
24
25
26 import cookielib
27 import datetime
28 import getpass
29 import gzip
30 import htmlentitydefs
31 import HTMLParser
32 import httplib
33 import locale
34 import math
35 import netrc
36 import optparse
37 import os
38 import os.path
39 import re
40 import shlex
41 import socket
42 import string
43 import subprocess
44 import sys
45 import time
46 import urllib
47 import urllib2
48 import warnings
49 import zlib
50
51 if os.name == 'nt':
52 import ctypes
53
54 try:
55 import email.utils
56 except ImportError: # Python 2.4
57 import email.Utils
58 try:
59 import cStringIO as StringIO
60 except ImportError:
61 import StringIO
62
63 # parse_qs was moved from the cgi module to the urlparse module recently.
64 try:
65 from urlparse import parse_qs
66 except ImportError:
67 from cgi import parse_qs
68
69 try:
70 import lxml.etree
71 except ImportError:
72 pass # Handled below
73
74 try:
75 import xml.etree.ElementTree
76 except ImportError: # Python<2.5: Not officially supported, but let it slip
77 warnings.warn('xml.etree.ElementTree support is missing. Consider upgrading to Python >= 2.5 if you get related errors.')
78
# Default HTTP request headers. YoutubeDLHandler.http_request() sets every
# one of these on each outgoing request, replacing a header of the same name
# if the request already carries one. 'Accept-Encoding' is dropped again for
# requests that were marked with the 'Youtubedl-No-Compression' header.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
86
# Use the standard json module when it exists; otherwise define a minimal
# stand-in class named ``json`` that only offers loads() (no dump/dumps).
try:
    import json
except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
    import re
    class json(object):
        """Minimal JSON decoder used when the stdlib json module is missing."""

        @staticmethod
        def loads(s):
            """Decode the UTF-8 encoded JSON document ``s``.

            Returns the decoded value (dict, list, unicode string, int,
            float, True, False or None). Raises ValueError on malformed
            input or when data follows the top-level value.

            Every nested parse* helper takes the index ``i`` at which its
            value starts and returns ``(index_after_value, value)``.
            """
            s = s.decode('UTF-8')
            def raiseError(msg, i):
                # Uniform error including the position and the unparsed rest.
                raise ValueError(msg + ' at position ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]))
            def skipSpace(i, expectMore=True):
                # Advance past JSON whitespace; with expectMore, reaching the
                # end of the input is an error.
                while i < len(s) and s[i] in ' \t\r\n':
                    i += 1
                if expectMore:
                    if i >= len(s):
                        raiseError('Premature end', i)
                return i
            def decodeEscape(match):
                # re.sub() callback: translate one backslash escape (the text
                # after the backslash is group 1) into its character.
                esc = match.group(1)
                _STATIC = {
                    '"': '"',
                    '\\': '\\',
                    '/': '/',
                    'b': unichr(0x8),
                    'f': unichr(0xc),
                    'n': '\n',
                    'r': '\r',
                    't': '\t',
                }
                if esc in _STATIC:
                    return _STATIC[esc]
                if esc[0] == 'u':
                    if len(esc) == 1+4:
                        # Plain \uXXXX escape.
                        return unichr(int(esc[1:5], 16))
                    if len(esc) == 5+6 and esc[5:7] == '\\u':
                        # \uD8xx\uDCxx surrogate pair: combine both halves
                        # into a single code point.
                        hi = int(esc[1:5], 16)
                        low = int(esc[7:11], 16)
                        return unichr((hi - 0xd800) * 0x400 + low - 0xdc00 + 0x10000)
                raise ValueError('Unknown escape ' + str(esc))
            def parseString(i):
                i += 1
                e = i
                # Find the closing quote: a quote preceded by an odd number
                # of backslashes is escaped, so keep searching after it.
                while True:
                    e = s.index('"', e)
                    bslashes = 0
                    while s[e-bslashes-1] == '\\':
                        bslashes += 1
                    if bslashes % 2 == 1:
                        e += 1
                        continue
                    break
                rexp = re.compile(r'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
                stri = rexp.sub(decodeEscape, s[i:e])
                return (e+1,stri)
            def parseObj(i):
                i += 1
                res = {}
                i = skipSpace(i)
                if s[i] == '}': # Empty dictionary
                    return (i+1,res)
                # Repeated  "key" : value  pairs separated by commas.
                while True:
                    if s[i] != '"':
                        raiseError('Expected a string object key', i)
                    i,key = parseString(i)
                    i = skipSpace(i)
                    if i >= len(s) or s[i] != ':':
                        raiseError('Expected a colon', i)
                    i,val = parse(i+1)
                    res[key] = val
                    i = skipSpace(i)
                    if s[i] == '}':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected comma or closing curly brace', i)
                    i = skipSpace(i+1)
            def parseArray(i):
                res = []
                i = skipSpace(i+1)
                if s[i] == ']': # Empty array
                    return (i+1,res)
                while True:
                    i,val = parse(i)
                    res.append(val)
                    i = skipSpace(i) # Raise exception if premature end
                    if s[i] == ']':
                        return (i+1, res)
                    if s[i] != ',':
                        raiseError('Expected a comma or closing bracket', i)
                    i = skipSpace(i+1)
            def parseDiscrete(i):
                # The literals true / false / null.
                for k,v in {'true': True, 'false': False, 'null': None}.items():
                    if s.startswith(k, i):
                        return (i+len(k), v)
                raiseError('Not a boolean (or null)', i)
            def parseNumber(i):
                mobj = re.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s[i:])
                if mobj is None:
                    raiseError('Not a number', i)
                nums = mobj.group(1)
                # A fraction or an exponent makes the value a float.
                if '.' in nums or 'e' in nums or 'E' in nums:
                    return (i+len(nums), float(nums))
                return (i+len(nums), int(nums))
            # Dispatch on the first character of a value; anything not
            # listed here is handed to parseNumber.
            CHARMAP = {'{': parseObj, '[': parseArray, '"': parseString, 't': parseDiscrete, 'f': parseDiscrete, 'n': parseDiscrete}
            def parse(i):
                i = skipSpace(i)
                i,res = CHARMAP.get(s[i], parseNumber)(i)
                i = skipSpace(i, False)
                return (i,res)
            i,res = parse(0)
            if i < len(s):
                raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')')
            return res
199
def preferredencoding():
    """Get preferred encoding.

    Returns the encoding reported by locale.getpreferredencoding() if it
    names a codec that can actually encode text, and 'UTF-8' otherwise.
    """
    try:
        pref = locale.getpreferredencoding()
        # The locale may report a codec Python does not know; a trial
        # encode catches that here instead of at the first real write.
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (locale.Error, LookupError, ...), but unlike a
        # bare except it lets KeyboardInterrupt/SystemExit through.
        pref = 'UTF-8'
    return pref
215
216
217 def htmlentity_transform(matchobj):
218 """Transforms an HTML entity to a Unicode character.
219
220 This function receives a match object and is intended to be used with
221 the re.sub() function.
222 """
223 entity = matchobj.group(1)
224
225 # Known non-numeric HTML entity
226 if entity in htmlentitydefs.name2codepoint:
227 return unichr(htmlentitydefs.name2codepoint[entity])
228
229 # Unicode character
230 mobj = re.match(ur'(?u)#(x?\d+)', entity)
231 if mobj is not None:
232 numstr = mobj.group(1)
233 if numstr.startswith(u'x'):
234 base = 16
235 numstr = u'0%s' % numstr
236 else:
237 base = 10
238 return unichr(long(numstr, base))
239
240 # Unknown entity in name, return its literal representation
241 return (u'&%s;' % entity)
242
243
244 def sanitize_title(utitle):
245 """Sanitizes a video title so it could be used as part of a filename."""
246 utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
247 return utitle.replace(unicode(os.sep), u'%')
248
249
250 def sanitize_open(filename, open_mode):
251 """Try to open the given filename, and slightly tweak it if this fails.
252
253 Attempts to open the given filename. If this fails, it tries to change
254 the filename slightly, step by step, until it's either able to open it
255 or it fails and raises a final exception, like the standard open()
256 function.
257
258 It returns the tuple (stream, definitive_file_name).
259 """
260 try:
261 if filename == u'-':
262 if sys.platform == 'win32':
263 import msvcrt
264 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
265 return (sys.stdout, filename)
266 stream = open(_encodeFilename(filename), open_mode)
267 return (stream, filename)
268 except (IOError, OSError), err:
269 # In case of error, try to remove win32 forbidden chars
270 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
271
272 # An exception here should be caught in the caller
273 stream = open(_encodeFilename(filename), open_mode)
274 return (stream, filename)
275
276
def timeconvert(timestr):
    """Convert an RFC 2822 date string into a system timestamp.

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
284
285 def _simplify_title(title):
286 expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
287 return expr.sub(u'_', title).strip(u'_')
288
289 def _orderedSet(iterable):
290 """ Remove all duplicates from the input iterable """
291 res = []
292 for el in iterable:
293 if el not in res:
294 res.append(el)
295 return res
296
297 def _unescapeHTML(s):
298 """
299 @param s a string (of type unicode)
300 """
301 assert type(s) == type(u'')
302
303 htmlParser = HTMLParser.HTMLParser()
304 return htmlParser.unescape(s)
305
306 def _encodeFilename(s):
307 """
308 @param s The name of the file (of type unicode)
309 """
310
311 assert type(s) == type(u'')
312
313 if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
314 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
315 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
316 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
317 return s
318 else:
319 return s.encode(sys.getfilesystemencoding(), 'ignore')
320
class DownloadError(Exception):
    """Raised when a download problem occurs and errors are not ignored.

    FileDownloader objects throw this unless they were configured to
    continue on errors; the exception carries the error message.
    """
329
330
class SameFileError(Exception):
    """Raised when several downloads would end up in one file.

    FileDownloader objects throw this when they detect that multiple files
    would have to be written to the same file on disk.
    """
338
339
class PostProcessingError(Exception):
    """Raised to signal a failure in a postprocessing task.

    A PostProcessor's .run() method may raise this exception.
    """
347
class MaxDownloadsReached(Exception):
    """ Raised once the --max-downloads limit has been reached. """
351
352
class UnavailableVideoError(Exception):
    """Raised when a requested format is unavailable.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
360
361
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.

    Attributes:
        downloaded: number of bytes actually received.
        expected: number of bytes the server announced.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Initialise the base class so that str(err) and tracebacks carry a
        # readable message instead of an empty string.
        Exception.__init__(
            self,
            'content too short (expected %s bytes and served %s)' % (expected, downloaded))
        self.downloaded = downloaded
        self.expected = expected
376
377
378 class YoutubeDLHandler(urllib2.HTTPHandler):
379 """Handler for HTTP requests and responses.
380
381 This class, when installed with an OpenerDirector, automatically adds
382 the standard headers to every HTTP request and handles gzipped and
383 deflated responses from web servers. If compression is to be avoided in
384 a particular request, the original request in the program code only has
385 to include the HTTP header "Youtubedl-No-Compression", which will be
386 removed before making the real request.
387
388 Part of this code was copied from:
389
390 http://techknack.net/python-urllib2-handlers/
391
392 Andrew Rowls, the author of that code, agreed to release it to the
393 public domain.
394 """
395
396 @staticmethod
397 def deflate(data):
398 try:
399 return zlib.decompress(data, -zlib.MAX_WBITS)
400 except zlib.error:
401 return zlib.decompress(data)
402
403 @staticmethod
404 def addinfourl_wrapper(stream, headers, url, code):
405 if hasattr(urllib2.addinfourl, 'getcode'):
406 return urllib2.addinfourl(stream, headers, url, code)
407 ret = urllib2.addinfourl(stream, headers, url)
408 ret.code = code
409 return ret
410
411 def http_request(self, req):
412 for h in std_headers:
413 if h in req.headers:
414 del req.headers[h]
415 req.add_header(h, std_headers[h])
416 if 'Youtubedl-no-compression' in req.headers:
417 if 'Accept-encoding' in req.headers:
418 del req.headers['Accept-encoding']
419 del req.headers['Youtubedl-no-compression']
420 return req
421
422 def http_response(self, req, resp):
423 old_resp = resp
424 # gzip
425 if resp.headers.get('Content-encoding', '') == 'gzip':
426 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
427 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
428 resp.msg = old_resp.msg
429 # deflate
430 if resp.headers.get('Content-encoding', '') == 'deflate':
431 gz = StringIO.StringIO(self.deflate(resp.read()))
432 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
433 resp.msg = old_resp.msg
434 return resp
435
436
437 class FileDownloader(object):
438 """File Downloader class.
439
440 File downloader objects are the ones responsible of downloading the
441 actual video file and writing it to disk if the user has requested
442 it, among some other tasks. In most cases there should be one per
443 program. As, given a video URL, the downloader doesn't know how to
444 extract all the needed information, task that InfoExtractors do, it
445 has to pass the URL to one of them.
446
447 For this, file downloader objects have a method that allows
448 InfoExtractors to be registered in a given order. When it is passed
449 a URL, the file downloader handles it to the first InfoExtractor it
450 finds that reports being able to handle it. The InfoExtractor extracts
451 all the information about the video or videos the URL refers to, and
452 asks the FileDownloader to process the video information, possibly
453 downloading the video.
454
455 File downloaders accept a lot of parameters. In order not to saturate
456 the object constructor with arguments, it receives a dictionary of
457 options instead. These options are available through the params
458 attribute for the InfoExtractors to use. The FileDownloader also
459 registers itself as the downloader in charge for the InfoExtractors
460 that are added to it, so this is a "mutual registration".
461
462 Available options:
463
464 username: Username for authentication purposes.
465 password: Password for authentication purposes.
466 usenetrc: Use netrc for authentication instead.
467 quiet: Do not print messages to stdout.
468 forceurl: Force printing final URL.
469 forcetitle: Force printing title.
470 forcethumbnail: Force printing thumbnail URL.
471 forcedescription: Force printing description.
472 forcefilename: Force printing final filename.
473 simulate: Do not download the video files.
474 format: Video format code.
475 format_limit: Highest quality format to try.
476 outtmpl: Template for output names.
477 ignoreerrors: Do not stop on download errors.
478 ratelimit: Download speed limit, in bytes/sec.
479 nooverwrites: Prevent overwriting files.
480 retries: Number of times to retry for HTTP error 5xx
481 continuedl: Try to continue downloads if possible.
482 noprogress: Do not print the progress bar.
483 playliststart: Playlist item to start at.
484 playlistend: Playlist item to end at.
485 matchtitle: Download only matching titles.
486 rejecttitle: Reject downloads for matching titles.
487 logtostderr: Log messages to stderr instead of stdout.
488 consoletitle: Display progress in console window's titlebar.
489 nopart: Do not use temporary .part files.
490 updatetime: Use the Last-modified header to set output file timestamps.
491 writedescription: Write the video description to a .description file
492 writeinfojson: Write the video description to a .info.json file
493 """
494
495 params = None
496 _ies = []
497 _pps = []
498 _download_retcode = None
499 _num_downloads = None
500 _screen_file = None
501
502 def __init__(self, params):
503 """Create a FileDownloader object with the given options."""
504 self._ies = []
505 self._pps = []
506 self._download_retcode = 0
507 self._num_downloads = 0
508 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
509 self.params = params
510
511 @staticmethod
512 def format_bytes(bytes):
513 if bytes is None:
514 return 'N/A'
515 if type(bytes) is str:
516 bytes = float(bytes)
517 if bytes == 0.0:
518 exponent = 0
519 else:
520 exponent = long(math.log(bytes, 1024.0))
521 suffix = 'bkMGTPEZY'[exponent]
522 converted = float(bytes) / float(1024 ** exponent)
523 return '%.2f%s' % (converted, suffix)
524
525 @staticmethod
526 def calc_percent(byte_counter, data_len):
527 if data_len is None:
528 return '---.-%'
529 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
530
531 @staticmethod
532 def calc_eta(start, now, total, current):
533 if total is None:
534 return '--:--'
535 dif = now - start
536 if current == 0 or dif < 0.001: # One millisecond
537 return '--:--'
538 rate = float(current) / dif
539 eta = long((float(total) - float(current)) / rate)
540 (eta_mins, eta_secs) = divmod(eta, 60)
541 if eta_mins > 99:
542 return '--:--'
543 return '%02d:%02d' % (eta_mins, eta_secs)
544
545 @staticmethod
546 def calc_speed(start, now, bytes):
547 dif = now - start
548 if bytes == 0 or dif < 0.001: # One millisecond
549 return '%10s' % '---b/s'
550 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
551
552 @staticmethod
553 def best_block_size(elapsed_time, bytes):
554 new_min = max(bytes / 2.0, 1.0)
555 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
556 if elapsed_time < 0.001:
557 return long(new_max)
558 rate = bytes / elapsed_time
559 if rate > new_max:
560 return long(new_max)
561 if rate < new_min:
562 return long(new_min)
563 return long(rate)
564
565 @staticmethod
566 def parse_bytes(bytestr):
567 """Parse a string indicating a byte quantity into a long integer."""
568 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
569 if matchobj is None:
570 return None
571 number = float(matchobj.group(1))
572 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
573 return long(round(number * multiplier))
574
575 def add_info_extractor(self, ie):
576 """Add an InfoExtractor object to the end of the list."""
577 self._ies.append(ie)
578 ie.set_downloader(self)
579
580 def add_post_processor(self, pp):
581 """Add a PostProcessor object to the end of the chain."""
582 self._pps.append(pp)
583 pp.set_downloader(self)
584
585 def to_screen(self, message, skip_eol=False):
586 """Print message to stdout if not in quiet mode."""
587 assert type(message) == type(u'')
588 if not self.params.get('quiet', False):
589 terminator = [u'\n', u''][skip_eol]
590 output = message + terminator
591
592 if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
593 output = output.encode(preferredencoding(), 'ignore')
594 self._screen_file.write(output)
595 self._screen_file.flush()
596
def to_stderr(self, message):
    """Print message, followed by a newline, to stderr.

    The message is encoded with the preferred encoding. Characters that
    encoding cannot represent are dropped -- the same policy to_screen()
    uses -- so that reporting an error can never itself fail with
    UnicodeEncodeError.
    """
    encoded = message.encode(preferredencoding(), 'ignore')
    sys.stderr.write(encoded + '\n')
600
601 def to_cons_title(self, message):
602 """Set console/terminal window title to message."""
603 if not self.params.get('consoletitle', False):
604 return
605 if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
606 # c_wchar_p() might not be necessary if `message` is
607 # already of type unicode()
608 ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
609 elif 'TERM' in os.environ:
610 sys.stderr.write('\033]0;%s\007' % message.encode(preferredencoding()))
611
612 def fixed_template(self):
613 """Checks if the output template is fixed."""
614 return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)
615
616 def trouble(self, message=None):
617 """Determine action to take when a download problem appears.
618
619 Depending on if the downloader has been configured to ignore
620 download errors or not, this method may throw an exception or
621 not when errors are found, after printing the message.
622 """
623 if message is not None:
624 self.to_stderr(message)
625 if not self.params.get('ignoreerrors', False):
626 raise DownloadError(message)
627 self._download_retcode = 1
628
629 def slow_down(self, start_time, byte_counter):
630 """Sleep if the download speed is over the rate limit."""
631 rate_limit = self.params.get('ratelimit', None)
632 if rate_limit is None or byte_counter == 0:
633 return
634 now = time.time()
635 elapsed = now - start_time
636 if elapsed <= 0.0:
637 return
638 speed = float(byte_counter) / elapsed
639 if speed > rate_limit:
640 time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
641
642 def temp_name(self, filename):
643 """Returns a temporary filename for the given filename."""
644 if self.params.get('nopart', False) or filename == u'-' or \
645 (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))):
646 return filename
647 return filename + u'.part'
648
649 def undo_temp_name(self, filename):
650 if filename.endswith(u'.part'):
651 return filename[:-len(u'.part')]
652 return filename
653
654 def try_rename(self, old_filename, new_filename):
655 try:
656 if old_filename == new_filename:
657 return
658 os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename))
659 except (IOError, OSError), err:
660 self.trouble(u'ERROR: unable to rename file')
661
662 def try_utime(self, filename, last_modified_hdr):
663 """Try to set the last-modified time of the given file."""
664 if last_modified_hdr is None:
665 return
666 if not os.path.isfile(_encodeFilename(filename)):
667 return
668 timestr = last_modified_hdr
669 if timestr is None:
670 return
671 filetime = timeconvert(timestr)
672 if filetime is None:
673 return filetime
674 try:
675 os.utime(filename, (time.time(), filetime))
676 except:
677 pass
678 return filetime
679
680 def report_writedescription(self, descfn):
681 """ Report that the description file is being written """
682 self.to_screen(u'[info] Writing video description to: ' + descfn)
683
684 def report_writeinfojson(self, infofn):
685 """ Report that the metadata file has been written """
686 self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
687
688 def report_destination(self, filename):
689 """Report destination filename."""
690 self.to_screen(u'[download] Destination: ' + filename)
691
692 def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
693 """Report download progress."""
694 if self.params.get('noprogress', False):
695 return
696 self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
697 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
698 self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
699 (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
700
701 def report_resuming_byte(self, resume_len):
702 """Report attempt to resume at given byte."""
703 self.to_screen(u'[download] Resuming download at byte %s' % resume_len)
704
705 def report_retry(self, count, retries):
706 """Report retry in case of HTTP error 5xx"""
707 self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count, retries))
708
709 def report_file_already_downloaded(self, file_name):
710 """Report file has already been fully downloaded."""
711 try:
712 self.to_screen(u'[download] %s has already been downloaded' % file_name)
713 except (UnicodeEncodeError), err:
714 self.to_screen(u'[download] The file has already been downloaded')
715
716 def report_unable_to_resume(self):
717 """Report it was impossible to resume download."""
718 self.to_screen(u'[download] Unable to resume')
719
720 def report_finish(self):
721 """Report download finished."""
722 if self.params.get('noprogress', False):
723 self.to_screen(u'[download] Download completed')
724 else:
725 self.to_screen(u'')
726
727 def increment_downloads(self):
728 """Increment the ordinal that assigns a number to each file."""
729 self._num_downloads += 1
730
731 def prepare_filename(self, info_dict):
732 """Generate the output filename."""
733 try:
734 template_dict = dict(info_dict)
735 template_dict['epoch'] = unicode(long(time.time()))
736 template_dict['autonumber'] = unicode('%05d' % self._num_downloads)
737 filename = self.params['outtmpl'] % template_dict
738 return filename
739 except (ValueError, KeyError), err:
740 self.trouble(u'ERROR: invalid system charset or erroneous output template')
741 return None
742
743 def _match_entry(self, info_dict):
744 """ Returns None iff the file should be downloaded """
745
746 title = info_dict['title']
747 matchtitle = self.params.get('matchtitle', False)
748 if matchtitle and not re.search(matchtitle, title, re.IGNORECASE):
749 return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
750 rejecttitle = self.params.get('rejecttitle', False)
751 if rejecttitle and re.search(rejecttitle, title, re.IGNORECASE):
752 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
753 return None
754
def process_info(self, info_dict):
    """Process a single dictionary returned by an InfoExtractor.

    In order: skip the entry if its title is filtered out, enforce the
    'max_downloads' option, compute the output filename, print whatever
    the force* options ask for, and -- unless 'simulate' is set -- create
    the target directory, write the optional .description and .info.json
    files, download the video and run the postprocessing chain.

    Problems are reported through self.trouble(), which may raise
    DownloadError. Raises MaxDownloadsReached when the limit is exceeded
    and UnavailableVideoError when the download fails with OSError/IOError.
    """

    # Title filters (matchtitle / rejecttitle options).
    reason = self._match_entry(info_dict)
    if reason is not None:
        self.to_screen(u'[download] ' + reason)
        return

    max_downloads = self.params.get('max_downloads')
    if max_downloads is not None:
        if self._num_downloads > int(max_downloads):
            raise MaxDownloadsReached()

    # None if the output template could not be expanded.
    filename = self.prepare_filename(info_dict)

    # Forced printings
    if self.params.get('forcetitle', False):
        print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
    if self.params.get('forceurl', False):
        print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
    if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
        print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
    if self.params.get('forcedescription', False) and 'description' in info_dict:
        print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')
    if self.params.get('forcefilename', False) and filename is not None:
        print filename.encode(preferredencoding(), 'xmlcharrefreplace')
    if self.params.get('forceformat', False):
        print info_dict['format'].encode(preferredencoding(), 'xmlcharrefreplace')

    # Do nothing else if in simulate mode
    if self.params.get('simulate', False):
        return

    if filename is None:
        return

    # Make sure the directory of the output file exists.
    try:
        dn = os.path.dirname(_encodeFilename(filename))
        if dn != '' and not os.path.exists(dn): # dn is already encoded
            os.makedirs(dn)
    except (OSError, IOError), err:
        self.trouble(u'ERROR: unable to create directory ' + unicode(err))
        return

    if self.params.get('writedescription', False):
        try:
            descfn = filename + u'.description'
            self.report_writedescription(descfn)
            descfile = open(_encodeFilename(descfn), 'wb')
            try:
                descfile.write(info_dict['description'].encode('utf-8'))
            finally:
                descfile.close()
        except (OSError, IOError):
            self.trouble(u'ERROR: Cannot write description file ' + descfn)
            return

    if self.params.get('writeinfojson', False):
        infofn = filename + u'.info.json'
        self.report_writeinfojson(infofn)
        # Probe for an encoder first: the attribute lookup fails when the
        # available json object has no dump().
        try:
            json.dump
        except (NameError,AttributeError):
            self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
            return
        try:
            infof = open(_encodeFilename(infofn), 'wb')
            try:
                # The 'urlhandle' entry is left out of the JSON file.
                json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',))
                json.dump(json_info_dict, infof)
            finally:
                infof.close()
        except (OSError, IOError):
            self.trouble(u'ERROR: Cannot write metadata to JSON file ' + infofn)
            return

    if not self.params.get('skip_download', False):
        # With 'nooverwrites', an existing file counts as a successful
        # download and is not fetched again.
        if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)):
            success = True
        else:
            try:
                success = self._do_download(filename, info_dict)
            except (OSError, IOError), err:
                raise UnavailableVideoError
            except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                self.trouble(u'ERROR: unable to download video data: %s' % str(err))
                return
            except (ContentTooShortError, ), err:
                self.trouble(u'ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                return

        if success:
            try:
                self.post_process(filename, info_dict)
            except (PostProcessingError), err:
                self.trouble(u'ERROR: postprocessing: %s' % str(err))
                return
852
853 def download(self, url_list):
854 """Download a given list of URLs."""
855 if len(url_list) > 1 and self.fixed_template():
856 raise SameFileError(self.params['outtmpl'])
857
858 for url in url_list:
859 suitable_found = False
860 for ie in self._ies:
861 # Go to next InfoExtractor if not suitable
862 if not ie.suitable(url):
863 continue
864
865 # Suitable InfoExtractor found
866 suitable_found = True
867
868 # Extract information from URL and process it
869 ie.extract(url)
870
871 # Suitable InfoExtractor had been found; go to next URL
872 break
873
874 if not suitable_found:
875 self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
876
877 return self._download_retcode
878
879 def post_process(self, filename, ie_info):
880 """Run the postprocessing chain on the given file."""
881 info = dict(ie_info)
882 info['filepath'] = filename
883 for pp in self._pps:
884 info = pp.run(info)
885 if info is None:
886 break
887
def _download_with_rtmpdump(self, filename, url, player_url):
    """Download an RTMP stream with the external rtmpdump program.

    The stream at url is written to the temporary name of filename (passing
    player_url to rtmpdump with -W if it is not None) and renamed to
    filename on success. rtmpdump is re-run in resume mode for as long as
    it exits with code 1 or 2 and the file keeps growing.

    Returns True on success and False otherwise; failures are reported
    through trouble().
    """
    self.report_destination(filename)
    tmpfilename = self.temp_name(filename)

    # Check for rtmpdump first
    try:
        devnull = open(os.path.devnull, 'w')
        try:
            subprocess.call(['rtmpdump', '-h'], stdout=devnull, stderr=subprocess.STDOUT)
        finally:
            # Do not leak the handle used to silence the probe.
            devnull.close()
    except (OSError, IOError):
        self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
        return False

    # Download using rtmpdump. rtmpdump returns exit code 2 when
    # the connection was interrupted and resuming appears to be
    # possible. This is part of rtmpdump's normal usage, AFAIK.
    basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename]
    args = basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]
    if self.params.get('verbose', False):
        try:
            import pipes
            shell_quote = lambda args: ' '.join(map(pipes.quote, args))
        except ImportError:
            shell_quote = repr
        self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
    retval = subprocess.call(args)
    while retval == 2 or retval == 1:
        prevsize = os.path.getsize(_encodeFilename(tmpfilename))
        self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
        time.sleep(5.0) # This seems to be needed
        retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
        cursize = os.path.getsize(_encodeFilename(tmpfilename))
        # No progress after a failed resume: give up.
        if prevsize == cursize and retval == 1:
            break
        # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those
        if prevsize == cursize and retval == 2 and cursize > 1024:
            self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
            retval = 0
            break
    if retval == 0:
        self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename)))
        self.try_rename(tmpfilename, filename)
        return True
    else:
        self.trouble(u'\nERROR: rtmpdump exited with code %d' % retval)
        return False
932
933 def _do_download(self, filename, info_dict):
934 url = info_dict['url']
935 player_url = info_dict.get('player_url', None)
936
937 # Check file already present
938 if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False):
939 self.report_file_already_downloaded(filename)
940 return True
941
942 # Attempt to download using rtmpdump
943 if url.startswith('rtmp'):
944 return self._download_with_rtmpdump(filename, url, player_url)
945
946 tmpfilename = self.temp_name(filename)
947 stream = None
948
949 # Do not include the Accept-Encoding header
950 headers = {'Youtubedl-no-compression': 'True'}
951 basic_request = urllib2.Request(url, None, headers)
952 request = urllib2.Request(url, None, headers)
953
954 # Establish possible resume length
955 if os.path.isfile(_encodeFilename(tmpfilename)):
956 resume_len = os.path.getsize(_encodeFilename(tmpfilename))
957 else:
958 resume_len = 0
959
960 open_mode = 'wb'
961 if resume_len != 0:
962 if self.params.get('continuedl', False):
963 self.report_resuming_byte(resume_len)
964 request.add_header('Range','bytes=%d-' % resume_len)
965 open_mode = 'ab'
966 else:
967 resume_len = 0
968
969 count = 0
970 retries = self.params.get('retries', 0)
971 while count <= retries:
972 # Establish connection
973 try:
974 if count == 0 and 'urlhandle' in info_dict:
975 data = info_dict['urlhandle']
976 data = urllib2.urlopen(request)
977 break
978 except (urllib2.HTTPError, ), err:
979 if (err.code < 500 or err.code >= 600) and err.code != 416:
980 # Unexpected HTTP error
981 raise
982 elif err.code == 416:
983 # Unable to resume (requested range not satisfiable)
984 try:
985 # Open the connection again without the range header
986 data = urllib2.urlopen(basic_request)
987 content_length = data.info()['Content-Length']
988 except (urllib2.HTTPError, ), err:
989 if err.code < 500 or err.code >= 600:
990 raise
991 else:
992 # Examine the reported length
993 if (content_length is not None and
994 (resume_len - 100 < long(content_length) < resume_len + 100)):
995 # The file had already been fully downloaded.
996 # Explanation to the above condition: in issue #175 it was revealed that
997 # YouTube sometimes adds or removes a few bytes from the end of the file,
998 # changing the file size slightly and causing problems for some users. So
999 # I decided to implement a suggested change and consider the file
1000 # completely downloaded if the file size differs less than 100 bytes from
1001 # the one in the hard drive.
1002 self.report_file_already_downloaded(filename)
1003 self.try_rename(tmpfilename, filename)
1004 return True
1005 else:
1006 # The length does not match, we start the download over
1007 self.report_unable_to_resume()
1008 open_mode = 'wb'
1009 break
1010 # Retry
1011 count += 1
1012 if count <= retries:
1013 self.report_retry(count, retries)
1014
1015 if count > retries:
1016 self.trouble(u'ERROR: giving up after %s retries' % retries)
1017 return False
1018
1019 data_len = data.info().get('Content-length', None)
1020 if data_len is not None:
1021 data_len = long(data_len) + resume_len
1022 data_len_str = self.format_bytes(data_len)
1023 byte_counter = 0 + resume_len
1024 block_size = 1024
1025 start = time.time()
1026 while True:
1027 # Download and write
1028 before = time.time()
1029 data_block = data.read(block_size)
1030 after = time.time()
1031 if len(data_block) == 0:
1032 break
1033 byte_counter += len(data_block)
1034
1035 # Open file just in time
1036 if stream is None:
1037 try:
1038 (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
1039 assert stream is not None
1040 filename = self.undo_temp_name(tmpfilename)
1041 self.report_destination(filename)
1042 except (OSError, IOError), err:
1043 self.trouble(u'ERROR: unable to open for writing: %s' % str(err))
1044 return False
1045 try:
1046 stream.write(data_block)
1047 except (IOError, OSError), err:
1048 self.trouble(u'\nERROR: unable to write data: %s' % str(err))
1049 return False
1050 block_size = self.best_block_size(after - before, len(data_block))
1051
1052 # Progress message
1053 speed_str = self.calc_speed(start, time.time(), byte_counter - resume_len)
1054 if data_len is None:
1055 self.report_progress('Unknown %', data_len_str, speed_str, 'Unknown ETA')
1056 else:
1057 percent_str = self.calc_percent(byte_counter, data_len)
1058 eta_str = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
1059 self.report_progress(percent_str, data_len_str, speed_str, eta_str)
1060
1061 # Apply rate limit
1062 self.slow_down(start, byte_counter - resume_len)
1063
1064 if stream is None:
1065 self.trouble(u'\nERROR: Did not get any data blocks')
1066 return False
1067 stream.close()
1068 self.report_finish()
1069 if data_len is not None and byte_counter != data_len:
1070 raise ContentTooShortError(byte_counter, long(data_len))
1071 self.try_rename(tmpfilename, filename)
1072
1073 # Update file modification time
1074 if self.params.get('updatetime', True):
1075 info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
1076
1077 return True
1078
1079
class InfoExtractor(object):
    """Base class for information extractors.

    An information extractor takes a URL and pulls out everything needed
    to fetch the video (or videos) behind it: the real media URL, the
    literal and simplified titles, the uploader and so on. The result is
    a dictionary that is handed to the FileDownloader, which may then
    download the video to the file system, among other possible outcomes.
    Each dictionary must contain these fields:

    id:         Video identifier.
    url:        Final video URL.
    uploader:   Nickname of the video uploader.
    title:      Literal title.
    stitle:     Simplified title.
    ext:        Video filename extension.
    format:     Video format.
    player_url: SWF Player URL (may be None).

    These fields are optional. They mainly exist so youtube-dl can act as
    the backend of a video search function such as the one in youtube2mp3,
    and are only consulted when the matching forced printing functions
    are called:

    thumbnail:   Full URL to a video thumbnail image.
    description: One-line video description.

    Subclasses should override _real_initialize() and _real_extract() and
    provide a _VALID_URL regexp. They should probably also be added to the
    list of extractors.
    """

    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Create the extractor, optionally bound to a downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Tell whether this IE can handle the given URL."""
        url_match = re.match(self._VALID_URL, url)
        return url_match is not None

    def initialize(self):
        """Run the one-time setup (authentication, etc) if still pending."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then return the extraction result for url."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the downloader this IE reports to."""
        self._downloader = downloader

    def _real_initialize(self):
        """Actual initialization work. Override in subclasses."""

    def _real_extract(self, url):
        """Actual extraction work. Override in subclasses."""
1148
1149
1150 class YoutubeIE(InfoExtractor):
1151 """Information extractor for youtube.com."""
1152
1153 _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
1154 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
1155 _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
1156 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
1157 _NETRC_MACHINE = 'youtube'
1158 # Listed in order of quality
1159 _available_formats = ['38', '37', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
1160 _available_formats_prefer_free = ['38', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
1161 _video_extensions = {
1162 '13': '3gp',
1163 '17': 'mp4',
1164 '18': 'mp4',
1165 '22': 'mp4',
1166 '37': 'mp4',
1167 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1168 '43': 'webm',
1169 '44': 'webm',
1170 '45': 'webm',
1171 }
1172 _video_dimensions = {
1173 '5': '240x400',
1174 '6': '???',
1175 '13': '???',
1176 '17': '144x176',
1177 '18': '360x640',
1178 '22': '720x1280',
1179 '34': '360x640',
1180 '35': '480x854',
1181 '37': '1080x1920',
1182 '38': '3072x4096',
1183 '43': '360x640',
1184 '44': '480x854',
1185 '45': '720x1280',
1186 }
1187 IE_NAME = u'youtube'
1188
1189 def report_lang(self):
1190 """Report attempt to set language."""
1191 self._downloader.to_screen(u'[youtube] Setting language')
1192
1193 def report_login(self):
1194 """Report attempt to log in."""
1195 self._downloader.to_screen(u'[youtube] Logging in')
1196
1197 def report_age_confirmation(self):
1198 """Report attempt to confirm age."""
1199 self._downloader.to_screen(u'[youtube] Confirming age')
1200
1201 def report_video_webpage_download(self, video_id):
1202 """Report attempt to download video webpage."""
1203 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
1204
1205 def report_video_info_webpage_download(self, video_id):
1206 """Report attempt to download video info webpage."""
1207 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
1208
1209 def report_information_extraction(self, video_id):
1210 """Report attempt to extract video information."""
1211 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
1212
1213 def report_unavailable_format(self, video_id, format):
1214 """Report extracted video URL."""
1215 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
1216
1217 def report_rtmp_download(self):
1218 """Indicate the download will use the RTMP protocol."""
1219 self._downloader.to_screen(u'[youtube] RTMP download detected')
1220
1221 def _print_formats(self, formats):
1222 print 'Available formats:'
1223 for x in formats:
1224 print '%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))
1225
1226 def _real_initialize(self):
1227 if self._downloader is None:
1228 return
1229
1230 username = None
1231 password = None
1232 downloader_params = self._downloader.params
1233
1234 # Attempt to use provided username and password or .netrc data
1235 if downloader_params.get('username', None) is not None:
1236 username = downloader_params['username']
1237 password = downloader_params['password']
1238 elif downloader_params.get('usenetrc', False):
1239 try:
1240 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1241 if info is not None:
1242 username = info[0]
1243 password = info[2]
1244 else:
1245 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1246 except (IOError, netrc.NetrcParseError), err:
1247 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1248 return
1249
1250 # Set language
1251 request = urllib2.Request(self._LANG_URL)
1252 try:
1253 self.report_lang()
1254 urllib2.urlopen(request).read()
1255 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1256 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
1257 return
1258
1259 # No authentication to be performed
1260 if username is None:
1261 return
1262
1263 # Log in
1264 login_form = {
1265 'current_form': 'loginForm',
1266 'next': '/',
1267 'action_login': 'Log In',
1268 'username': username,
1269 'password': password,
1270 }
1271 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1272 try:
1273 self.report_login()
1274 login_results = urllib2.urlopen(request).read()
1275 if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
1276 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
1277 return
1278 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1279 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1280 return
1281
1282 # Confirm age
1283 age_form = {
1284 'next_url': '/',
1285 'action_confirm': 'Confirm',
1286 }
1287 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
1288 try:
1289 self.report_age_confirmation()
1290 age_results = urllib2.urlopen(request).read()
1291 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1292 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1293 return
1294
1295 def _real_extract(self, url):
1296 # Extract video id from URL
1297 mobj = re.match(self._VALID_URL, url)
1298 if mobj is None:
1299 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1300 return
1301 video_id = mobj.group(2)
1302
1303 # Get video webpage
1304 self.report_video_webpage_download(video_id)
1305 request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
1306 try:
1307 video_webpage = urllib2.urlopen(request).read()
1308 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1309 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1310 return
1311
1312 # Attempt to extract SWF player URL
1313 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1314 if mobj is not None:
1315 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1316 else:
1317 player_url = None
1318
1319 # Get video info
1320 self.report_video_info_webpage_download(video_id)
1321 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1322 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1323 % (video_id, el_type))
1324 request = urllib2.Request(video_info_url)
1325 try:
1326 video_info_webpage = urllib2.urlopen(request).read()
1327 video_info = parse_qs(video_info_webpage)
1328 if 'token' in video_info:
1329 break
1330 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1331 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
1332 return
1333 if 'token' not in video_info:
1334 if 'reason' in video_info:
1335 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
1336 else:
1337 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
1338 return
1339
1340 # Start extracting information
1341 self.report_information_extraction(video_id)
1342
1343 # uploader
1344 if 'author' not in video_info:
1345 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1346 return
1347 video_uploader = urllib.unquote_plus(video_info['author'][0])
1348
1349 # title
1350 if 'title' not in video_info:
1351 self._downloader.trouble(u'ERROR: unable to extract video title')
1352 return
1353 video_title = urllib.unquote_plus(video_info['title'][0])
1354 video_title = video_title.decode('utf-8')
1355 video_title = sanitize_title(video_title)
1356
1357 # simplified title
1358 simple_title = _simplify_title(video_title)
1359
1360 # thumbnail image
1361 if 'thumbnail_url' not in video_info:
1362 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1363 video_thumbnail = ''
1364 else: # don't panic if we can't find it
1365 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
1366
1367 # upload date
1368 upload_date = u'NA'
1369 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1370 if mobj is not None:
1371 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1372 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
1373 for expression in format_expressions:
1374 try:
1375 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
1376 except:
1377 pass
1378
1379 # description
1380 try:
1381 lxml.etree
1382 except NameError:
1383 video_description = u'No description available.'
1384 mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
1385 if mobj is not None:
1386 video_description = mobj.group(1).decode('utf-8')
1387 else:
1388 html_parser = lxml.etree.HTMLParser(encoding='utf-8')
1389 vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
1390 video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
1391 # TODO use another parser
1392
1393 # token
1394 video_token = urllib.unquote_plus(video_info['token'][0])
1395
1396 # Decide which formats to download
1397 req_format = self._downloader.params.get('format', None)
1398
1399 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1400 self.report_rtmp_download()
1401 video_url_list = [(None, video_info['conn'][0])]
1402 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1403 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
1404 url_data = [parse_qs(uds) for uds in url_data_strs]
1405 url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
1406 url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)
1407
1408 format_limit = self._downloader.params.get('format_limit', None)
1409 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1410 if format_limit is not None and format_limit in available_formats:
1411 format_list = available_formats[available_formats.index(format_limit):]
1412 else:
1413 format_list = available_formats
1414 existing_formats = [x for x in format_list if x in url_map]
1415 if len(existing_formats) == 0:
1416 self._downloader.trouble(u'ERROR: no known formats available for video')
1417 return
1418 if self._downloader.params.get('listformats', None):
1419 self._print_formats(existing_formats)
1420 return
1421 if req_format is None or req_format == 'best':
1422 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1423 elif req_format == 'worst':
1424 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1425 elif req_format in ('-1', 'all'):
1426 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1427 else:
1428 # Specific formats. We pick the first in a slash-delimeted sequence.
1429 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1430 req_formats = req_format.split('/')
1431 video_url_list = None
1432 for rf in req_formats:
1433 if rf in url_map:
1434 video_url_list = [(rf, url_map[rf])]
1435 break
1436 if video_url_list is None:
1437 self._downloader.trouble(u'ERROR: requested format not available')
1438 return
1439 else:
1440 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
1441 return
1442
1443 for format_param, video_real_url in video_url_list:
1444 # At this point we have a new video
1445 self._downloader.increment_downloads()
1446
1447 # Extension
1448 video_extension = self._video_extensions.get(format_param, 'flv')
1449
1450 try:
1451 # Process video information
1452 self._downloader.process_info({
1453 'id': video_id.decode('utf-8'),
1454 'url': video_real_url.decode('utf-8'),
1455 'uploader': video_uploader.decode('utf-8'),
1456 'upload_date': upload_date,
1457 'title': video_title,
1458 'stitle': simple_title,
1459 'ext': video_extension.decode('utf-8'),
1460 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1461 'thumbnail': video_thumbnail.decode('utf-8'),
1462 'description': video_description,
1463 'player_url': player_url,
1464 })
1465 except UnavailableVideoError, err:
1466 self._downloader.trouble(u'\nERROR: unable to download video')
1467
1468
1469 class MetacafeIE(InfoExtractor):
1470 """Information Extractor for metacafe.com."""
1471
1472 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1473 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
1474 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1475 _youtube_ie = None
1476 IE_NAME = u'metacafe'
1477
1478 def __init__(self, youtube_ie, downloader=None):
1479 InfoExtractor.__init__(self, downloader)
1480 self._youtube_ie = youtube_ie
1481
1482 def report_disclaimer(self):
1483 """Report disclaimer retrieval."""
1484 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
1485
1486 def report_age_confirmation(self):
1487 """Report attempt to confirm age."""
1488 self._downloader.to_screen(u'[metacafe] Confirming age')
1489
1490 def report_download_webpage(self, video_id):
1491 """Report webpage download."""
1492 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
1493
1494 def report_extraction(self, video_id):
1495 """Report information extraction."""
1496 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
1497
1498 def _real_initialize(self):
1499 # Retrieve disclaimer
1500 request = urllib2.Request(self._DISCLAIMER)
1501 try:
1502 self.report_disclaimer()
1503 disclaimer = urllib2.urlopen(request).read()
1504 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1505 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
1506 return
1507
1508 # Confirm age
1509 disclaimer_form = {
1510 'filters': '0',
1511 'submit': "Continue - I'm over 18",
1512 }
1513 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
1514 try:
1515 self.report_age_confirmation()
1516 disclaimer = urllib2.urlopen(request).read()
1517 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1518 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
1519 return
1520
1521 def _real_extract(self, url):
1522 # Extract id and simplified title from URL
1523 mobj = re.match(self._VALID_URL, url)
1524 if mobj is None:
1525 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1526 return
1527
1528 video_id = mobj.group(1)
1529
1530 # Check if video comes from YouTube
1531 mobj2 = re.match(r'^yt-(.*)$', video_id)
1532 if mobj2 is not None:
1533 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
1534 return
1535
1536 # At this point we have a new video
1537 self._downloader.increment_downloads()
1538
1539 simple_title = mobj.group(2).decode('utf-8')
1540
1541 # Retrieve video webpage to extract further information
1542 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
1543 try:
1544 self.report_download_webpage(video_id)
1545 webpage = urllib2.urlopen(request).read()
1546 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1547 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1548 return
1549
1550 # Extract URL, uploader and title from webpage
1551 self.report_extraction(video_id)
1552 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1553 if mobj is not None:
1554 mediaURL = urllib.unquote(mobj.group(1))
1555 video_extension = mediaURL[-3:]
1556
1557 # Extract gdaKey if available
1558 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1559 if mobj is None:
1560 video_url = mediaURL
1561 else:
1562 gdaKey = mobj.group(1)
1563 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1564 else:
1565 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
1566 if mobj is None:
1567 self._downloader.trouble(u'ERROR: unable to extract media URL')
1568 return
1569 vardict = parse_qs(mobj.group(1))
1570 if 'mediaData' not in vardict:
1571 self._downloader.trouble(u'ERROR: unable to extract media URL')
1572 return
1573 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
1574 if mobj is None:
1575 self._downloader.trouble(u'ERROR: unable to extract media URL')
1576 return
1577 mediaURL = mobj.group(1).replace('\\/', '/')
1578 video_extension = mediaURL[-3:]
1579 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
1580
1581 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1582 if mobj is None:
1583 self._downloader.trouble(u'ERROR: unable to extract title')
1584 return
1585 video_title = mobj.group(1).decode('utf-8')
1586 video_title = sanitize_title(video_title)
1587
1588 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1589 if mobj is None:
1590 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1591 return
1592 video_uploader = mobj.group(1)
1593
1594 try:
1595 # Process video information
1596 self._downloader.process_info({
1597 'id': video_id.decode('utf-8'),
1598 'url': video_url.decode('utf-8'),
1599 'uploader': video_uploader.decode('utf-8'),
1600 'upload_date': u'NA',
1601 'title': video_title,
1602 'stitle': simple_title,
1603 'ext': video_extension.decode('utf-8'),
1604 'format': u'NA',
1605 'player_url': None,
1606 })
1607 except UnavailableVideoError:
1608 self._downloader.trouble(u'\nERROR: unable to download video')
1609
1610
1611 class DailymotionIE(InfoExtractor):
1612 """Information Extractor for Dailymotion"""
1613
1614 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1615 IE_NAME = u'dailymotion'
1616
1617 def __init__(self, downloader=None):
1618 InfoExtractor.__init__(self, downloader)
1619
1620 def report_download_webpage(self, video_id):
1621 """Report webpage download."""
1622 self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
1623
1624 def report_extraction(self, video_id):
1625 """Report information extraction."""
1626 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
1627
1628 def _real_extract(self, url):
1629 # Extract id and simplified title from URL
1630 mobj = re.match(self._VALID_URL, url)
1631 if mobj is None:
1632 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1633 return
1634
1635 # At this point we have a new video
1636 self._downloader.increment_downloads()
1637 video_id = mobj.group(1)
1638
1639 video_extension = 'flv'
1640
1641 # Retrieve video webpage to extract further information
1642 request = urllib2.Request(url)
1643 request.add_header('Cookie', 'family_filter=off')
1644 try:
1645 self.report_download_webpage(video_id)
1646 webpage = urllib2.urlopen(request).read()
1647 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1648 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1649 return
1650
1651 # Extract URL, uploader and title from webpage
1652 self.report_extraction(video_id)
1653 mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
1654 if mobj is None:
1655 self._downloader.trouble(u'ERROR: unable to extract media URL')
1656 return
1657 sequence = urllib.unquote(mobj.group(1))
1658 mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
1659 if mobj is None:
1660 self._downloader.trouble(u'ERROR: unable to extract media URL')
1661 return
1662 mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')
1663
1664 # if needed add http://www.dailymotion.com/ if relative URL
1665
1666 video_url = mediaURL
1667
1668 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
1669 if mobj is None:
1670 self._downloader.trouble(u'ERROR: unable to extract title')
1671 return
1672 video_title = _unescapeHTML(mobj.group('title').decode('utf-8'))
1673 video_title = sanitize_title(video_title)
1674 simple_title = _simplify_title(video_title)
1675
1676 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
1677 if mobj is None:
1678 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1679 return
1680 video_uploader = mobj.group(1)
1681
1682 try:
1683 # Process video information
1684 self._downloader.process_info({
1685 'id': video_id.decode('utf-8'),
1686 'url': video_url.decode('utf-8'),
1687 'uploader': video_uploader.decode('utf-8'),
1688 'upload_date': u'NA',
1689 'title': video_title,
1690 'stitle': simple_title,
1691 'ext': video_extension.decode('utf-8'),
1692 'format': u'NA',
1693 'player_url': None,
1694 })
1695 except UnavailableVideoError:
1696 self._downloader.trouble(u'\nERROR: unable to download video')
1697
1698
1699 class GoogleIE(InfoExtractor):
1700 """Information extractor for video.google.com."""
1701
1702 _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1703 IE_NAME = u'video.google'
1704
1705 def __init__(self, downloader=None):
1706 InfoExtractor.__init__(self, downloader)
1707
1708 def report_download_webpage(self, video_id):
1709 """Report webpage download."""
1710 self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
1711
1712 def report_extraction(self, video_id):
1713 """Report information extraction."""
1714 self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
1715
1716 def _real_extract(self, url):
1717 # Extract id from URL
1718 mobj = re.match(self._VALID_URL, url)
1719 if mobj is None:
1720 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1721 return
1722
1723 # At this point we have a new video
1724 self._downloader.increment_downloads()
1725 video_id = mobj.group(1)
1726
1727 video_extension = 'mp4'
1728
1729 # Retrieve video webpage to extract further information
1730 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1731 try:
1732 self.report_download_webpage(video_id)
1733 webpage = urllib2.urlopen(request).read()
1734 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1735 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1736 return
1737
1738 # Extract URL, uploader, and title from webpage
1739 self.report_extraction(video_id)
1740 mobj = re.search(r"download_url:'([^']+)'", webpage)
1741 if mobj is None:
1742 video_extension = 'flv'
1743 mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1744 if mobj is None:
1745 self._downloader.trouble(u'ERROR: unable to extract media URL')
1746 return
1747 mediaURL = urllib.unquote(mobj.group(1))
1748 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1749 mediaURL = mediaURL.replace('\\x26', '\x26')
1750
1751 video_url = mediaURL
1752
1753 mobj = re.search(r'<title>(.*)</title>', webpage)
1754 if mobj is None:
1755 self._downloader.trouble(u'ERROR: unable to extract title')
1756 return
1757 video_title = mobj.group(1).decode('utf-8')
1758 video_title = sanitize_title(video_title)
1759 simple_title = _simplify_title(video_title)
1760
1761 # Extract video description
1762 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1763 if mobj is None:
1764 self._downloader.trouble(u'ERROR: unable to extract video description')
1765 return
1766 video_description = mobj.group(1).decode('utf-8')
1767 if not video_description:
1768 video_description = 'No description available.'
1769
1770 # Extract video thumbnail
1771 if self._downloader.params.get('forcethumbnail', False):
1772 request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1773 try:
1774 webpage = urllib2.urlopen(request).read()
1775 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1776 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1777 return
1778 mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1779 if mobj is None:
1780 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1781 return
1782 video_thumbnail = mobj.group(1)
1783 else: # we need something to pass to process_info
1784 video_thumbnail = ''
1785
1786 try:
1787 # Process video information
1788 self._downloader.process_info({
1789 'id': video_id.decode('utf-8'),
1790 'url': video_url.decode('utf-8'),
1791 'uploader': u'NA',
1792 'upload_date': u'NA',
1793 'title': video_title,
1794 'stitle': simple_title,
1795 'ext': video_extension.decode('utf-8'),
1796 'format': u'NA',
1797 'player_url': None,
1798 })
1799 except UnavailableVideoError:
1800 self._downloader.trouble(u'\nERROR: unable to download video')
1801
1802
1803 class PhotobucketIE(InfoExtractor):
1804 """Information extractor for photobucket.com."""
1805
1806 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1807 IE_NAME = u'photobucket'
1808
1809 def __init__(self, downloader=None):
1810 InfoExtractor.__init__(self, downloader)
1811
1812 def report_download_webpage(self, video_id):
1813 """Report webpage download."""
1814 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
1815
1816 def report_extraction(self, video_id):
1817 """Report information extraction."""
1818 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
1819
1820 def _real_extract(self, url):
1821 # Extract id from URL
1822 mobj = re.match(self._VALID_URL, url)
1823 if mobj is None:
1824 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1825 return
1826
1827 # At this point we have a new video
1828 self._downloader.increment_downloads()
1829 video_id = mobj.group(1)
1830
1831 video_extension = 'flv'
1832
1833 # Retrieve video webpage to extract further information
1834 request = urllib2.Request(url)
1835 try:
1836 self.report_download_webpage(video_id)
1837 webpage = urllib2.urlopen(request).read()
1838 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1839 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1840 return
1841
1842 # Extract URL, uploader, and title from webpage
1843 self.report_extraction(video_id)
1844 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1845 if mobj is None:
1846 self._downloader.trouble(u'ERROR: unable to extract media URL')
1847 return
1848 mediaURL = urllib.unquote(mobj.group(1))
1849
1850 video_url = mediaURL
1851
1852 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1853 if mobj is None:
1854 self._downloader.trouble(u'ERROR: unable to extract title')
1855 return
1856 video_title = mobj.group(1).decode('utf-8')
1857 video_title = sanitize_title(video_title)
1858 simple_title = _simplify_title(vide_title)
1859
1860 video_uploader = mobj.group(2).decode('utf-8')
1861
1862 try:
1863 # Process video information
1864 self._downloader.process_info({
1865 'id': video_id.decode('utf-8'),
1866 'url': video_url.decode('utf-8'),
1867 'uploader': video_uploader,
1868 'upload_date': u'NA',
1869 'title': video_title,
1870 'stitle': simple_title,
1871 'ext': video_extension.decode('utf-8'),
1872 'format': u'NA',
1873 'player_url': None,
1874 })
1875 except UnavailableVideoError:
1876 self._downloader.trouble(u'\nERROR: unable to download video')
1877
1878
1879 class YahooIE(InfoExtractor):
1880 """Information extractor for video.yahoo.com."""
1881
1882 # _VALID_URL matches all Yahoo! Video URLs
1883 # _VPAGE_URL matches only the extractable '/watch/' URLs
1884 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1885 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1886 IE_NAME = u'video.yahoo'
1887
1888 def __init__(self, downloader=None):
1889 InfoExtractor.__init__(self, downloader)
1890
1891 def report_download_webpage(self, video_id):
1892 """Report webpage download."""
1893 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
1894
1895 def report_extraction(self, video_id):
1896 """Report information extraction."""
1897 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
1898
1899 def _real_extract(self, url, new_video=True):
1900 # Extract ID from URL
1901 mobj = re.match(self._VALID_URL, url)
1902 if mobj is None:
1903 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1904 return
1905
1906 # At this point we have a new video
1907 self._downloader.increment_downloads()
1908 video_id = mobj.group(2)
1909 video_extension = 'flv'
1910
1911 # Rewrite valid but non-extractable URLs as
1912 # extractable English language /watch/ URLs
1913 if re.match(self._VPAGE_URL, url) is None:
1914 request = urllib2.Request(url)
1915 try:
1916 webpage = urllib2.urlopen(request).read()
1917 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1918 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1919 return
1920
1921 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1922 if mobj is None:
1923 self._downloader.trouble(u'ERROR: Unable to extract id field')
1924 return
1925 yahoo_id = mobj.group(1)
1926
1927 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1928 if mobj is None:
1929 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1930 return
1931 yahoo_vid = mobj.group(1)
1932
1933 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1934 return self._real_extract(url, new_video=False)
1935
1936 # Retrieve video webpage to extract further information
1937 request = urllib2.Request(url)
1938 try:
1939 self.report_download_webpage(video_id)
1940 webpage = urllib2.urlopen(request).read()
1941 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1942 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1943 return
1944
1945 # Extract uploader and title from webpage
1946 self.report_extraction(video_id)
1947 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1948 if mobj is None:
1949 self._downloader.trouble(u'ERROR: unable to extract video title')
1950 return
1951 video_title = mobj.group(1).decode('utf-8')
1952 simple_title = _simplify_title(video_title)
1953
1954 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1955 if mobj is None:
1956 self._downloader.trouble(u'ERROR: unable to extract video uploader')
1957 return
1958 video_uploader = mobj.group(1).decode('utf-8')
1959
1960 # Extract video thumbnail
1961 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1962 if mobj is None:
1963 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1964 return
1965 video_thumbnail = mobj.group(1).decode('utf-8')
1966
1967 # Extract video description
1968 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1969 if mobj is None:
1970 self._downloader.trouble(u'ERROR: unable to extract video description')
1971 return
1972 video_description = mobj.group(1).decode('utf-8')
1973 if not video_description:
1974 video_description = 'No description available.'
1975
1976 # Extract video height and width
1977 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1978 if mobj is None:
1979 self._downloader.trouble(u'ERROR: unable to extract video height')
1980 return
1981 yv_video_height = mobj.group(1)
1982
1983 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1984 if mobj is None:
1985 self._downloader.trouble(u'ERROR: unable to extract video width')
1986 return
1987 yv_video_width = mobj.group(1)
1988
1989 # Retrieve video playlist to extract media URL
1990 # I'm not completely sure what all these options are, but we
1991 # seem to need most of them, otherwise the server sends a 401.
1992 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1993 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1994 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1995 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1996 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1997 try:
1998 self.report_download_webpage(video_id)
1999 webpage = urllib2.urlopen(request).read()
2000 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2001 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2002 return
2003
2004 # Extract media URL from playlist XML
2005 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
2006 if mobj is None:
2007 self._downloader.trouble(u'ERROR: Unable to extract media URL')
2008 return
2009 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
2010 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
2011
2012 try:
2013 # Process video information
2014 self._downloader.process_info({
2015 'id': video_id.decode('utf-8'),
2016 'url': video_url,
2017 'uploader': video_uploader,
2018 'upload_date': u'NA',
2019 'title': video_title,
2020 'stitle': simple_title,
2021 'ext': video_extension.decode('utf-8'),
2022 'thumbnail': video_thumbnail.decode('utf-8'),
2023 'description': video_description,
2024 'thumbnail': video_thumbnail,
2025 'player_url': None,
2026 })
2027 except UnavailableVideoError:
2028 self._downloader.trouble(u'\nERROR: unable to download video')
2029
2030
2031 class VimeoIE(InfoExtractor):
2032 """Information extractor for vimeo.com."""
2033
2034 # _VALID_URL matches Vimeo URLs
2035 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
2036 IE_NAME = u'vimeo'
2037
2038 def __init__(self, downloader=None):
2039 InfoExtractor.__init__(self, downloader)
2040
2041 def report_download_webpage(self, video_id):
2042 """Report webpage download."""
2043 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
2044
2045 def report_extraction(self, video_id):
2046 """Report information extraction."""
2047 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
2048
2049 def _real_extract(self, url, new_video=True):
2050 # Extract ID from URL
2051 mobj = re.match(self._VALID_URL, url)
2052 if mobj is None:
2053 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2054 return
2055
2056 # At this point we have a new video
2057 self._downloader.increment_downloads()
2058 video_id = mobj.group(1)
2059
2060 # Retrieve video webpage to extract further information
2061 request = urllib2.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id, None, std_headers)
2062 try:
2063 self.report_download_webpage(video_id)
2064 webpage = urllib2.urlopen(request).read()
2065 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2066 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2067 return
2068
2069 # Now we begin extracting as much information as we can from what we
2070 # retrieved. First we extract the information common to all extractors,
2071 # and latter we extract those that are Vimeo specific.
2072 self.report_extraction(video_id)
2073
2074 # Extract title
2075 mobj = re.search(r'<caption>(.*?)</caption>', webpage)
2076 if mobj is None:
2077 self._downloader.trouble(u'ERROR: unable to extract video title')
2078 return
2079 video_title = mobj.group(1).decode('utf-8')
2080 simple_title = _simplify_title(video_title)
2081
2082 # Extract uploader
2083 mobj = re.search(r'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage)
2084 if mobj is None:
2085 self._downloader.trouble(u'ERROR: unable to extract video uploader')
2086 return
2087 video_uploader = mobj.group(1).decode('utf-8')
2088
2089 # Extract video thumbnail
2090 mobj = re.search(r'<thumbnail>(.*?)</thumbnail>', webpage)
2091 if mobj is None:
2092 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2093 return
2094 video_thumbnail = mobj.group(1).decode('utf-8')
2095
2096 # # Extract video description
2097 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2098 # if mobj is None:
2099 # self._downloader.trouble(u'ERROR: unable to extract video description')
2100 # return
2101 # video_description = mobj.group(1).decode('utf-8')
2102 # if not video_description: video_description = 'No description available.'
2103 video_description = 'Foo.'
2104
2105 # Vimeo specific: extract request signature
2106 mobj = re.search(r'<request_signature>(.*?)</request_signature>', webpage)
2107 if mobj is None:
2108 self._downloader.trouble(u'ERROR: unable to extract request signature')
2109 return
2110 sig = mobj.group(1).decode('utf-8')
2111
2112 # Vimeo specific: extract video quality information
2113 mobj = re.search(r'<isHD>(\d+)</isHD>', webpage)
2114 if mobj is None:
2115 self._downloader.trouble(u'ERROR: unable to extract video quality information')
2116 return
2117 quality = mobj.group(1).decode('utf-8')
2118
2119 if int(quality) == 1:
2120 quality = 'hd'
2121 else:
2122 quality = 'sd'
2123
2124 # Vimeo specific: Extract request signature expiration
2125 mobj = re.search(r'<request_signature_expires>(.*?)</request_signature_expires>', webpage)
2126 if mobj is None:
2127 self._downloader.trouble(u'ERROR: unable to extract request signature expiration')
2128 return
2129 sig_exp = mobj.group(1).decode('utf-8')
2130
2131 video_url = "http://vimeo.com/moogaloop/play/clip:%s/%s/%s/?q=%s" % (video_id, sig, sig_exp, quality)
2132
2133 try:
2134 # Process video information
2135 self._downloader.process_info({
2136 'id': video_id.decode('utf-8'),
2137 'url': video_url,
2138 'uploader': video_uploader,
2139 'upload_date': u'NA',
2140 'title': video_title,
2141 'stitle': simple_title,
2142 'ext': u'mp4',
2143 'thumbnail': video_thumbnail.decode('utf-8'),
2144 'description': video_description,
2145 'thumbnail': video_thumbnail,
2146 'description': video_description,
2147 'player_url': None,
2148 })
2149 except UnavailableVideoError:
2150 self._downloader.trouble(u'ERROR: unable to download video')
2151
2152
2153 class GenericIE(InfoExtractor):
2154 """Generic last-resort information extractor."""
2155
2156 _VALID_URL = r'.*'
2157 IE_NAME = u'generic'
2158
2159 def __init__(self, downloader=None):
2160 InfoExtractor.__init__(self, downloader)
2161
2162 def report_download_webpage(self, video_id):
2163 """Report webpage download."""
2164 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
2165 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
2166
2167 def report_extraction(self, video_id):
2168 """Report information extraction."""
2169 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
2170
2171 def _real_extract(self, url):
2172 # At this point we have a new video
2173 self._downloader.increment_downloads()
2174
2175 video_id = url.split('/')[-1]
2176 request = urllib2.Request(url)
2177 try:
2178 self.report_download_webpage(video_id)
2179 webpage = urllib2.urlopen(request).read()
2180 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2181 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2182 return
2183 except ValueError, err:
2184 # since this is the last-resort InfoExtractor, if
2185 # this error is thrown, it'll be thrown here
2186 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2187 return
2188
2189 self.report_extraction(video_id)
2190 # Start with something easy: JW Player in SWFObject
2191 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2192 if mobj is None:
2193 # Broaden the search a little bit
2194 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
2195 if mobj is None:
2196 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2197 return
2198
2199 # It's possible that one of the regexes
2200 # matched, but returned an empty group:
2201 if mobj.group(1) is None:
2202 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2203 return
2204
2205 video_url = urllib.unquote(mobj.group(1))
2206 video_id = os.path.basename(video_url)
2207
2208 # here's a fun little line of code for you:
2209 video_extension = os.path.splitext(video_id)[1][1:]
2210 video_id = os.path.splitext(video_id)[0]
2211
2212 # it's tempting to parse this further, but you would
2213 # have to take into account all the variations like
2214 # Video Title - Site Name
2215 # Site Name | Video Title
2216 # Video Title - Tagline | Site Name
2217 # and so on and so forth; it's just not practical
2218 mobj = re.search(r'<title>(.*)</title>', webpage)
2219 if mobj is None:
2220 self._downloader.trouble(u'ERROR: unable to extract title')
2221 return
2222 video_title = mobj.group(1).decode('utf-8')
2223 video_title = sanitize_title(video_title)
2224 simple_title = _simplify_title(video_title)
2225
2226 # video uploader is domain name
2227 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2228 if mobj is None:
2229 self._downloader.trouble(u'ERROR: unable to extract title')
2230 return
2231 video_uploader = mobj.group(1).decode('utf-8')
2232
2233 try:
2234 # Process video information
2235 self._downloader.process_info({
2236 'id': video_id.decode('utf-8'),
2237 'url': video_url.decode('utf-8'),
2238 'uploader': video_uploader,
2239 'upload_date': u'NA',
2240 'title': video_title,
2241 'stitle': simple_title,
2242 'ext': video_extension.decode('utf-8'),
2243 'format': u'NA',
2244 'player_url': None,
2245 })
2246 except UnavailableVideoError, err:
2247 self._downloader.trouble(u'\nERROR: unable to download video')
2248
2249
2250 class YoutubeSearchIE(InfoExtractor):
2251 """Information Extractor for YouTube search queries."""
2252 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
2253 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2254 _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
2255 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2256 _youtube_ie = None
2257 _max_youtube_results = 1000
2258 IE_NAME = u'youtube:search'
2259
2260 def __init__(self, youtube_ie, downloader=None):
2261 InfoExtractor.__init__(self, downloader)
2262 self._youtube_ie = youtube_ie
2263
2264 def report_download_page(self, query, pagenum):
2265 """Report attempt to download playlist page with given number."""
2266 query = query.decode(preferredencoding())
2267 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2268
2269 def _real_initialize(self):
2270 self._youtube_ie.initialize()
2271
2272 def _real_extract(self, query):
2273 mobj = re.match(self._VALID_URL, query)
2274 if mobj is None:
2275 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2276 return
2277
2278 prefix, query = query.split(':')
2279 prefix = prefix[8:]
2280 query = query.encode('utf-8')
2281 if prefix == '':
2282 self._download_n_results(query, 1)
2283 return
2284 elif prefix == 'all':
2285 self._download_n_results(query, self._max_youtube_results)
2286 return
2287 else:
2288 try:
2289 n = long(prefix)
2290 if n <= 0:
2291 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2292 return
2293 elif n > self._max_youtube_results:
2294 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2295 n = self._max_youtube_results
2296 self._download_n_results(query, n)
2297 return
2298 except ValueError: # parsing prefix as integer fails
2299 self._download_n_results(query, 1)
2300 return
2301
2302 def _download_n_results(self, query, n):
2303 """Downloads a specified number of results for a query"""
2304
2305 video_ids = []
2306 already_seen = set()
2307 pagenum = 1
2308
2309 while True:
2310 self.report_download_page(query, pagenum)
2311 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2312 request = urllib2.Request(result_url)
2313 try:
2314 page = urllib2.urlopen(request).read()
2315 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2316 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2317 return
2318
2319 # Extract video identifiers
2320 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2321 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2322 if video_id not in already_seen:
2323 video_ids.append(video_id)
2324 already_seen.add(video_id)
2325 if len(video_ids) == n:
2326 # Specified n videos reached
2327 for id in video_ids:
2328 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2329 return
2330
2331 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2332 for id in video_ids:
2333 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2334 return
2335
2336 pagenum = pagenum + 1
2337
2338
2339 class GoogleSearchIE(InfoExtractor):
2340 """Information Extractor for Google Video search queries."""
2341 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
2342 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2343 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2344 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2345 _google_ie = None
2346 _max_google_results = 1000
2347 IE_NAME = u'video.google:search'
2348
2349 def __init__(self, google_ie, downloader=None):
2350 InfoExtractor.__init__(self, downloader)
2351 self._google_ie = google_ie
2352
2353 def report_download_page(self, query, pagenum):
2354 """Report attempt to download playlist page with given number."""
2355 query = query.decode(preferredencoding())
2356 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2357
2358 def _real_initialize(self):
2359 self._google_ie.initialize()
2360
2361 def _real_extract(self, query):
2362 mobj = re.match(self._VALID_URL, query)
2363 if mobj is None:
2364 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2365 return
2366
2367 prefix, query = query.split(':')
2368 prefix = prefix[8:]
2369 query = query.encode('utf-8')
2370 if prefix == '':
2371 self._download_n_results(query, 1)
2372 return
2373 elif prefix == 'all':
2374 self._download_n_results(query, self._max_google_results)
2375 return
2376 else:
2377 try:
2378 n = long(prefix)
2379 if n <= 0:
2380 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2381 return
2382 elif n > self._max_google_results:
2383 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2384 n = self._max_google_results
2385 self._download_n_results(query, n)
2386 return
2387 except ValueError: # parsing prefix as integer fails
2388 self._download_n_results(query, 1)
2389 return
2390
2391 def _download_n_results(self, query, n):
2392 """Downloads a specified number of results for a query"""
2393
2394 video_ids = []
2395 already_seen = set()
2396 pagenum = 1
2397
2398 while True:
2399 self.report_download_page(query, pagenum)
2400 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2401 request = urllib2.Request(result_url)
2402 try:
2403 page = urllib2.urlopen(request).read()
2404 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2405 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2406 return
2407
2408 # Extract video identifiers
2409 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2410 video_id = mobj.group(1)
2411 if video_id not in already_seen:
2412 video_ids.append(video_id)
2413 already_seen.add(video_id)
2414 if len(video_ids) == n:
2415 # Specified n videos reached
2416 for id in video_ids:
2417 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2418 return
2419
2420 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2421 for id in video_ids:
2422 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2423 return
2424
2425 pagenum = pagenum + 1
2426
2427
2428 class YahooSearchIE(InfoExtractor):
2429 """Information Extractor for Yahoo! Video search queries."""
2430 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
2431 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2432 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
2433 _MORE_PAGES_INDICATOR = r'\s*Next'
2434 _yahoo_ie = None
2435 _max_yahoo_results = 1000
2436 IE_NAME = u'video.yahoo:search'
2437
2438 def __init__(self, yahoo_ie, downloader=None):
2439 InfoExtractor.__init__(self, downloader)
2440 self._yahoo_ie = yahoo_ie
2441
2442 def report_download_page(self, query, pagenum):
2443 """Report attempt to download playlist page with given number."""
2444 query = query.decode(preferredencoding())
2445 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2446
2447 def _real_initialize(self):
2448 self._yahoo_ie.initialize()
2449
2450 def _real_extract(self, query):
2451 mobj = re.match(self._VALID_URL, query)
2452 if mobj is None:
2453 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2454 return
2455
2456 prefix, query = query.split(':')
2457 prefix = prefix[8:]
2458 query = query.encode('utf-8')
2459 if prefix == '':
2460 self._download_n_results(query, 1)
2461 return
2462 elif prefix == 'all':
2463 self._download_n_results(query, self._max_yahoo_results)
2464 return
2465 else:
2466 try:
2467 n = long(prefix)
2468 if n <= 0:
2469 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2470 return
2471 elif n > self._max_yahoo_results:
2472 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2473 n = self._max_yahoo_results
2474 self._download_n_results(query, n)
2475 return
2476 except ValueError: # parsing prefix as integer fails
2477 self._download_n_results(query, 1)
2478 return
2479
2480 def _download_n_results(self, query, n):
2481 """Downloads a specified number of results for a query"""
2482
2483 video_ids = []
2484 already_seen = set()
2485 pagenum = 1
2486
2487 while True:
2488 self.report_download_page(query, pagenum)
2489 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2490 request = urllib2.Request(result_url)
2491 try:
2492 page = urllib2.urlopen(request).read()
2493 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2494 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2495 return
2496
2497 # Extract video identifiers
2498 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2499 video_id = mobj.group(1)
2500 if video_id not in already_seen:
2501 video_ids.append(video_id)
2502 already_seen.add(video_id)
2503 if len(video_ids) == n:
2504 # Specified n videos reached
2505 for id in video_ids:
2506 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2507 return
2508
2509 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2510 for id in video_ids:
2511 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2512 return
2513
2514 pagenum = pagenum + 1
2515
2516
2517 class YoutubePlaylistIE(InfoExtractor):
2518 """Information Extractor for YouTube playlists."""
2519
2520 _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2521 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2522 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2523 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2524 _youtube_ie = None
2525 IE_NAME = u'youtube:playlist'
2526
2527 def __init__(self, youtube_ie, downloader=None):
2528 InfoExtractor.__init__(self, downloader)
2529 self._youtube_ie = youtube_ie
2530
2531 def report_download_page(self, playlist_id, pagenum):
2532 """Report attempt to download playlist page with given number."""
2533 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2534
2535 def _real_initialize(self):
2536 self._youtube_ie.initialize()
2537
2538 def _real_extract(self, url):
2539 # Extract playlist id
2540 mobj = re.match(self._VALID_URL, url)
2541 if mobj is None:
2542 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2543 return
2544
2545 # Single video case
2546 if mobj.group(3) is not None:
2547 self._youtube_ie.extract(mobj.group(3))
2548 return
2549
2550 # Download playlist pages
2551 # prefix is 'p' as default for playlists but there are other types that need extra care
2552 playlist_prefix = mobj.group(1)
2553 if playlist_prefix == 'a':
2554 playlist_access = 'artist'
2555 else:
2556 playlist_prefix = 'p'
2557 playlist_access = 'view_play_list'
2558 playlist_id = mobj.group(2)
2559 video_ids = []
2560 pagenum = 1
2561
2562 while True:
2563 self.report_download_page(playlist_id, pagenum)
2564 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
2565 request = urllib2.Request(url)
2566 try:
2567 page = urllib2.urlopen(request).read()
2568 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2569 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2570 return
2571
2572 # Extract video identifiers
2573 ids_in_page = []
2574 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2575 if mobj.group(1) not in ids_in_page:
2576 ids_in_page.append(mobj.group(1))
2577 video_ids.extend(ids_in_page)
2578
2579 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2580 break
2581 pagenum = pagenum + 1
2582
2583 playliststart = self._downloader.params.get('playliststart', 1) - 1
2584 playlistend = self._downloader.params.get('playlistend', -1)
2585 video_ids = video_ids[playliststart:playlistend]
2586
2587 for id in video_ids:
2588 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2589 return
2590
2591
2592 class YoutubeUserIE(InfoExtractor):
2593 """Information Extractor for YouTube users."""
2594
2595 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2596 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2597 _GDATA_PAGE_SIZE = 50
2598 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2599 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
2600 _youtube_ie = None
2601 IE_NAME = u'youtube:user'
2602
2603 def __init__(self, youtube_ie, downloader=None):
2604 InfoExtractor.__init__(self, downloader)
2605 self._youtube_ie = youtube_ie
2606
2607 def report_download_page(self, username, start_index):
2608 """Report attempt to download user page."""
2609 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2610 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2611
2612 def _real_initialize(self):
2613 self._youtube_ie.initialize()
2614
2615 def _real_extract(self, url):
2616 # Extract username
2617 mobj = re.match(self._VALID_URL, url)
2618 if mobj is None:
2619 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2620 return
2621
2622 username = mobj.group(1)
2623
2624 # Download video ids using YouTube Data API. Result size per
2625 # query is limited (currently to 50 videos) so we need to query
2626 # page by page until there are no video ids - it means we got
2627 # all of them.
2628
2629 video_ids = []
2630 pagenum = 0
2631
2632 while True:
2633 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2634 self.report_download_page(username, start_index)
2635
2636 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2637
2638 try:
2639 page = urllib2.urlopen(request).read()
2640 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2641 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2642 return
2643
2644 # Extract video identifiers
2645 ids_in_page = []
2646
2647 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2648 if mobj.group(1) not in ids_in_page:
2649 ids_in_page.append(mobj.group(1))
2650
2651 video_ids.extend(ids_in_page)
2652
2653 # A little optimization - if current page is not
2654 # "full", ie. does not contain PAGE_SIZE video ids then
2655 # we can assume that this page is the last one - there
2656 # are no more ids on further pages - no need to query
2657 # again.
2658
2659 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2660 break
2661
2662 pagenum += 1
2663
2664 all_ids_count = len(video_ids)
2665 playliststart = self._downloader.params.get('playliststart', 1) - 1
2666 playlistend = self._downloader.params.get('playlistend', -1)
2667
2668 if playlistend == -1:
2669 video_ids = video_ids[playliststart:]
2670 else:
2671 video_ids = video_ids[playliststart:playlistend]
2672
2673 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
2674 (username, all_ids_count, len(video_ids)))
2675
2676 for video_id in video_ids:
2677 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2678
2679
2680 class DepositFilesIE(InfoExtractor):
2681 """Information extractor for depositfiles.com"""
2682
2683 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2684 IE_NAME = u'DepositFiles'
2685
2686 def __init__(self, downloader=None):
2687 InfoExtractor.__init__(self, downloader)
2688
2689 def report_download_webpage(self, file_id):
2690 """Report webpage download."""
2691 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2692
2693 def report_extraction(self, file_id):
2694 """Report information extraction."""
2695 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2696
2697 def _real_extract(self, url):
2698 # At this point we have a new file
2699 self._downloader.increment_downloads()
2700
2701 file_id = url.split('/')[-1]
2702 # Rebuild url in english locale
2703 url = 'http://depositfiles.com/en/files/' + file_id
2704
2705 # Retrieve file webpage with 'Free download' button pressed
2706 free_download_indication = { 'gateway_result' : '1' }
2707 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2708 try:
2709 self.report_download_webpage(file_id)
2710 webpage = urllib2.urlopen(request).read()
2711 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2712 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2713 return
2714
2715 # Search for the real file URL
2716 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2717 if (mobj is None) or (mobj.group(1) is None):
2718 # Try to figure out reason of the error.
2719 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2720 if (mobj is not None) and (mobj.group(1) is not None):
2721 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2722 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2723 else:
2724 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2725 return
2726
2727 file_url = mobj.group(1)
2728 file_extension = os.path.splitext(file_url)[1][1:]
2729
2730 # Search for file title
2731 mobj = re.search(r'<b title="(.*?)">', webpage)
2732 if mobj is None:
2733 self._downloader.trouble(u'ERROR: unable to extract title')
2734 return
2735 file_title = mobj.group(1).decode('utf-8')
2736
2737 try:
2738 # Process file information
2739 self._downloader.process_info({
2740 'id': file_id.decode('utf-8'),
2741 'url': file_url.decode('utf-8'),
2742 'uploader': u'NA',
2743 'upload_date': u'NA',
2744 'title': file_title,
2745 'stitle': file_title,
2746 'ext': file_extension.decode('utf-8'),
2747 'format': u'NA',
2748 'player_url': None,
2749 })
2750 except UnavailableVideoError, err:
2751 self._downloader.trouble(u'ERROR: unable to download file')
2752
2753
2754 class FacebookIE(InfoExtractor):
2755 """Information Extractor for Facebook"""
2756
2757 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2758 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2759 _NETRC_MACHINE = 'facebook'
2760 _available_formats = ['video', 'highqual', 'lowqual']
2761 _video_extensions = {
2762 'video': 'mp4',
2763 'highqual': 'mp4',
2764 'lowqual': 'mp4',
2765 }
2766 IE_NAME = u'facebook'
2767
2768 def __init__(self, downloader=None):
2769 InfoExtractor.__init__(self, downloader)
2770
2771 def _reporter(self, message):
2772 """Add header and report message."""
2773 self._downloader.to_screen(u'[facebook] %s' % message)
2774
2775 def report_login(self):
2776 """Report attempt to log in."""
2777 self._reporter(u'Logging in')
2778
2779 def report_video_webpage_download(self, video_id):
2780 """Report attempt to download video webpage."""
2781 self._reporter(u'%s: Downloading video webpage' % video_id)
2782
2783 def report_information_extraction(self, video_id):
2784 """Report attempt to extract video information."""
2785 self._reporter(u'%s: Extracting video information' % video_id)
2786
2787 def _parse_page(self, video_webpage):
2788 """Extract video information from page"""
2789 # General data
2790 data = {'title': r'\("video_title", "(.*?)"\)',
2791 'description': r'<div class="datawrap">(.*?)</div>',
2792 'owner': r'\("video_owner_name", "(.*?)"\)',
2793 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
2794 }
2795 video_info = {}
2796 for piece in data.keys():
2797 mobj = re.search(data[piece], video_webpage)
2798 if mobj is not None:
2799 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2800
2801 # Video urls
2802 video_urls = {}
2803 for fmt in self._available_formats:
2804 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
2805 if mobj is not None:
2806 # URL is in a Javascript segment inside an escaped Unicode format within
2807 # the generally utf-8 page
2808 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2809 video_info['video_urls'] = video_urls
2810
2811 return video_info
2812
2813 def _real_initialize(self):
2814 if self._downloader is None:
2815 return
2816
2817 useremail = None
2818 password = None
2819 downloader_params = self._downloader.params
2820
2821 # Attempt to use provided username and password or .netrc data
2822 if downloader_params.get('username', None) is not None:
2823 useremail = downloader_params['username']
2824 password = downloader_params['password']
2825 elif downloader_params.get('usenetrc', False):
2826 try:
2827 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2828 if info is not None:
2829 useremail = info[0]
2830 password = info[2]
2831 else:
2832 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2833 except (IOError, netrc.NetrcParseError), err:
2834 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
2835 return
2836
2837 if useremail is None:
2838 return
2839
2840 # Log in
2841 login_form = {
2842 'email': useremail,
2843 'pass': password,
2844 'login': 'Log+In'
2845 }
2846 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2847 try:
2848 self.report_login()
2849 login_results = urllib2.urlopen(request).read()
2850 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2851 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2852 return
2853 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2854 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
2855 return
2856
2857 def _real_extract(self, url):
2858 mobj = re.match(self._VALID_URL, url)
2859 if mobj is None:
2860 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2861 return
2862 video_id = mobj.group('ID')
2863
2864 # Get video webpage
2865 self.report_video_webpage_download(video_id)
2866 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
2867 try:
2868 page = urllib2.urlopen(request)
2869 video_webpage = page.read()
2870 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2871 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2872 return
2873
2874 # Start extracting information
2875 self.report_information_extraction(video_id)
2876
2877 # Extract information
2878 video_info = self._parse_page(video_webpage)
2879
2880 # uploader
2881 if 'owner' not in video_info:
2882 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
2883 return
2884 video_uploader = video_info['owner']
2885
2886 # title
2887 if 'title' not in video_info:
2888 self._downloader.trouble(u'ERROR: unable to extract video title')
2889 return
2890 video_title = video_info['title']
2891 video_title = video_title.decode('utf-8')
2892 video_title = sanitize_title(video_title)
2893
2894 simple_title = _simplify_title(video_title)
2895
2896 # thumbnail image
2897 if 'thumbnail' not in video_info:
2898 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
2899 video_thumbnail = ''
2900 else:
2901 video_thumbnail = video_info['thumbnail']
2902
2903 # upload date
2904 upload_date = u'NA'
2905 if 'upload_date' in video_info:
2906 upload_time = video_info['upload_date']
2907 timetuple = email.utils.parsedate_tz(upload_time)
2908 if timetuple is not None:
2909 try:
2910 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
2911 except:
2912 pass
2913
2914 # description
2915 video_description = video_info.get('description', 'No description available.')
2916
2917 url_map = video_info['video_urls']
2918 if len(url_map.keys()) > 0:
2919 # Decide which formats to download
2920 req_format = self._downloader.params.get('format', None)
2921 format_limit = self._downloader.params.get('format_limit', None)
2922
2923 if format_limit is not None and format_limit in self._available_formats:
2924 format_list = self._available_formats[self._available_formats.index(format_limit):]
2925 else:
2926 format_list = self._available_formats
2927 existing_formats = [x for x in format_list if x in url_map]
2928 if len(existing_formats) == 0:
2929 self._downloader.trouble(u'ERROR: no known formats available for video')
2930 return
2931 if req_format is None:
2932 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2933 elif req_format == 'worst':
2934 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
2935 elif req_format == '-1':
2936 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2937 else:
2938 # Specific format
2939 if req_format not in url_map:
2940 self._downloader.trouble(u'ERROR: requested format not available')
2941 return
2942 video_url_list = [(req_format, url_map[req_format])] # Specific format
2943
2944 for format_param, video_real_url in video_url_list:
2945
2946 # At this point we have a new video
2947 self._downloader.increment_downloads()
2948
2949 # Extension
2950 video_extension = self._video_extensions.get(format_param, 'mp4')
2951
2952 try:
2953 # Process video information
2954 self._downloader.process_info({
2955 'id': video_id.decode('utf-8'),
2956 'url': video_real_url.decode('utf-8'),
2957 'uploader': video_uploader.decode('utf-8'),
2958 'upload_date': upload_date,
2959 'title': video_title,
2960 'stitle': simple_title,
2961 'ext': video_extension.decode('utf-8'),
2962 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2963 'thumbnail': video_thumbnail.decode('utf-8'),
2964 'description': video_description.decode('utf-8'),
2965 'player_url': None,
2966 })
2967 except UnavailableVideoError, err:
2968 self._downloader.trouble(u'\nERROR: unable to download video')
2969
2970 class BlipTVIE(InfoExtractor):
2971 """Information extractor for blip.tv"""
2972
2973 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2974 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2975 IE_NAME = u'blip.tv'
2976
2977 def report_extraction(self, file_id):
2978 """Report information extraction."""
2979 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2980
2981 def report_direct_download(self, title):
2982 """Report information extraction."""
2983 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2984
2985 def _real_extract(self, url):
2986 mobj = re.match(self._VALID_URL, url)
2987 if mobj is None:
2988 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2989 return
2990
2991 if '?' in url:
2992 cchar = '&'
2993 else:
2994 cchar = '?'
2995 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2996 request = urllib2.Request(json_url)
2997 self.report_extraction(mobj.group(1))
2998 info = None
2999 try:
3000 urlh = urllib2.urlopen(request)
3001 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
3002 basename = url.split('/')[-1]
3003 title,ext = os.path.splitext(basename)
3004 title = title.decode('UTF-8')
3005 ext = ext.replace('.', '')
3006 self.report_direct_download(title)
3007 info = {
3008 'id': title,
3009 'url': url,
3010 'title': title,
3011 'stitle': _simplify_title(title),
3012 'ext': ext,
3013 'urlhandle': urlh
3014 }
3015 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3016 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
3017 return
3018 if info is None: # Regular URL
3019 try:
3020 json_code = urlh.read()
3021 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3022 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
3023 return
3024
3025 try:
3026 json_data = json.loads(json_code)
3027 if 'Post' in json_data:
3028 data = json_data['Post']
3029 else:
3030 data = json_data
3031
3032 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
3033 video_url = data['media']['url']
3034 umobj = re.match(self._URL_EXT, video_url)
3035 if umobj is None:
3036 raise ValueError('Can not determine filename extension')
3037 ext = umobj.group(1)
3038
3039 info = {
3040 'id': data['item_id'],
3041 'url': video_url,
3042 'uploader': data['display_name'],
3043 'upload_date': upload_date,
3044 'title': data['title'],
3045 'stitle': _simplify_title(data['title']),
3046 'ext': ext,
3047 'format': data['media']['mimeType'],
3048 'thumbnail': data['thumbnailUrl'],
3049 'description': data['description'],
3050 'player_url': data['embedUrl']
3051 }
3052 except (ValueError,KeyError), err:
3053 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
3054 return
3055
3056 self._downloader.increment_downloads()
3057
3058 try:
3059 self._downloader.process_info(info)
3060 except UnavailableVideoError, err:
3061 self._downloader.trouble(u'\nERROR: unable to download video')
3062
3063
3064 class MyVideoIE(InfoExtractor):
3065 """Information Extractor for myvideo.de."""
3066
3067 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
3068 IE_NAME = u'myvideo'
3069
3070 def __init__(self, downloader=None):
3071 InfoExtractor.__init__(self, downloader)
3072
3073 def report_download_webpage(self, video_id):
3074 """Report webpage download."""
3075 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
3076
3077 def report_extraction(self, video_id):
3078 """Report information extraction."""
3079 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
3080
3081 def _real_extract(self,url):
3082 mobj = re.match(self._VALID_URL, url)
3083 if mobj is None:
3084 self._download.trouble(u'ERROR: invalid URL: %s' % url)
3085 return
3086
3087 video_id = mobj.group(1)
3088
3089 # Get video webpage
3090 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
3091 try:
3092 self.report_download_webpage(video_id)
3093 webpage = urllib2.urlopen(request).read()
3094 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3095 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
3096 return
3097
3098 self.report_extraction(video_id)
3099 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
3100 webpage)
3101 if mobj is None:
3102 self._downloader.trouble(u'ERROR: unable to extract media URL')
3103 return
3104 video_url = mobj.group(1) + ('/%s.flv' % video_id)
3105
3106 mobj = re.search('<title>([^<]+)</title>', webpage)
3107 if mobj is None:
3108 self._downloader.trouble(u'ERROR: unable to extract title')
3109 return
3110
3111 video_title = mobj.group(1)
3112 video_title = sanitize_title(video_title)
3113
3114 simple_title = _simplify_title(video_title)
3115
3116 try:
3117 self._downloader.process_info({
3118 'id': video_id,
3119 'url': video_url,
3120 'uploader': u'NA',
3121 'upload_date': u'NA',
3122 'title': video_title,
3123 'stitle': simple_title,
3124 'ext': u'flv',
3125 'format': u'NA',
3126 'player_url': None,
3127 })
3128 except UnavailableVideoError:
3129 self._downloader.trouble(u'\nERROR: Unable to download video')
3130
3131 class ComedyCentralIE(InfoExtractor):
3132 """Information extractor for The Daily Show and Colbert Report """
3133
3134 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
3135 IE_NAME = u'comedycentral'
3136
3137 def report_extraction(self, episode_id):
3138 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
3139
3140 def report_config_download(self, episode_id):
3141 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
3142
3143 def report_index_download(self, episode_id):
3144 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
3145
3146 def report_player_url(self, episode_id):
3147 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
3148
3149 def _real_extract(self, url):
3150 mobj = re.match(self._VALID_URL, url)
3151 if mobj is None:
3152 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3153 return
3154
3155 if mobj.group('shortname'):
3156 if mobj.group('shortname') in ('tds', 'thedailyshow'):
3157 url = u'http://www.thedailyshow.com/full-episodes/'
3158 else:
3159 url = u'http://www.colbertnation.com/full-episodes/'
3160 mobj = re.match(self._VALID_URL, url)
3161 assert mobj is not None
3162
3163 dlNewest = not mobj.group('episode')
3164 if dlNewest:
3165 epTitle = mobj.group('showname')
3166 else:
3167 epTitle = mobj.group('episode')
3168
3169 req = urllib2.Request(url)
3170 self.report_extraction(epTitle)
3171 try:
3172 htmlHandle = urllib2.urlopen(req)
3173 html = htmlHandle.read()
3174 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3175 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3176 return
3177 if dlNewest:
3178 url = htmlHandle.geturl()
3179 mobj = re.match(self._VALID_URL, url)
3180 if mobj is None:
3181 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
3182 return
3183 if mobj.group('episode') == '':
3184 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
3185 return
3186 epTitle = mobj.group('episode')
3187
3188 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
3189 if len(mMovieParams) == 0:
3190 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
3191 return
3192
3193 playerUrl_raw = mMovieParams[0][0]
3194 self.report_player_url(epTitle)
3195 try:
3196 urlHandle = urllib2.urlopen(playerUrl_raw)
3197 playerUrl = urlHandle.geturl()
3198 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3199 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
3200 return
3201
3202 uri = mMovieParams[0][1]
3203 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
3204 self.report_index_download(epTitle)
3205 try:
3206 indexXml = urllib2.urlopen(indexUrl).read()
3207 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3208 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
3209 return
3210
3211 idoc = xml.etree.ElementTree.fromstring(indexXml)
3212 itemEls = idoc.findall('.//item')
3213 for itemEl in itemEls:
3214 mediaId = itemEl.findall('./guid')[0].text
3215 shortMediaId = mediaId.split(':')[-1]
3216 showId = mediaId.split(':')[-2].replace('.com', '')
3217 officialTitle = itemEl.findall('./title')[0].text
3218 officialDate = itemEl.findall('./pubDate')[0].text
3219
3220 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
3221 urllib.urlencode({'uri': mediaId}))
3222 configReq = urllib2.Request(configUrl)
3223 self.report_config_download(epTitle)
3224 try:
3225 configXml = urllib2.urlopen(configReq).read()
3226 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3227 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
3228 return
3229
3230 cdoc = xml.etree.ElementTree.fromstring(configXml)
3231 turls = []
3232 for rendition in cdoc.findall('.//rendition'):
3233 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
3234 turls.append(finfo)
3235
3236 if len(turls) == 0:
3237 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
3238 continue
3239
3240 # For now, just pick the highest bitrate
3241 format,video_url = turls[-1]
3242
3243 self._downloader.increment_downloads()
3244
3245 effTitle = showId + u'-' + epTitle
3246 info = {
3247 'id': shortMediaId,
3248 'url': video_url,
3249 'uploader': showId,
3250 'upload_date': officialDate,
3251 'title': effTitle,
3252 'stitle': _simplify_title(effTitle),
3253 'ext': 'mp4',
3254 'format': format,
3255 'thumbnail': None,
3256 'description': officialTitle,
3257 'player_url': playerUrl
3258 }
3259
3260 try:
3261 self._downloader.process_info(info)
3262 except UnavailableVideoError, err:
3263 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId)
3264 continue
3265
3266
3267 class EscapistIE(InfoExtractor):
3268 """Information extractor for The Escapist """
3269
3270 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
3271 IE_NAME = u'escapist'
3272
3273 def report_extraction(self, showName):
3274 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
3275
3276 def report_config_download(self, showName):
3277 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
3278
3279 def _real_extract(self, url):
3280 htmlParser = HTMLParser.HTMLParser()
3281
3282 mobj = re.match(self._VALID_URL, url)
3283 if mobj is None:
3284 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3285 return
3286 showName = mobj.group('showname')
3287 videoId = mobj.group('episode')
3288
3289 self.report_extraction(showName)
3290 try:
3291 webPage = urllib2.urlopen(url).read()
3292 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3293 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
3294 return
3295
3296 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
3297 description = htmlParser.unescape(descMatch.group(1))
3298 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
3299 imgUrl = htmlParser.unescape(imgMatch.group(1))
3300 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
3301 playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
3302 configUrlMatch = re.search('config=(.*)$', playerUrl)
3303 configUrl = urllib2.unquote(configUrlMatch.group(1))
3304
3305 self.report_config_download(showName)
3306 try:
3307 configJSON = urllib2.urlopen(configUrl).read()
3308 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3309 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
3310 return
3311
3312 # Technically, it's JavaScript, not JSON
3313 configJSON = configJSON.replace("'", '"')
3314
3315 try:
3316 config = json.loads(configJSON)
3317 except (ValueError,), err:
3318 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
3319 return
3320
3321 playlist = config['playlist']
3322 videoUrl = playlist[1]['url']
3323
3324 self._downloader.increment_downloads()
3325 info = {
3326 'id': videoId,
3327 'url': videoUrl,
3328 'uploader': showName,
3329 'upload_date': None,
3330 'title': showName,
3331 'stitle': _simplify_title(showName),
3332 'ext': 'flv',
3333 'format': 'flv',
3334 'thumbnail': imgUrl,
3335 'description': description,
3336 'player_url': playerUrl,
3337 }
3338
3339 try:
3340 self._downloader.process_info(info)
3341 except UnavailableVideoError, err:
3342 self._downloader.trouble(u'\nERROR: unable to download ' + videoId)
3343
3344
3345 class CollegeHumorIE(InfoExtractor):
3346 """Information extractor for collegehumor.com"""
3347
3348 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
3349 IE_NAME = u'collegehumor'
3350
3351 def report_webpage(self, video_id):
3352 """Report information extraction."""
3353 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3354
3355 def report_extraction(self, video_id):
3356 """Report information extraction."""
3357 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3358
3359 def _real_extract(self, url):
3360 htmlParser = HTMLParser.HTMLParser()
3361
3362 mobj = re.match(self._VALID_URL, url)
3363 if mobj is None:
3364 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3365 return
3366 video_id = mobj.group('videoid')
3367
3368 self.report_webpage(video_id)
3369 request = urllib2.Request(url)
3370 try:
3371 webpage = urllib2.urlopen(request).read()
3372 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3373 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3374 return
3375
3376 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
3377 if m is None:
3378 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
3379 return
3380 internal_video_id = m.group('internalvideoid')
3381
3382 info = {
3383 'id': video_id,
3384 'internal_id': internal_video_id,
3385 }
3386
3387 self.report_extraction(video_id)
3388 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
3389 try:
3390 metaXml = urllib2.urlopen(xmlUrl).read()
3391 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3392 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
3393 return
3394
3395 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3396 try:
3397 videoNode = mdoc.findall('./video')[0]
3398 info['description'] = videoNode.findall('./description')[0].text
3399 info['title'] = videoNode.findall('./caption')[0].text
3400 info['stitle'] = _simplify_title(info['title'])
3401 info['url'] = videoNode.findall('./file')[0].text
3402 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
3403 info['ext'] = info['url'].rpartition('.')[2]
3404 info['format'] = info['ext']
3405 except IndexError:
3406 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3407 return
3408
3409 self._downloader.increment_downloads()
3410
3411 try:
3412 self._downloader.process_info(info)
3413 except UnavailableVideoError, err:
3414 self._downloader.trouble(u'\nERROR: unable to download video')
3415
3416
3417 class XVideosIE(InfoExtractor):
3418 """Information extractor for xvideos.com"""
3419
3420 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
3421 IE_NAME = u'xvideos'
3422
3423 def report_webpage(self, video_id):
3424 """Report information extraction."""
3425 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3426
3427 def report_extraction(self, video_id):
3428 """Report information extraction."""
3429 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3430
3431 def _real_extract(self, url):
3432 htmlParser = HTMLParser.HTMLParser()
3433
3434 mobj = re.match(self._VALID_URL, url)
3435 if mobj is None:
3436 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3437 return
3438 video_id = mobj.group(1).decode('utf-8')
3439
3440 self.report_webpage(video_id)
3441
3442 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
3443 try:
3444 webpage = urllib2.urlopen(request).read()
3445 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3446 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3447 return
3448
3449 self.report_extraction(video_id)
3450
3451
3452 # Extract video URL
3453 mobj = re.search(r'flv_url=(.+?)&', webpage)
3454 if mobj is None:
3455 self._downloader.trouble(u'ERROR: unable to extract video url')
3456 return
3457 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
3458
3459
3460 # Extract title
3461 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
3462 if mobj is None:
3463 self._downloader.trouble(u'ERROR: unable to extract video title')
3464 return
3465 video_title = mobj.group(1).decode('utf-8')
3466
3467
3468 # Extract video thumbnail
3469 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]/[a-fA-F0-9]/[a-fA-F0-9]/([a-fA-F0-9.]+jpg)', webpage)
3470 if mobj is None:
3471 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3472 return
3473 video_thumbnail = mobj.group(1).decode('utf-8')
3474
3475
3476
3477 self._downloader.increment_downloads()
3478 info = {
3479 'id': video_id,
3480 'url': video_url,
3481 'uploader': None,
3482 'upload_date': None,
3483 'title': video_title,
3484 'stitle': _simplify_title(video_title),
3485 'ext': 'flv',
3486 'format': 'flv',
3487 'thumbnail': video_thumbnail,
3488 'description': None,
3489 'player_url': None,
3490 }
3491
3492 try:
3493 self._downloader.process_info(info)
3494 except UnavailableVideoError, err:
3495 self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
3496
3497
3498 class SoundcloudIE(InfoExtractor):
3499 """Information extractor for soundcloud.com
3500 To access the media, the uid of the song and a stream token
3501 must be extracted from the page source and the script must make
3502 a request to media.soundcloud.com/crossdomain.xml. Then
3503 the media can be grabbed by requesting from an url composed
3504 of the stream token and uid
3505 """
3506
3507 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
3508 IE_NAME = u'soundcloud'
3509
def __init__(self, downloader=None):
    """Create the extractor; downloader is passed through to InfoExtractor.__init__."""
    InfoExtractor.__init__(self, downloader)
3512
def report_webpage(self, video_id):
    """Report webpage download."""
    message = u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)
    self._downloader.to_screen(message)
3516
def report_extraction(self, video_id):
    """Report information extraction."""
    message = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
    self._downloader.to_screen(message)
3520
3521 def _real_extract(self, url):
3522 htmlParser = HTMLParser.HTMLParser()
3523
3524 mobj = re.match(self._VALID_URL, url)
3525 if mobj is None:
3526 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3527 return
3528
3529 # extract uploader (which is in the url)
3530 uploader = mobj.group(1).decode('utf-8')
3531 # extract simple title (uploader + slug of song title)
3532 slug_title = mobj.group(2).decode('utf-8')
3533 simple_title = uploader + '-' + slug_title
3534
3535 self.report_webpage('%s/%s' % (uploader, slug_title))
3536
3537 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
3538 try:
3539 webpage = urllib2.urlopen(request).read()
3540 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3541 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
3542 return
3543
3544 self.report_extraction('%s/%s' % (uploader, slug_title))
3545
3546 # extract uid and stream token that soundcloud hands out for access
3547 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
3548 if mobj:
3549 video_id = mobj.group(1)
3550 stream_token = mobj.group(2)
3551
3552 # extract unsimplified title
3553 mobj = re.search('"title":"(.*?)",', webpage)
3554 if mobj:
3555 title = mobj.group(1)
3556
3557 # construct media url (with uid/token)
3558 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
3559 mediaURL = mediaURL % (video_id, stream_token)
3560
3561 # description
3562 description = u'No description available'
3563 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
3564 if mobj:
3565 description = mobj.group(1)
3566
3567 # upload date
3568 upload_date = None
3569 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
3570 if mobj:
3571 try:
3572 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
3573 except Exception, e:
3574 print str(e)
3575
3576 # for soundcloud, a request to a cross domain is required for cookies
3577 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
3578
3579 try:
3580 self._downloader.process_info({
3581 'id': video_id.decode('utf-8'),
3582 'url': mediaURL,
3583 'uploader': uploader.decode('utf-8'),
3584 'upload_date': upload_date,
3585 'title': simple_title.decode('utf-8'),
3586 'stitle': simple_title.decode('utf-8'),
3587 'ext': u'mp3',
3588 'format': u'NA',
3589 'player_url': None,
3590 'description': description.decode('utf-8')
3591 })
3592 except UnavailableVideoError:
3593 self._downloader.trouble(u'\nERROR: unable to download video')
3594
3595
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
    IE_NAME = u'infoq'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape a presentation page and pass the info dict to the downloader.

        Every failure is reported through self._downloader.trouble() and
        ends the extraction; nothing is returned.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # _VALID_URL accepts URLs without a scheme, but urllib2 does not
        if not re.match(r'https?://', url):
            url = 'http://' + url

        self.report_webpage(url)

        request = urllib2.Request(url)
        try:
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error):
            err = sys.exc_info()[1]
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        self.report_extraction(url)

        # Extract video URL (the page carries the path base64-encoded)
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')

        # Split on the LAST dot only, so file names that contain extra
        # dots still yield an id and an extension
        video_filename = video_url.split('/')[-1]
        video_id, dot, extension = video_filename.rpartition('.')
        if not dot:
            self._downloader.trouble(u'ERROR: unable to extract video extension')
            return

        self._downloader.increment_downloads()
        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'stitle': _simplify_title(video_title),
            'ext': extension,
            'format': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
            'player_url': None,
        }

        try:
            self._downloader.process_info(info)
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download ' + video_url)
3673
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json

        When jsonData[fmt] is keyed by bitrate, the entry for the
        requested bitrate is returned, falling back to max() of the keys
        for None, 'best' or an unknown bitrate. When it cannot be indexed
        that way, jsonData[fmt] itself is returned.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]

        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none can be opened"""
        for url in url_list:
            try:
                handle = urllib2.urlopen(url)
            except (urllib2.URLError, httplib.HTTPException, socket.error):
                continue
            # only reachability matters here; do not leak the connection
            handle.close()
            return url

        return None

    def _print_formats(self, formats):
        """Write a table of the available formats straight to stdout."""
        sys.stdout.write('Available formats:\n')
        for fmt in formats:
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    sys.stdout.write('%s\t%s\t[%s]\n' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    sys.stdout.write('%s\t%s\t[%s]\n' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Look up a cloudcast through the JSON API and pass it to the downloader.

        Failures are reported through self._downloader.trouble(); nothing
        is returned.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request from the path components captured by
        # _VALID_URL, so it is right with or without a trailing slash
        json_url = 'http://www.mixcloud.com/api/1/cloudcast/%s/%s.json' % (mobj.group(1), mobj.group(2))
        # retrieve .json file with links to files
        request = urllib2.Request(json_url)
        try:
            self.report_download_json(json_url)
            jsonData = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error):
            err = sys.exc_info()[1]
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        file_url = None
        format_param = None
        if req_format is None or req_format == 'best':
            # take the first format that has a reachable url
            for format_param in formats:
                file_url = self.check_urls(self.get_urls(formats, format_param))
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            file_url = self.check_urls(self.get_urls(formats, req_format))
            format_param = req_format

        if file_url is None:
            self._downloader.trouble(u'ERROR: unable to find a working download URL')
            return

        if format_param is None:
            format_name = u'NA'
        else:
            format_name = format_param.decode('utf-8')

        # We have audio
        self._downloader.increment_downloads()
        try:
            # Process file information
            self._downloader.process_info({
                'id': file_id,
                'url': file_url.decode('utf-8'),
                'uploader': uploader,
                'upload_date': u'NA',
                'title': json_data['name'],
                'stitle': _simplify_title(json_data['name']),
                'ext': file_url.split('.')[-1].decode('utf-8'),
                'format': format_name,
                'thumbnail': json_data['thumbnail_url'],
                'description': json_data['description'],
                'player_url': player_url.decode('utf-8'),
            })
        except UnavailableVideoError:
            self._downloader.trouble(u'ERROR: unable to download file')
3794
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom

    Handles three kinds of URL: a single video (course and video given),
    a course page (only course given) and the site root. The last two
    are treated as playlists whose links are extracted one by one.
    """

    _VALID_URL = r'^(?:https?://)?openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    # Prefix shared by every URL this extractor builds
    _MAIN_FOLDER_URL = 'http://openclassroom.stanford.edu/MainFolder/'

    def report_download_webpage(self, objid):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Dispatch on what the URL points at: a video, a course or the root page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        course = mobj.group('course')
        video = mobj.group('video')
        if course and video: # A specific video
            self._extract_video(course, video)
        elif course: # A course page
            self._extract_course(url, course)
        else: # Root page
            self._extract_root()

    def _extract_video(self, course, video):
        """Fetch the metadata XML of one video and pass it to the downloader."""
        info = {
            'id': _simplify_title(course + '_' + video),
        }

        self.report_extraction(info['id'])
        baseUrl = self._MAIN_FOLDER_URL + 'courses/' + course + '/videos/'
        xmlUrl = baseUrl + video + '.xml'
        try:
            metaXml = urllib2.urlopen(xmlUrl).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error):
            err = sys.exc_info()[1]
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
            return
        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            info['title'] = mdoc.findall('./title')[0].text
            info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return
        info['stitle'] = _simplify_title(info['title'])
        info['ext'] = info['url'].rpartition('.')[2]
        info['format'] = info['ext']
        self._downloader.increment_downloads()
        try:
            self._downloader.process_info(info)
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download video')

    def _extract_course(self, url, course):
        """Download a course page and extract every video page it links to."""
        # _VALID_URL accepts URLs without a scheme, but urllib2 does not
        if not re.match(r'https?://', url):
            url = 'http://' + url

        unescapeHTML = HTMLParser.HTMLParser().unescape
        info = {
            'id': _simplify_title(course),
            'type': 'playlist',
        }

        self.report_download_webpage(info['id'])
        try:
            coursepage = urllib2.urlopen(url).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error):
            err = sys.exc_info()[1]
            self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
            return

        # The first <h1> is used as title; fall back to the id without one
        m = re.search('<h1>([^<]+)</h1>', coursepage)
        if m:
            info['title'] = unescapeHTML(m.group(1))
        else:
            info['title'] = info['id']
        info['stitle'] = _simplify_title(info['title'])

        m = re.search('<description>([^<]+)</description>', coursepage)
        if m:
            info['description'] = unescapeHTML(m.group(1))

        self._extract_references(info, coursepage, r'<a href="(VideoPage.php\?[^"]+)">')

    def _extract_root(self):
        """Download the home page and extract every course page it links to."""
        info = {
            'id': 'Stanford OpenClassroom',
            'type': 'playlist',
        }

        self.report_download_webpage(info['id'])
        rootURL = self._MAIN_FOLDER_URL + 'HomePage.php'
        try:
            rootpage = urllib2.urlopen(rootURL).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error):
            err = sys.exc_info()[1]
            self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
            return

        info['title'] = info['id']
        info['stitle'] = _simplify_title(info['title'])

        self._extract_references(info, rootpage, r'<a href="(CoursePage.php\?[^"]+)">')

    def _extract_references(self, info, page, link_regex):
        """Store the links matching link_regex in info['list'] and extract each."""
        unescapeHTML = HTMLParser.HTMLParser().unescape
        links = _orderedSet(re.findall(link_regex, page))
        info['list'] = [
            {
                'type': 'reference',
                'url': self._MAIN_FOLDER_URL + unescapeHTML(link),
            }
            for link in links]

        for entry in info['list']:
            self.extract(entry['url'])
3913
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape a video page plus its mediaGen metadata and pass the info on.

        Every failure is reported through self._downloader.trouble() and
        ends the extraction; nothing is returned.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = urllib2.Request(url)
        try:
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error):
            err = sys.exc_info()[1]
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = urllib2.Request(videogen_url)
        try:
            metadataXml = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error):
            err = sys.exc_info()[1]
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            self._downloader.trouble(u'ERROR: unable to find any video rendition')
            return

        # For now, always pick the highest quality.
        # NOTE(review): this takes the last rendition listed; that it is
        # the best one is assumed, not checked.
        rendition = renditions[-1]

        src = rendition.find('./src')
        if src is None:
            self._downloader.trouble(u'ERROR: unable to extract video url from rendition')
            return

        try:
            _, _, ext = rendition.attrib['type'].partition('/')
            video_format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = src.text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        self._downloader.increment_downloads()
        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'title': video_title,
            'stitle': _simplify_title(video_title),
            'ext': ext,
            'format': video_format,
        }

        try:
            self._downloader.process_info(info)
        except UnavailableVideoError:
            self._downloader.trouble(u'\nERROR: unable to download ' + video_id)
4007
4008
class PostProcessor(object):
    """Base class for post processors.

    A downloader keeps a chain of PostProcessor objects, each added
    through the downloader's add_post_processor() method. After a
    successful download the downloader calls run() on every member of
    the chain in turn: the first one gets the initial argument, each
    later one gets whatever the previous run() returned.

    The chain ends when its last member has run, or as soon as one
    run() returns None.

    Like InfoExtractor objects, PostProcessor objects take part in a
    "mutual registration" process with their downloader.
    """

    # Downloader this post processor is attached to (None while detached)
    _downloader = None

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Attach this post processor to the given downloader."""
        self._downloader = downloader

    def run(self, information):
        """Process one downloaded file.

        "information" is a dictionary like the ones InfoExtractors
        build, with one extra field, "filepath", that points to the
        downloaded file.

        Return None to stop the postprocessing chain, or an information
        dictionary (possibly the received one with some fields changed)
        to be handed to the next post processor.

        This method may also raise a PostProcessingError, which the
        calling downloader takes into account.

        The base implementation does nothing and returns its argument.
        """
        return information
4054
class AudioConversionError(Exception):
    """Raised when ffmpeg is missing or fails to convert a file.

    The text is available both as str(error) and, for existing callers,
    as the "message" attribute.
    """

    def __init__(self, message):
        # Derive from Exception (not BaseException) so that ordinary
        # "except Exception" handlers see conversion failures, and
        # initialise the base class so str() carries the message.
        Exception.__init__(self, message)
        self.message = message
4058
class FFmpegExtractAudioPP(PostProcessor):
    """Post processor that turns a downloaded file into an audio-only file.

    ffprobe is used to find the audio codec of the download and ffmpeg
    to copy or transcode the audio stream.
    """

    def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, keepvideo=False):
        PostProcessor.__init__(self, downloader)
        if preferredcodec is None:
            preferredcodec = 'best'
        self._preferredcodec = preferredcodec
        self._preferredquality = preferredquality
        self._keepvideo = keepvideo

    @staticmethod
    def get_audio_codec(path):
        """Return the codec name ffprobe reports for the audio stream of path.

        Returns None when ffprobe cannot be run, exits with a non-zero
        status, or reports no audio stream.
        """
        try:
            cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)]
            devnull = open(os.path.devnull, 'w')
            try:
                handle = subprocess.Popen(cmd, stderr=devnull, stdout=subprocess.PIPE)
                output = handle.communicate()[0]
            finally:
                # the child holds its own copy; do not leak ours
                devnull.close()
            if handle.wait() != 0:
                return None
        except (IOError, OSError):
            return None
        # Remember the most recent codec_name line and return it once a
        # codec_type=audio line is reached.
        audio_codec = None
        for line in output.split('\n'):
            if line.startswith('codec_name='):
                audio_codec = line.split('=')[1].strip()
            elif line.strip() == 'codec_type=audio' and audio_codec is not None:
                return audio_codec
        return None

    @staticmethod
    def run_ffmpeg(path, out_path, codec, more_opts):
        """Run ffmpeg to write the audio of path to out_path.

        codec is passed as -acodec unless it is None; more_opts is a
        list of extra ffmpeg arguments.

        Raises AudioConversionError when ffmpeg is not found or exits
        with a non-zero status; other IOError/OSError are re-raised.
        """
        if codec is None:
            acodec_opts = []
        else:
            acodec_opts = ['-acodec', codec]
        cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)]
        try:
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            stdout, stderr = p.communicate()
        except (IOError, OSError):
            e = sys.exc_info()[1]
            # errno 2 is ENOENT: the ffmpeg executable does not exist
            if isinstance(e, OSError) and e.errno == 2:
                raise AudioConversionError('ffmpeg not found. Please install ffmpeg.')
            raise
        if p.returncode != 0:
            # the last line of stderr is used as the error description
            msg = stderr.strip().split('\n')[-1]
            raise AudioConversionError(msg)

    def run(self, information):
        """Extract the audio of information['filepath'] into a new file.

        Returns the information dict with 'filepath' pointing at the
        audio file, or None (stopping the postprocessing chain) when the
        codec cannot be determined, the conversion fails, or the
        original file cannot be removed.
        """
        path = information['filepath']

        filecodec = self.get_audio_codec(path)
        if filecodec is None:
            self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
            return None

        more_opts = []
        if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 'aac'):
            if self._preferredcodec == 'm4a' and filecodec == 'aac':
                # Lossless, but in another container
                acodec = 'copy'
                extension = self._preferredcodec
                more_opts = ['-absf', 'aac_adtstoasc']
            elif filecodec in ['aac', 'mp3', 'vorbis']:
                # Lossless if possible
                acodec = 'copy'
                extension = filecodec
                if filecodec == 'aac':
                    more_opts = ['-f', 'adts']
                if filecodec == 'vorbis':
                    extension = 'ogg'
            else:
                # MP3 otherwise.
                acodec = 'libmp3lame'
                extension = 'mp3'
                more_opts = []
                if self._preferredquality is not None:
                    more_opts += ['-ab', self._preferredquality]
        else:
            # We convert the audio (lossy)
            acodec = {'mp3': 'libmp3lame', 'aac': 'aac', 'm4a': 'aac', 'vorbis': 'libvorbis', 'wav': None}[self._preferredcodec]
            extension = self._preferredcodec
            more_opts = []
            if self._preferredquality is not None:
                more_opts += ['-ab', self._preferredquality]
            if self._preferredcodec == 'aac':
                more_opts += ['-f', 'adts']
            if self._preferredcodec == 'm4a':
                more_opts += ['-absf', 'aac_adtstoasc']
            if self._preferredcodec == 'vorbis':
                extension = 'ogg'
            if self._preferredcodec == 'wav':
                extension = 'wav'
                more_opts += ['-f', 'wav']

        prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups
        new_path = prefix + sep + extension
        self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path)
        try:
            self.run_ffmpeg(path, new_path, acodec, more_opts)
        except AudioConversionError:
            e = sys.exc_info()[1]
            self._downloader.to_stderr(u'ERROR: audio conversion failed: ' + e.message)
            return None
        except Exception:
            # KeyboardInterrupt and SystemExit are deliberately not caught
            self._downloader.to_stderr(u'ERROR: error running ffmpeg')
            return None

        # Try to update the date time for extracted audio file.
        if information.get('filetime') is not None:
            try:
                os.utime(_encodeFilename(new_path), (time.time(), information['filetime']))
            except Exception:
                # best effort only: a wrong timestamp is not fatal
                self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')

        if not self._keepvideo:
            try:
                os.remove(_encodeFilename(path))
            except (IOError, OSError):
                self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
                return None

        information['filepath'] = new_path
        return information
4183
4184
def updateSelf(downloader, filename):
    ''' Update the program file with the latest version from the repository

    Downloads UPDATE_URL and overwrites filename with its content,
    unless the download declares the same __version__ as the running
    program. Exits the program through sys.exit() when filename is not
    writable, the download fails or the file cannot be written.
    '''
    # Note: downloader only used for options
    if not os.access(filename, os.W_OK):
        sys.exit('ERROR: no write permissions on %s' % filename)

    downloader.to_screen(u'Updating to latest version...')

    try:
        # Open outside the try/finally: if urlopen itself fails there is
        # no handle to close, and the failure must reach the except
        # clause below rather than be masked by an unbound-name error.
        urlh = urllib.urlopen(UPDATE_URL)
        try:
            newcontent = urlh.read()
        finally:
            urlh.close()
    except (IOError, OSError):
        sys.exit('ERROR: unable to download latest version')

    vmatch = re.search("__version__ = '([^']+)'", newcontent)
    if vmatch is not None and vmatch.group(1) == __version__:
        downloader.to_screen(u'youtube-dl is up-to-date (' + __version__ + ')')
        return

    try:
        outf = open(filename, 'wb')
        try:
            outf.write(newcontent)
        finally:
            outf.close()
    except (IOError, OSError):
        sys.exit('ERROR: unable to overwrite current version')

    downloader.to_screen(u'Updated youtube-dl. Restart youtube-dl to use the new version.')
4217
4218 def parseOpts():
4219 def _readOptions(filename_bytes):
4220 try:
4221 optionf = open(filename_bytes)
4222 except IOError:
4223 return [] # silently skip if file is not present
4224 try:
4225 res = []
4226 for l in optionf:
4227 res += shlex.split(l, comments=True)
4228 finally:
4229 optionf.close()
4230 return res
4231
4232 def _format_option_string(option):
4233 ''' ('-o', '--option') -> -o, --format METAVAR'''
4234
4235 opts = []
4236
4237 if option._short_opts: opts.append(option._short_opts[0])
4238 if option._long_opts: opts.append(option._long_opts[0])
4239 if len(opts) > 1: opts.insert(1, ', ')
4240
4241 if option.takes_value(): opts.append(' %s' % option.metavar)
4242
4243 return "".join(opts)
4244
4245 def _find_term_columns():
4246 columns = os.environ.get('COLUMNS', None)
4247 if columns:
4248 return int(columns)
4249
4250 try:
4251 sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
4252 out,err = sp.communicate()
4253 return int(out.split()[1])
4254 except:
4255 pass
4256 return None
4257
4258 max_width = 80
4259 max_help_position = 80
4260
4261 # No need to wrap help messages if we're on a wide console
4262 columns = _find_term_columns()
4263 if columns: max_width = columns
4264
4265 fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
4266 fmt.format_option_strings = _format_option_string
4267
4268 kw = {
4269 'version' : __version__,
4270 'formatter' : fmt,
4271 'usage' : '%prog [options] url [url...]',
4272 'conflict_handler' : 'resolve',
4273 }
4274
4275 parser = optparse.OptionParser(**kw)
4276
4277 # option groups
4278 general = optparse.OptionGroup(parser, 'General Options')
4279 selection = optparse.OptionGroup(parser, 'Video Selection')
4280 authentication = optparse.OptionGroup(parser, 'Authentication Options')
4281 video_format = optparse.OptionGroup(parser, 'Video Format Options')
4282 postproc = optparse.OptionGroup(parser, 'Post-processing Options')
4283 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
4284 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
4285
4286 general.add_option('-h', '--help',
4287 action='help', help='print this help text and exit')
4288 general.add_option('-v', '--version',
4289 action='version', help='print program version and exit')
4290 general.add_option('-U', '--update',
4291 action='store_true', dest='update_self', help='update this program to latest version')
4292 general.add_option('-i', '--ignore-errors',
4293 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
4294 general.add_option('-r', '--rate-limit',
4295 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
4296 general.add_option('-R', '--retries',
4297 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
4298 general.add_option('--dump-user-agent',
4299 action='store_true', dest='dump_user_agent',
4300 help='display the current browser identification', default=False)
4301 general.add_option('--list-extractors',
4302 action='store_true', dest='list_extractors',
4303 help='List all supported extractors and the URLs they would handle', default=False)
4304
4305 selection.add_option('--playlist-start',
4306 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
4307 selection.add_option('--playlist-end',
4308 dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
4309 selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
4310 selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
4311 selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
4312
4313 authentication.add_option('-u', '--username',
4314 dest='username', metavar='USERNAME', help='account username')
4315 authentication.add_option('-p', '--password',
4316 dest='password', metavar='PASSWORD', help='account password')
4317 authentication.add_option('-n', '--netrc',
4318 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
4319
4320
4321 video_format.add_option('-f', '--format',
4322 action='store', dest='format', metavar='FORMAT', help='video format code')
4323 video_format.add_option('--all-formats',
4324 action='store_const', dest='format', help='download all available video formats', const='all')
4325 video_format.add_option('--prefer-free-formats',
4326 action='store_true', dest='prefer_free_formats', default=False, help='prefer free video formats unless a specific one is requested')
4327 video_format.add_option('--max-quality',
4328 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
4329 video_format.add_option('-F', '--list-formats',
4330 action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
4331
4332
4333 verbosity.add_option('-q', '--quiet',
4334 action='store_true', dest='quiet', help='activates quiet mode', default=False)
4335 verbosity.add_option('-s', '--simulate',
4336 action='store_true', dest='simulate', help='do not download the video and do not write anything to disk', default=False)
4337 verbosity.add_option('--skip-download',
4338 action='store_true', dest='skip_download', help='do not download the video', default=False)
4339 verbosity.add_option('-g', '--get-url',
4340 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
4341 verbosity.add_option('-e', '--get-title',
4342 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
4343 verbosity.add_option('--get-thumbnail',
4344 action='store_true', dest='getthumbnail',
4345 help='simulate, quiet but print thumbnail URL', default=False)
4346 verbosity.add_option('--get-description',
4347 action='store_true', dest='getdescription',
4348 help='simulate, quiet but print video description', default=False)
4349 verbosity.add_option('--get-filename',
4350 action='store_true', dest='getfilename',
4351 help='simulate, quiet but print output filename', default=False)
4352 verbosity.add_option('--get-format',
4353 action='store_true', dest='getformat',
4354 help='simulate, quiet but print output format', default=False)
4355 verbosity.add_option('--no-progress',
4356 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
4357 verbosity.add_option('--console-title',
4358 action='store_true', dest='consoletitle',
4359 help='display progress in console titlebar', default=False)
4360 verbosity.add_option('-v', '--verbose',
4361 action='store_true', dest='verbose', help='print various debugging information', default=False)
4362
4363
4364 filesystem.add_option('-t', '--title',
4365 action='store_true', dest='usetitle', help='use title in file name', default=False)
4366 filesystem.add_option('-l', '--literal',
4367 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
4368 filesystem.add_option('-A', '--auto-number',
4369 action='store_true', dest='autonumber',
4370 help='number downloaded files starting from 00000', default=False)
4371 filesystem.add_option('-o', '--output',
4372 dest='outtmpl', metavar='TEMPLATE', help='output filename template. Use %(stitle)s to get the title, %(uploader)s for the uploader name, %(autonumber)s to get an automatically incremented number, %(ext)s for the filename extension, %(upload_date)s for the upload date (YYYYMMDD), and %% for a literal percent. Use - to output to stdout.')
4373 filesystem.add_option('-a', '--batch-file',
4374 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
4375 filesystem.add_option('-w', '--no-overwrites',
4376 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
4377 filesystem.add_option('-c', '--continue',
4378 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=True)
4379 filesystem.add_option('--no-continue',
4380 action='store_false', dest='continue_dl',
4381 help='do not resume partially downloaded files (restart from beginning)')
4382 filesystem.add_option('--cookies',
4383 dest='cookiefile', metavar='FILE', help='file to read cookies from and dump cookie jar in')
4384 filesystem.add_option('--no-part',
4385 action='store_true', dest='nopart', help='do not use .part files', default=False)
4386 filesystem.add_option('--no-mtime',
4387 action='store_false', dest='updatetime',
4388 help='do not use the Last-modified header to set the file modification time', default=True)
4389 filesystem.add_option('--write-description',
4390 action='store_true', dest='writedescription',
4391 help='write video description to a .description file', default=False)
4392 filesystem.add_option('--write-info-json',
4393 action='store_true', dest='writeinfojson',
4394 help='write video metadata to a .info.json file', default=False)
4395
4396
4397 postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
4398 help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
4399 postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
4400 help='"best", "aac", "vorbis", "mp3", "m4a", or "wav"; best by default')
4401 postproc.add_option('--audio-quality', metavar='QUALITY', dest='audioquality', default='128K',
4402 help='ffmpeg audio bitrate specification, 128k by default')
4403 postproc.add_option('-k', '--keep-video', action='store_true', dest='keepvideo', default=False,
4404 help='keeps the video file on disk after the post-processing; the video is erased by default')
4405
4406
4407 parser.add_option_group(general)
4408 parser.add_option_group(selection)
4409 parser.add_option_group(filesystem)
4410 parser.add_option_group(verbosity)
4411 parser.add_option_group(video_format)
4412 parser.add_option_group(authentication)
4413 parser.add_option_group(postproc)
4414
4415 xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
4416 if xdg_config_home:
4417 userConf = os.path.join(xdg_config_home, 'youtube-dl.conf')
4418 else:
4419 userConf = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
4420 argv = _readOptions('/etc/youtube-dl.conf') + _readOptions(userConf) + sys.argv[1:]
4421 opts, args = parser.parse_args(argv)
4422
4423 return parser, opts, args
4424
def gen_extractors():
    """Build and return one instance of every supported extractor.

    The position of each entry is significant: the first extractor in the
    list that matches a URL is the one that ends up handling it.
    """
    # These three instances are created once and shared with the wrapper
    # extractors (playlist / user / search) that receive them as an argument.
    yt_ie = YoutubeIE()
    goog_ie = GoogleIE()
    yah_ie = YahooIE()

    extractors = [
        YoutubePlaylistIE(yt_ie),
        YoutubeUserIE(yt_ie),
        YoutubeSearchIE(yt_ie),
        yt_ie,
        MetacafeIE(yt_ie),
        DailymotionIE(),
        goog_ie,
        GoogleSearchIE(goog_ie),
        PhotobucketIE(),
        yah_ie,
        YahooSearchIE(yah_ie),
        DepositFilesIE(),
        FacebookIE(),
        BlipTVIE(),
        VimeoIE(),
        MyVideoIE(),
        ComedyCentralIE(),
        EscapistIE(),
        CollegeHumorIE(),
        XVideosIE(),
        SoundcloudIE(),
        InfoQIE(),
        MixcloudIE(),
        StanfordOpenClassroomIE(),
        MTVIE(),
    ]

    # NOTE(review): GenericIE is kept as the very last entry; presumably it is
    # the catch-all fallback -- confirm before reordering.
    extractors.append(GenericIE())
    return extractors
4461
def _real_main():
    """Run the command-line program from option parsing to process exit.

    Steps, in order:
      1. Parse options (parseOpts) and open the cookie jar.
      2. Handle --dump-user-agent style early exit (opts.dump_user_agent).
      3. Read URLs from the batch file, if any, and append the positional
         arguments.
      4. Install a global urllib2 opener (proxy + cookies + YoutubeDLHandler)
         and a default socket timeout.
      5. Optionally list extractors (and which given URLs each one claims).
      6. Validate / normalise options, reporting problems via parser.error().
      7. Build the FileDownloader, register extractors and post-processors.
      8. Optionally self-update, download every URL, save the cookie jar.

    The function never returns normally: it ends in sys.exit() or
    parser.error(), or lets an exception (e.g. raised by fd.download)
    propagate to the caller.
    """
    parser, opts, args = parseOpts()

    # Open appropriate CookieJar
    if opts.cookiefile is None:
        jar = cookielib.CookieJar()
    else:
        try:
            jar = cookielib.MozillaCookieJar(opts.cookiefile)
            # Only load when the file already exists and is readable; a
            # missing file is fine, it gets created by jar.save() at the end.
            if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
                jar.load()
        except (IOError, OSError):
            sys.exit(u'ERROR: unable to open cookie file')

    # Dump user agent
    if opts.dump_user_agent:
        print(std_headers['User-Agent'])
        sys.exit(0)

    # Batch file verification
    batchurls = []
    if opts.batchfile is not None:
        try:
            if opts.batchfile == '-':
                batchfd = sys.stdin
            else:
                batchfd = open(opts.batchfile, 'r')
            # Nested try/finally (rather than `with`) keeps the syntax usable
            # on the old interpreters this file still tries to tolerate.
            try:
                batchurls = batchfd.readlines()
            finally:
                # Release the descriptor we opened, but never close stdin.
                if batchfd is not sys.stdin:
                    batchfd.close()
            batchurls = [x.strip() for x in batchurls]
            # Drop blank lines and lines starting with '#', '/' or ';'.
            batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
        except IOError:
            sys.exit(u'ERROR: batch file could not be read')
    all_urls = batchurls + args

    # General configuration
    cookie_processor = urllib2.HTTPCookieProcessor(jar)
    proxy_handler = urllib2.ProxyHandler()
    opener = urllib2.build_opener(proxy_handler, cookie_processor, YoutubeDLHandler())
    urllib2.install_opener(opener)
    socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

    if opts.verbose:
        print(u'[debug] Proxy map: ' + str(proxy_handler.proxies))

    extractors = gen_extractors()

    if opts.list_extractors:
        for ie in extractors:
            print(ie.IE_NAME)
            # Each URL is shown under the first extractor that accepts it and
            # is then removed so later extractors do not list it again.
            matchedUrls = [url for url in all_urls if ie.suitable(url)]
            all_urls = [url for url in all_urls if url not in matchedUrls]
            for mu in matchedUrls:
                print(u'  ' + mu)
        sys.exit(0)

    # Conflicting, missing and erroneous options
    if opts.usenetrc and (opts.username is not None or opts.password is not None):
        parser.error(u'using .netrc conflicts with giving username/password')
    if opts.password is not None and opts.username is None:
        parser.error(u'account username missing')
    if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
        parser.error(u'using output template conflicts with using title, literal title or auto number')
    if opts.usetitle and opts.useliteral:
        parser.error(u'using title conflicts with using literal title')
    if opts.username is not None and opts.password is None:
        # Username without password: ask interactively.
        opts.password = getpass.getpass(u'Type account password and press return:')
    if opts.ratelimit is not None:
        numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
        if numeric_limit is None:
            parser.error(u'invalid rate limit specified')
        opts.ratelimit = numeric_limit
    if opts.retries is not None:
        try:
            opts.retries = long(opts.retries)
        except (TypeError, ValueError):
            parser.error(u'invalid retry count specified')
    try:
        opts.playliststart = int(opts.playliststart)
        if opts.playliststart <= 0:
            raise ValueError(u'Playlist start must be positive')
    except (TypeError, ValueError):
        parser.error(u'invalid playlist start number specified')
    try:
        opts.playlistend = int(opts.playlistend)
        # -1 is accepted as a special value and skips the range check.
        if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
            raise ValueError(u'Playlist end must be greater than playlist start')
    except (TypeError, ValueError):
        parser.error(u'invalid playlist end number specified')
    if opts.extractaudio:
        if opts.audioformat not in ['best', 'aac', 'mp3', 'vorbis', 'm4a', 'wav']:
            parser.error(u'invalid audio format specified')

    # File downloader
    fd = FileDownloader({
        'usenetrc': opts.usenetrc,
        'username': opts.username,
        'password': opts.password,
        # Any of the --get-* options implies quiet output.
        'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
        'forceurl': opts.geturl,
        'forcetitle': opts.gettitle,
        'forcethumbnail': opts.getthumbnail,
        'forcedescription': opts.getdescription,
        'forcefilename': opts.getfilename,
        'forceformat': opts.getformat,
        'simulate': opts.simulate,
        # Simulation and every --get-* option imply that nothing is downloaded.
        'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
        'format': opts.format,
        'format_limit': opts.format_limit,
        'listformats': opts.listformats,
        # First truthy alternative wins: an explicit (non-empty) template,
        # then the format == '-1' variants, then title/literal/autonumber
        # combinations, and finally the plain id-based default.
        'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
            or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
            or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
            or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
            or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
            or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
            or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
            or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
            or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
            or u'%(id)s.%(ext)s'),
        'ignoreerrors': opts.ignoreerrors,
        'ratelimit': opts.ratelimit,
        'nooverwrites': opts.nooverwrites,
        'retries': opts.retries,
        'continuedl': opts.continue_dl,
        'noprogress': opts.noprogress,
        'playliststart': opts.playliststart,
        'playlistend': opts.playlistend,
        # An output template of '-' switches logging to stderr.
        'logtostderr': opts.outtmpl == '-',
        'consoletitle': opts.consoletitle,
        'nopart': opts.nopart,
        'updatetime': opts.updatetime,
        'writedescription': opts.writedescription,
        'writeinfojson': opts.writeinfojson,
        'matchtitle': opts.matchtitle,
        'rejecttitle': opts.rejecttitle,
        'max_downloads': opts.max_downloads,
        'prefer_free_formats': opts.prefer_free_formats,
        'verbose': opts.verbose,
        })
    for extractor in extractors:
        fd.add_info_extractor(extractor)

    # PostProcessors
    if opts.extractaudio:
        fd.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, keepvideo=opts.keepvideo))

    # Update version
    if opts.update_self:
        updateSelf(fd, sys.argv[0])

    # Maybe do nothing
    if len(all_urls) < 1:
        if not opts.update_self:
            parser.error(u'you must provide at least one URL')
        else:
            # A bare self-update run is a valid invocation without URLs.
            sys.exit()

    try:
        retcode = fd.download(all_urls)
    except MaxDownloadsReached:
        fd.to_screen(u'--max-download limit reached, aborting.')
        retcode = 101

    # Dump cookie jar if requested
    if opts.cookiefile is not None:
        try:
            jar.save()
        except (IOError, OSError):
            sys.exit(u'ERROR: unable to save cookie jar')

    sys.exit(retcode)
4633
def main():
    """Program entry point.

    Delegates to _real_main() and turns the three exceptions handled here
    into a process exit: a bare status 1 for DownloadError, and an error
    message passed to sys.exit() for SameFileError and KeyboardInterrupt.
    """
    try:
        _real_main()
    except DownloadError:
        # No message is passed here, only the exit status.
        sys.exit(1)
    except SameFileError:
        sys.exit(u'ERROR: fixed output name but more than one file to download')
    except KeyboardInterrupt:
        # Leading newline so the message starts on a fresh line.
        sys.exit(u'\nERROR: Interrupted by user')
4643
# Run the program only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
4646
4647 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: