Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/YoutubeDL.py
Imported Upstream version 2013.06.33
[youtubedl] / youtube_dl / YoutubeDL.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import io
7 import os
8 import re
9 import shutil
10 import socket
11 import sys
12 import time
13 import traceback
14
15 from .utils import *
16 from .extractor import get_info_extractor
17 from .FileDownloader import FileDownloader
18
19
class YoutubeDL(object):
    """YoutubeDL class.

    YoutubeDL objects are the ones responsible for downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, a task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, YoutubeDL objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the YoutubeDL object hands it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    YoutubeDL processes the extracted information, possibly using a File
    Downloader to download the video.

    YoutubeDL objects accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The YoutubeDL also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:          Username for authentication purposes.
    password:          Password for authentication purposes.
    videopassword:     Password for accessing a video.
    usenetrc:          Use netrc for authentication instead.
    verbose:           Print additional info to stdout.
    quiet:             Do not print messages to stdout.
    forceurl:          Force printing final URL.
    forcetitle:        Force printing title.
    forceid:           Force printing ID.
    forcethumbnail:    Force printing thumbnail URL.
    forcedescription:  Force printing description.
    forcefilename:     Force printing final filename.
    forceformat:       Force printing the format.
    simulate:          Do not download the video files.
    format:            Video format code.
    format_limit:      Highest quality format to try.
    outtmpl:           Template for output names.
    autonumber_size:   Number of digits of %(autonumber)s (5 if unset).
    restrictfilenames: Do not allow "&" and spaces in file names
    ignoreerrors:      Do not stop on download errors.
    nooverwrites:      Prevent overwriting files.
    max_downloads:     Stop once more than this many videos were processed.
    playliststart:     Playlist item to start at.
    playlistend:       Playlist item to end at.
    matchtitle:        Download only matching titles.
    rejecttitle:       Reject downloads for matching titles.
    logtostderr:       Log messages to stderr instead of stdout.
    writedescription:  Write the video description to a .description file
    writeinfojson:     Write the video metadata to a .info.json file
    writethumbnail:    Write the thumbnail image to a file
    writesubtitles:    Write the video subtitles to a file
    allsubtitles:      Downloads all the subtitles of the video
    listsubtitles:     Lists all available subtitles for the video
    subtitlesformat:   Subtitle format [sbv/srt] (default=srt)
    subtitleslang:     Language of the subtitles to download
    keepvideo:         Keep the video file after post-processing
    daterange:         A DateRange object, download only if the upload_date is in the range.
    skip_download:     Skip the actual download of the video file

    The following parameters are not used by YoutubeDL itself, they are used by
    the FileDownloader:
    nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
    noresizebuffer, retries, continuedl, noprogress, consoletitle
    """

    # Class-level placeholders; every one of them is rebound per instance
    # in __init__, so the list objects below are never shared in practice.
    params = None
    _ies = []
    _pps = []
    _download_retcode = None
    _num_downloads = None
    _screen_file = None

    def __init__(self, params):
        """Create a YoutubeDL object with the given options.

        params is the options dictionary described in the class docstring;
        it must contain at least 'outtmpl'.
        """
        self._ies = []
        self._pps = []
        self._progress_hooks = []
        self._download_retcode = 0
        self._num_downloads = 0
        # Regular messages go to stderr only when 'logtostderr' is set.
        self._screen_file = sys.stderr if params.get('logtostderr', False) else sys.stdout
        self.params = params
        self.fd = FileDownloader(self, self.params)

        if '%(stitle)s' in self.params['outtmpl']:
            self.report_warning(u'%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.')

    def add_info_extractor(self, ie):
        """Add an InfoExtractor object to the end of the list."""
        self._ies.append(ie)
        ie.set_downloader(self)

    def add_post_processor(self, pp):
        """Add a PostProcessor object to the end of the chain."""
        self._pps.append(pp)
        pp.set_downloader(self)

    def to_screen(self, message, skip_eol=False):
        """Print message to the screen file if not in quiet mode.

        message must be a unicode string; a newline is appended unless
        skip_eol is true.
        """
        assert type(message) == type(u'')
        if self.params.get('quiet', False):
            return
        output = message if skip_eol else message + u'\n'
        # Python 2 lies about the mode of sys.stdout/sys.stderr
        if 'b' in getattr(self._screen_file, 'mode', '') or sys.version_info[0] < 3:
            output = output.encode(preferredencoding(), 'ignore')
        self._screen_file.write(output)
        self._screen_file.flush()

    def to_stderr(self, message):
        """Print message (a unicode string) to stderr, followed by a newline."""
        assert type(message) == type(u'')
        output = message + u'\n'
        # The stream written to is always sys.stderr, so it is the mode of
        # sys.stderr (not of the screen file) that decides whether to encode.
        # Python 2 lies about the mode of sys.stdout/sys.stderr
        if 'b' in getattr(sys.stderr, 'mode', '') or sys.version_info[0] < 3:
            output = output.encode(preferredencoding())
        sys.stderr.write(output)

    def fixed_template(self):
        """Checks if the output template is fixed (has no %(...)s field)."""
        return re.search(u'(?u)%\\(.+?\\)s', self.params['outtmpl']) is None

    def trouble(self, message=None, tb=None):
        """Determine action to take when a download problem appears.

        Depending on if the downloader has been configured to ignore
        download errors or not, this method may throw an exception or
        not when errors are found, after printing the message.

        tb, if given, is additional traceback information.

        Raises DownloadError unless the 'ignoreerrors' option is set, in
        which case the download return code is set to 1 instead.
        """
        if message is not None:
            self.to_stderr(message)

        active = sys.exc_info()
        # Some exceptions carry the exc_info of the error they wrap.
        wrapped = getattr(active[1], 'exc_info', None) if active[0] else None
        if not (wrapped and wrapped[0]):
            wrapped = None

        if self.params.get('verbose'):
            if tb is None:
                if active[0]:  # if .trouble has been called from an except block
                    tb = u''
                    if wrapped is not None:
                        tb += u''.join(traceback.format_exception(*wrapped))
                    tb += compat_str(traceback.format_exc())
                else:
                    tb = u''.join(traceback.format_list(traceback.extract_stack()))
            self.to_stderr(tb)

        if not self.params.get('ignoreerrors', False):
            raise DownloadError(message, wrapped if wrapped is not None else active)
        self._download_retcode = 1

    def report_warning(self, message):
        '''
        Print the message to stderr, it will be prefixed with 'WARNING:'
        If stderr is a tty file the 'WARNING:' will be colored
        '''
        if sys.stderr.isatty() and os.name != 'nt':
            _msg_header = u'\033[0;33mWARNING:\033[0m'
        else:
            _msg_header = u'WARNING:'
        warning_message = u'%s %s' % (_msg_header, message)
        self.to_stderr(warning_message)

    def report_error(self, message, tb=None):
        '''
        Do the same as trouble, but prefixes the message with 'ERROR:', colored
        in red if stderr is a tty file.
        '''
        if sys.stderr.isatty() and os.name != 'nt':
            _msg_header = u'\033[0;31mERROR:\033[0m'
        else:
            _msg_header = u'ERROR:'
        error_message = u'%s %s' % (_msg_header, message)
        self.trouble(error_message, tb)

    def slow_down(self, start_time, byte_counter):
        """Sleep if the download speed is over the rate limit.

        start_time is the time.time() value at which the transfer began and
        byte_counter the number of bytes received since then.
        """
        rate_limit = self.params.get('ratelimit', None)
        if rate_limit is None or byte_counter == 0:
            return
        elapsed = time.time() - start_time
        if elapsed <= 0.0:
            return
        if float(byte_counter) / elapsed > rate_limit:
            # Sleep for as long as the transfer is ahead of the allowed rate.
            time.sleep((byte_counter - rate_limit * elapsed) / rate_limit)

    def report_writedescription(self, descfn):
        """ Report that the description file is being written """
        self.to_screen(u'[info] Writing video description to: ' + descfn)

    def report_writesubtitles(self, sub_filename):
        """ Report that the subtitles file is being written """
        self.to_screen(u'[info] Writing video subtitles to: ' + sub_filename)

    def report_writeinfojson(self, infofn):
        """ Report that the metadata file is being written """
        self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)

    def report_file_already_downloaded(self, file_name):
        """Report file has already been fully downloaded."""
        try:
            self.to_screen(u'[download] %s has already been downloaded' % file_name)
        except UnicodeEncodeError:
            # Fall back to a message that does not contain the file name.
            self.to_screen(u'[download] The file has already been downloaded')

    def increment_downloads(self):
        """Increment the ordinal that assigns a number to each file."""
        self._num_downloads += 1

    def prepare_filename(self, info_dict):
        """Generate the output filename.

        Returns the 'outtmpl' template filled in with the sanitized values
        of info_dict, or None (after reporting an error) if the template
        cannot be expanded.
        """
        try:
            template_dict = dict(info_dict)

            template_dict['epoch'] = int(time.time())
            autonumber_size = self.params.get('autonumber_size')
            if autonumber_size is None:
                autonumber_size = 5
            autonumber_templ = u'%0' + str(autonumber_size) + u'd'
            template_dict['autonumber'] = autonumber_templ % self._num_downloads
            # .get(): a missing playlist_index must not be misreported as an
            # erroneous template when the template does not even use it.
            if template_dict.get('playlist_index') is not None:
                template_dict['playlist_index'] = u'%05d' % template_dict['playlist_index']

            sanitize = lambda k, v: sanitize_filename(
                u'NA' if v is None else compat_str(v),
                restricted=self.params.get('restrictfilenames'),
                is_id=(k == u'id'))
            template_dict = dict((k, sanitize(k, v)) for k, v in template_dict.items())

            return self.params['outtmpl'] % template_dict
        except KeyError:
            self.report_error(u'Erroneous output template')
            return None
        except ValueError:
            self.report_error(u'Insufficient system charset ' + repr(preferredencoding()))
            return None

    def _match_entry(self, info_dict):
        """ Returns None iff the file should be downloaded

        Otherwise returns the reason for skipping it, without any
        '[download] ' prefix (the caller adds that when printing).
        """
        title = info_dict['title']
        matchtitle = self.params.get('matchtitle', False)
        if matchtitle:
            if not re.search(matchtitle, title, re.IGNORECASE):
                return u'"' + title + '" title did not match pattern "' + matchtitle + '"'
        rejecttitle = self.params.get('rejecttitle', False)
        if rejecttitle:
            if re.search(rejecttitle, title, re.IGNORECASE):
                return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
        date = info_dict.get('upload_date', None)
        if date is not None:
            dateRange = self.params.get('daterange', DateRange())
            if date not in dateRange:
                return u'%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
        return None

    def extract_info(self, url, download=True, ie_key=None, extra_info=None):
        '''
        Returns the processed result of the first suitable InfoExtractor
        (see process_ie_result), or None if extraction failed or finished
        without a result.
        If 'download', also downloads the videos.
        extra_info is a dict containing the extra values to add to each result
        '''
        if extra_info is None:
            extra_info = {}

        if ie_key:
            ie = get_info_extractor(ie_key)()
            ie.set_downloader(self)
            ies = [ie]
        else:
            ies = self._ies

        for ie in ies:
            if not ie.suitable(url):
                continue

            if not ie.working():
                self.report_warning(u'The program functionality for this site has been marked as broken, '
                                    u'and will probably not work.')

            try:
                ie_result = ie.extract(url)
                if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
                    break
                if isinstance(ie_result, list):
                    # Backwards compatibility: old IE result format
                    for result in ie_result:
                        result.update(extra_info)
                    ie_result = {
                        '_type': 'compat_list',
                        'entries': ie_result,
                    }
                else:
                    ie_result.update(extra_info)
                if 'extractor' not in ie_result:
                    ie_result['extractor'] = ie.IE_NAME
                return self.process_ie_result(ie_result, download=download)
            except ExtractorError as de:  # An error we somewhat expected
                self.report_error(compat_str(de), de.format_traceback())
                break
            except Exception as e:
                if self.params.get('ignoreerrors', False):
                    self.report_error(compat_str(e), tb=compat_str(traceback.format_exc()))
                    break
                else:
                    raise
        else:
            # The loop ended without any extractor accepting the URL.
            self.report_error(u'no suitable InfoExtractor: %s' % url)

    def process_ie_result(self, ie_result, download=True, extra_info=None):
        """
        Take the result of the ie(may be modified) and resolve all unresolved
        references (URLs, playlist items).

        extra_info is a dict of extra values (e.g. playlist name and index)
        to add to the video results.

        It will also download the videos if 'download'.
        Returns the resolved ie_result.
        """
        if extra_info is None:
            extra_info = {}

        result_type = ie_result.get('_type', 'video')  # If not given we suppose it's a video, support the default old system
        if result_type == 'video':
            # Entries of a playlist that are already videos receive their
            # playlist name and index through extra_info.
            ie_result.update(extra_info)
            if 'playlist' not in ie_result:
                # It isn't part of a playlist
                ie_result['playlist'] = None
                ie_result['playlist_index'] = None
            if download:
                self.process_info(ie_result)
            return ie_result
        elif result_type == 'url':
            # We have to add extra_info to the results because it may be
            # contained in a playlist
            return self.extract_info(ie_result['url'],
                                     download,
                                     ie_key=ie_result.get('ie_key'),
                                     extra_info=extra_info)
        elif result_type == 'playlist':
            # We process each entry in the playlist
            playlist = ie_result.get('title', None) or ie_result.get('id', None)
            self.to_screen(u'[download] Downloading playlist: %s' % playlist)

            playlist_results = []

            n_all_entries = len(ie_result['entries'])
            # 'playliststart' is 1-based, slices are 0-based.
            playliststart = self.params.get('playliststart', 1) - 1
            playlistend = self.params.get('playlistend', -1)

            if playlistend == -1:
                entries = ie_result['entries'][playliststart:]
            else:
                entries = ie_result['entries'][playliststart:playlistend]

            n_entries = len(entries)

            self.to_screen(u"[%s] playlist '%s': Collected %d video ids (downloading %d of them)" %
                           (ie_result['extractor'], playlist, n_all_entries, n_entries))

            for i, entry in enumerate(entries, 1):
                self.to_screen(u'[download] Downloading video #%s of %s' % (i, n_entries))
                extra = {
                    'playlist': playlist,
                    'playlist_index': i + playliststart,
                }
                if 'extractor' not in entry:
                    # We set the extractor, if it's an url it will be set then to
                    # the new extractor, but if it's already a video we must make
                    # sure it's present: see issue #877
                    entry['extractor'] = ie_result['extractor']
                entry_result = self.process_ie_result(entry,
                                                      download=download,
                                                      extra_info=extra)
                playlist_results.append(entry_result)
            ie_result['entries'] = playlist_results
            return ie_result
        elif result_type == 'compat_list':
            def _fixup(r):
                r.setdefault('extractor', ie_result['extractor'])
                return r
            ie_result['entries'] = [
                self.process_ie_result(_fixup(r), download=download)
                for r in ie_result['entries']
            ]
            return ie_result
        else:
            raise Exception('Invalid result type: %s' % result_type)

    def _write_subtitle(self, filename, subtitle, sub_format):
        """Write one (error, language, data) subtitle tuple next to filename.

        Returns False only if writing the file failed (after reporting the
        error); a subtitle flagged with an error is skipped with a warning.
        """
        (sub_error, sub_lang, sub) = subtitle
        if sub_error:
            self.report_warning("Some error while getting the subtitles")
            return True
        sub_filename = filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
        try:
            self.report_writesubtitles(sub_filename)
            with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
                subfile.write(sub)
        except (OSError, IOError):
            self.report_error(u'Cannot write subtitles file ' + sub_filename)
            return False
        return True

    def process_info(self, info_dict):
        """Process a single resolved IE result.

        Applies the title/date filters, performs the forced printings,
        writes the requested side files (description, subtitles, JSON
        metadata, thumbnail), downloads the video and runs the
        postprocessors. Returns None in every case.

        Raises MaxDownloadsReached when the 'max_downloads' option has been
        exceeded and UnavailableVideoError when the download fails with an
        OSError/IOError.
        """
        assert info_dict.get('_type', 'video') == 'video'
        # We increment the download count here to match the previous behaviour.
        self.increment_downloads()

        info_dict['fulltitle'] = info_dict['title']
        if len(info_dict['title']) > 200:
            info_dict['title'] = info_dict['title'][:197] + u'...'

        # Keep for backwards compatibility
        info_dict['stitle'] = info_dict['title']

        if 'format' not in info_dict:
            info_dict['format'] = info_dict['ext']

        reason = self._match_entry(info_dict)
        if reason is not None:
            self.to_screen(u'[download] ' + reason)
            return

        max_downloads = self.params.get('max_downloads')
        if max_downloads is not None:
            if self._num_downloads > int(max_downloads):
                raise MaxDownloadsReached()

        filename = self.prepare_filename(info_dict)

        # Forced printings
        if self.params.get('forcetitle', False):
            compat_print(info_dict['title'])
        if self.params.get('forceid', False):
            compat_print(info_dict['id'])
        if self.params.get('forceurl', False):
            compat_print(info_dict['url'])
        if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
            compat_print(info_dict['thumbnail'])
        if self.params.get('forcedescription', False) and 'description' in info_dict:
            compat_print(info_dict['description'])
        if self.params.get('forcefilename', False) and filename is not None:
            compat_print(filename)
        if self.params.get('forceformat', False):
            compat_print(info_dict['format'])

        # Do nothing else if in simulate mode
        if self.params.get('simulate', False):
            return

        if filename is None:
            return

        try:
            dn = os.path.dirname(encodeFilename(filename))
            if dn != '' and not os.path.exists(dn):
                os.makedirs(dn)
        except (OSError, IOError) as err:
            self.report_error(u'unable to create directory ' + compat_str(err))
            return

        if self.params.get('writedescription', False):
            descfn = filename + u'.description'
            try:
                self.report_writedescription(descfn)
                with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
                    descfile.write(info_dict['description'])
            except (OSError, IOError):
                self.report_error(u'Cannot write description file ' + descfn)
                return

        # subtitles download errors are already managed as troubles in relevant IE
        # that way it will silently go on when used with unsupporting IE
        if info_dict.get('subtitles'):
            if self.params.get('writesubtitles', False):
                sub_format = self.params.get('subtitlesformat')
                if not self._write_subtitle(filename, info_dict['subtitles'][0], sub_format):
                    return

            if self.params.get('allsubtitles', False):
                sub_format = self.params.get('subtitlesformat')
                for subtitle in info_dict['subtitles']:
                    if not self._write_subtitle(filename, subtitle, sub_format):
                        return

        if self.params.get('writeinfojson', False):
            infofn = filename + u'.info.json'
            self.report_writeinfojson(infofn)
            try:
                # 'urlhandle' is left out of the metadata written to disk.
                json_info_dict = dict((k, v) for k, v in info_dict.items() if k not in ['urlhandle'])
                write_json_file(json_info_dict, encodeFilename(infofn))
            except (OSError, IOError):
                self.report_error(u'Cannot write metadata to JSON file ' + infofn)
                return

        if self.params.get('writethumbnail', False):
            if 'thumbnail' in info_dict:
                # Take the extension from the last path component of the URL.
                thumb_format = info_dict['thumbnail'].rpartition(u'/')[2].rpartition(u'.')[2]
                if not thumb_format:
                    thumb_format = 'jpg'
                thumb_filename = filename.rpartition('.')[0] + u'.' + thumb_format
                self.to_screen(u'[%s] %s: Downloading thumbnail ...' %
                               (info_dict['extractor'], info_dict['id']))
                uf = compat_urllib_request.urlopen(info_dict['thumbnail'])
                try:
                    with open(thumb_filename, 'wb') as thumbf:
                        shutil.copyfileobj(uf, thumbf)
                finally:
                    # Do not leak the connection, whatever happens while copying.
                    uf.close()
                self.to_screen(u'[%s] %s: Writing thumbnail to: %s' %
                               (info_dict['extractor'], info_dict['id'], thumb_filename))

        if not self.params.get('skip_download', False):
            if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(filename)):
                # The existing file is treated as a successful download.
                success = True
            else:
                try:
                    success = self.fd._do_download(filename, info_dict)
                except (OSError, IOError):
                    raise UnavailableVideoError()
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    self.report_error(u'unable to download video data: %s' % str(err))
                    return
                except (ContentTooShortError, ) as err:
                    self.report_error(u'content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                    return

            if success:
                try:
                    self.post_process(filename, info_dict)
                except (PostProcessingError) as err:
                    self.report_error(u'postprocessing: %s' % str(err))
                    return

    def download(self, url_list):
        """Download a given list of URLs.

        Returns the download return code. Raises SameFileError if several
        URLs would be written to one fixed output file, and re-raises
        MaxDownloadsReached after reporting it.
        """
        if len(url_list) > 1 and self.fixed_template():
            raise SameFileError(self.params['outtmpl'])

        for url in url_list:
            try:
                # It also downloads the videos
                self.extract_info(url)
            except UnavailableVideoError:
                self.report_error(u'unable to download video')
            except MaxDownloadsReached:
                self.to_screen(u'[info] Maximum number of downloaded files reached.')
                raise

        return self._download_retcode

    def post_process(self, filename, ie_info):
        """Run all the postprocessors on the given file.

        Afterwards the original file is deleted if the postprocessors asked
        for it and the 'keepvideo' option is not set.
        """
        info = dict(ie_info)
        info['filepath'] = filename
        keep_video = None
        for pp in self._pps:
            try:
                keep_video_wish, new_info = pp.run(info)
                if keep_video_wish is not None:
                    if keep_video_wish:
                        # A wish to keep the file always wins.
                        keep_video = keep_video_wish
                    elif keep_video is None:
                        # No clear decision yet, take this postprocessor's wish
                        keep_video = keep_video_wish
            except PostProcessingError as e:
                self.to_stderr(u'ERROR: ' + e.msg)
        if keep_video is False and not self.params.get('keepvideo', False):
            try:
                self.to_screen(u'Deleting original file %s (pass -k to keep)' % filename)
                os.remove(encodeFilename(filename))
            except (IOError, OSError):
                self.report_warning(u'Unable to remove downloaded video file')