Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/YoutubeDL.py
Imported Upstream version 2013.06.34
[youtubedl] / youtube_dl / YoutubeDL.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import io
7 import os
8 import re
9 import shutil
10 import socket
11 import sys
12 import time
13 import traceback
14
15 from .utils import *
16 from .extractor import get_info_extractor
17 from .FileDownloader import FileDownloader
18
19
class YoutubeDL(object):
    """YoutubeDL class.

    YoutubeDL objects are the ones responsible for downloading the
    actual video file and writing it to disk if the user has requested
    it, among some other tasks. In most cases there should be one per
    program. As, given a video URL, the downloader doesn't know how to
    extract all the needed information, task that InfoExtractors do, it
    has to pass the URL to one of them.

    For this, YoutubeDL objects have a method that allows
    InfoExtractors to be registered in a given order. When it is passed
    a URL, the YoutubeDL object hands it to the first InfoExtractor it
    finds that reports being able to handle it. The InfoExtractor extracts
    all the information about the video or videos the URL refers to, and
    YoutubeDL processes the extracted information, possibly using a File
    Downloader to download the video.

    YoutubeDL objects accept a lot of parameters. In order not to saturate
    the object constructor with arguments, it receives a dictionary of
    options instead. These options are available through the params
    attribute for the InfoExtractors to use. The YoutubeDL also
    registers itself as the downloader in charge for the InfoExtractors
    that are added to it, so this is a "mutual registration".

    Available options:

    username:          Username for authentication purposes.
    password:          Password for authentication purposes.
    videopassword:     Password for accessing a video.
    usenetrc:          Use netrc for authentication instead.
    verbose:           Print additional info to stdout.
    quiet:             Do not print messages to stdout.
    forceurl:          Force printing final URL.
    forcetitle:        Force printing title.
    forceid:           Force printing ID.
    forcethumbnail:    Force printing thumbnail URL.
    forcedescription:  Force printing description.
    forcefilename:     Force printing final filename.
    forceformat:       Force printing format.
    simulate:          Do not download the video files.
    format:            Video format code.
    format_limit:      Highest quality format to try.
    outtmpl:           Template for output names.
    autonumber_size:   Number of digits used for %(autonumber)s (default 5).
    restrictfilenames: Do not allow "&" and spaces in file names
    ignoreerrors:      Do not stop on download errors.
    nooverwrites:      Prevent overwriting files.
    max_downloads:     Number of downloads after which MaxDownloadsReached
                       is raised.
    playliststart:     Playlist item to start at.
    playlistend:       Playlist item to end at.
    matchtitle:        Download only matching titles.
    rejecttitle:       Reject downloads for matching titles.
    logtostderr:       Log messages to stderr instead of stdout.
    writedescription:  Write the video description to a .description file
    writeinfojson:     Write the video metadata to a .info.json file
    writethumbnail:    Write the thumbnail image to a file
    writesubtitles:    Write the video subtitles to a file
    writeautomaticsub: Write the automatic subtitles to a file
    allsubtitles:      Downloads all the subtitles of the video
    listsubtitles:     Lists all available subtitles for the video
    subtitlesformat:   Subtitle format [srt/sbv/vtt] (default=srt)
    subtitleslang:     Language of the subtitles to download
    keepvideo:         Keep the video file after post-processing
    daterange:         A DateRange object, download only if the upload_date is in the range.
    skip_download:     Skip the actual download of the video file

    The following parameters are not used by YoutubeDL itself, they are used by
    the FileDownloader:
    nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
    noresizebuffer, retries, continuedl, noprogress, consoletitle
    """

    # Class-level placeholders only: __init__ rebinds every one of these on
    # the instance, so the two lists below are not shared between instances
    # that were constructed normally.
    params = None
    _ies = []
    _pps = []
    _download_retcode = None
    _num_downloads = None
    _screen_file = None
96
def __init__(self, params):
    """Create a YoutubeDL object with the given options.

    params is the options dictionary (see the class docstring). It is
    stored as-is, not copied, and is also handed to the FileDownloader
    created here.
    """
    self._ies = []
    self._pps = []
    self._progress_hooks = []
    self._download_retcode = 0
    self._num_downloads = 0
    # Any truthy 'logtostderr' selects stderr. Indexing a two-item list with
    # the raw option value would break for None or integers other than 0/1.
    if params.get('logtostderr', False):
        self._screen_file = sys.stderr
    else:
        self._screen_file = sys.stdout
    self.params = params
    self.fd = FileDownloader(self, self.params)

    # 'outtmpl' may be missing (or None) when the object is only used to
    # extract information, so do not index the dictionary directly.
    if '%(stitle)s' in (self.params.get('outtmpl') or u''):
        self.report_warning(u'%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.')
110
def add_info_extractor(self, ie):
    """Register ie as the last InfoExtractor and set this object as its downloader."""
    extractors = self._ies
    extractors.append(ie)
    ie.set_downloader(self)
115
def add_post_processor(self, pp):
    """Put pp at the end of the PostProcessor chain and set this object as its downloader."""
    chain = self._pps
    chain.append(pp)
    pp.set_downloader(self)
120
def to_screen(self, message, skip_eol=False):
    """Write message to the screen file unless the 'quiet' option is set.

    A newline is appended unless skip_eol is true. The stream is flushed
    after every message.
    """
    assert type(message) == type(u'')
    if self.params.get('quiet', False):
        return

    stream = self._screen_file
    endings = (u'\n', u'')
    text = message + endings[skip_eol]
    # Python 2 lies about the mode of sys.stdout/sys.stderr
    wants_bytes = 'b' in getattr(stream, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        text = text.encode(preferredencoding(), 'ignore')
    stream.write(text)
    stream.flush()
131
def to_stderr(self, message):
    """Print message, followed by a newline, to sys.stderr.

    message must be a unicode string. It is encoded with the preferred
    encoding only when sys.stderr needs bytes.
    """
    assert type(message) == type(u'')
    output = message + u'\n'
    # The decision has to be based on the stream that is actually written to
    # (sys.stderr), not on self._screen_file, which may be another stream.
    # Python 2 lies about the mode of sys.stdout/sys.stderr
    if 'b' in getattr(sys.stderr, 'mode', '') or sys.version_info[0] < 3:
        output = output.encode(preferredencoding())
    sys.stderr.write(output)
139
def fixed_template(self):
    """Return True when the output template contains no %(name)s field."""
    field = re.search(u'(?u)%\\(.+?\\)s', self.params['outtmpl'])
    return field is None
143
def trouble(self, message=None, tb=None):
    """Decide what to do when a download problem shows up.

    The message (if any) is printed to stderr first. In verbose mode a
    traceback is printed as well: tb if given, otherwise one built from
    the exception being handled or, outside an except block, from the
    current stack. Unless 'ignoreerrors' is set a DownloadError is then
    raised; otherwise the return code is set to 1.
    """
    def wrapped_exc_info():
        # exc_info stored on the exception being handled, or None if there
        # is no such exception or it carries no usable exc_info.
        current = sys.exc_info()
        if current[0] and hasattr(current[1], 'exc_info') and current[1].exc_info[0]:
            return current[1].exc_info
        return None

    if message is not None:
        self.to_stderr(message)

    if self.params.get('verbose'):
        if tb is None:
            if sys.exc_info()[0]:  # if .trouble has been called from an except block
                parts = [u'']
                inner = wrapped_exc_info()
                if inner is not None:
                    parts.append(u''.join(traceback.format_exception(*inner)))
                parts.append(compat_str(traceback.format_exc()))
                tb = u''.join(parts)
            else:
                tb = u''.join(traceback.format_list(traceback.extract_stack()))
        self.to_stderr(tb)

    if not self.params.get('ignoreerrors', False):
        exc_info = wrapped_exc_info()
        if exc_info is None:
            exc_info = sys.exc_info()
        raise DownloadError(message, exc_info)

    self._download_retcode = 1
173
def report_warning(self, message):
    '''
    Send the message to stderr with a 'WARNING:' prefix.
    The prefix is colored when stderr is a tty and the OS is not Windows.
    '''
    colored = sys.stderr.isatty() and os.name != 'nt'
    if colored:
        header = u'\033[0;33mWARNING:\033[0m'
    else:
        header = u'WARNING:'
    self.to_stderr(u'%s %s' % (header, message))
185
def report_error(self, message, tb=None):
    '''
    Prefix the message with 'ERROR:' (red when stderr is a tty and the OS
    is not Windows) and pass it, together with tb, on to trouble.
    '''
    colored = sys.stderr.isatty() and os.name != 'nt'
    header = u'\033[0;31mERROR:\033[0m' if colored else u'ERROR:'
    self.trouble(u'%s %s' % (header, message), tb)
197
def slow_down(self, start_time, byte_counter):
    """Pause long enough to bring the average speed down to 'ratelimit'.

    Nothing happens without a rate limit, before any byte was counted,
    or when no time has elapsed since start_time.
    """
    limit = self.params.get('ratelimit', None)
    if limit is None or byte_counter == 0:
        return

    current = time.time()
    elapsed = current - start_time
    if elapsed <= 0.0:
        return

    average = float(byte_counter) / elapsed
    if average > limit:
        # Time still missing for byte_counter bytes at exactly `limit`.
        pause = (byte_counter - limit * (current - start_time)) / limit
        time.sleep(pause)
210
def report_writedescription(self, descfn):
    """ Tell the user which file the description is being written to """
    notice = u'[info] Writing video description to: ' + descfn
    self.to_screen(notice)
214
def report_writesubtitles(self, sub_filename):
    """ Tell the user which file the subtitles are being written to """
    notice = u'[info] Writing video subtitles to: ' + sub_filename
    self.to_screen(notice)
218
def report_writeinfojson(self, infofn):
    """ Report that the metadata file is being written """
    # The message previously lacked its verb ("Video description metadata as
    # JSON to: ..."), unlike the sibling report_write* messages.
    self.to_screen(u'[info] Writing video description metadata as JSON to: ' + infofn)
222
def report_file_already_downloaded(self, file_name):
    """Tell the user that file_name was already downloaded completely.

    Falls back to a message without the name if printing it raises
    UnicodeEncodeError.
    """
    try:
        notice = u'[download] %s has already been downloaded' % file_name
        self.to_screen(notice)
    except UnicodeEncodeError:
        self.to_screen(u'[download] The file has already been downloaded')
229
def increment_downloads(self):
    """Advance the counter that gives every file its ordinal number."""
    self._num_downloads = self._num_downloads + 1
233
def prepare_filename(self, info_dict):
    """Generate the output filename.

    The 'outtmpl' option is filled in with a sanitized copy of info_dict,
    extended with 'epoch' (current time) and 'autonumber' (the download
    counter, zero-padded to 'autonumber_size' digits, 5 by default).

    Returns the filename, or None after reporting an error when the
    template refers to an unknown field (KeyError) or a ValueError is
    raised while building the name.
    """
    try:
        template_dict = dict(info_dict)

        template_dict['epoch'] = int(time.time())
        autonumber_size = self.params.get('autonumber_size')
        if autonumber_size is None:
            autonumber_size = 5
        autonumber_templ = u'%0' + str(autonumber_size) + u'd'
        template_dict['autonumber'] = autonumber_templ % self._num_downloads
        # .get(): a missing 'playlist_index' key must not end up in the
        # KeyError handler below, which would blame the output template.
        if template_dict.get('playlist_index') is not None:
            template_dict['playlist_index'] = u'%05d' % template_dict['playlist_index']

        restricted = self.params.get('restrictfilenames')

        def sanitize(key, value):
            # Missing values are rendered as 'NA'.
            return sanitize_filename(
                u'NA' if value is None else compat_str(value),
                restricted=restricted,
                is_id=(key == u'id'))

        template_dict = dict((k, sanitize(k, v)) for k, v in template_dict.items())

        filename = self.params['outtmpl'] % template_dict
        return filename
    except KeyError:
        self.report_error(u'Erroneous output template')
        return None
    except ValueError:
        self.report_error(u'Insufficient system charset ' + repr(preferredencoding()))
        return None
262
def _match_entry(self, info_dict):
    """ Returns None iff the file should be downloaded

    Otherwise the reason for skipping it is returned as a string. The
    reason carries no '[download] ' prefix: process_info adds that prefix
    when it prints the reason, so including it here printed it twice.

    Checked in order: the 'matchtitle' and 'rejecttitle' patterns
    (case-insensitive search in the title) and, if the entry has an
    'upload_date', the 'daterange' option.
    """

    title = info_dict['title']
    matchtitle = self.params.get('matchtitle', False)
    if matchtitle:
        if not re.search(matchtitle, title, re.IGNORECASE):
            return u'"' + title + '" title did not match pattern "' + matchtitle + '"'
    rejecttitle = self.params.get('rejecttitle', False)
    if rejecttitle:
        if re.search(rejecttitle, title, re.IGNORECASE):
            return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
    date = info_dict.get('upload_date', None)
    if date is not None:
        dateRange = self.params.get('daterange', DateRange())
        if date not in dateRange:
            return u'%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
    return None
281
def extract_info(self, url, download=True, ie_key=None, extra_info=None):
    '''
    Extract the information for url with the first suitable InfoExtractor
    and return what process_ie_result() returns for it.
    If 'download', also downloads the videos.
    ie_key, if given, selects the InfoExtractor to use instead of trying
    the registered ones in order.
    extra_info is a dict containing the extra values to add to each result

    Returns None when the extractor reports it already finished (returns
    None), when an error was reported without raising, or when no
    InfoExtractor is suitable for the URL.
    '''
    # None instead of a shared {} default object.
    if extra_info is None:
        extra_info = {}

    if ie_key:
        ie = get_info_extractor(ie_key)()
        ie.set_downloader(self)
        ies = [ie]
    else:
        ies = self._ies

    for ie in ies:
        if not ie.suitable(url):
            continue

        if not ie.working():
            self.report_warning(u'The program functionality for this site has been marked as broken, '
                                u'and will probably not work.')

        try:
            ie_result = ie.extract(url)
            if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
                break
            if isinstance(ie_result, list):
                # Backwards compatibility: old IE result format
                for result in ie_result:
                    result.update(extra_info)
                ie_result = {
                    '_type': 'compat_list',
                    'entries': ie_result,
                }
            else:
                ie_result.update(extra_info)
            if 'extractor' not in ie_result:
                ie_result['extractor'] = ie.IE_NAME
            return self.process_ie_result(ie_result, download=download)
        except ExtractorError as de:  # An error we somewhat expected
            self.report_error(compat_str(de), de.format_traceback())
            break
        except Exception as e:
            if self.params.get('ignoreerrors', False):
                self.report_error(compat_str(e), tb=compat_str(traceback.format_exc()))
                break
            else:
                raise
    else:
        # The loop ended without a break: no extractor accepted the URL.
        self.report_error(u'no suitable InfoExtractor: %s' % url)
332
def process_ie_result(self, ie_result, download=True, extra_info=None):
    """
    Take the result of the ie(may be modified) and resolve all unresolved
    references (URLs, playlist items).

    It will also download the videos if 'download'.
    extra_info holds values inherited from an enclosing result (the
    playlist title and index); they are added to 'video' results and
    passed along when a 'url' result is extracted.
    Returns the resolved ie_result.

    Raises Exception for an unknown '_type'.
    """
    # None instead of a shared {} default object.
    if extra_info is None:
        extra_info = {}

    result_type = ie_result.get('_type', 'video')  # If not given we suppose it's a video, support the default old system
    if result_type == 'video':
        # Apply the inherited values before the check below; otherwise a
        # video that is a direct playlist entry has its 'playlist' and
        # 'playlist_index' reset to None.
        ie_result.update(extra_info)
        if 'playlist' not in ie_result:
            # It isn't part of a playlist
            ie_result['playlist'] = None
            ie_result['playlist_index'] = None
        if download:
            self.process_info(ie_result)
        return ie_result
    elif result_type == 'url':
        # We have to add extra_info to the results because it may be
        # contained in a playlist
        return self.extract_info(ie_result['url'],
                                 download,
                                 ie_key=ie_result.get('ie_key'),
                                 extra_info=extra_info)
    elif result_type == 'playlist':
        # We process each entry in the playlist
        playlist = ie_result.get('title', None) or ie_result.get('id', None)
        self.to_screen(u'[download] Downloading playlist: %s' % playlist)

        playlist_results = []

        n_all_entries = len(ie_result['entries'])
        # 'playliststart' is 1-based, slicing is 0-based.
        playliststart = self.params.get('playliststart', 1) - 1
        playlistend = self.params.get('playlistend', -1)

        if playlistend == -1:
            entries = ie_result['entries'][playliststart:]
        else:
            entries = ie_result['entries'][playliststart:playlistend]

        n_entries = len(entries)

        self.to_screen(u"[%s] playlist '%s': Collected %d video ids (downloading %d of them)" %
                       (ie_result['extractor'], playlist, n_all_entries, n_entries))

        for i, entry in enumerate(entries, 1):
            self.to_screen(u'[download] Downloading video #%s of %s' % (i, n_entries))
            extra = {
                'playlist': playlist,
                'playlist_index': i + playliststart,
            }
            if 'extractor' not in entry:
                # We set the extractor, if it's an url it will be set then to
                # the new extractor, but if it's already a video we must make
                # sure it's present: see issue #877
                entry['extractor'] = ie_result['extractor']
            entry_result = self.process_ie_result(entry,
                                                  download=download,
                                                  extra_info=extra)
            playlist_results.append(entry_result)
        ie_result['entries'] = playlist_results
        return ie_result
    elif result_type == 'compat_list':
        def _fixup(r):
            r.setdefault('extractor', ie_result['extractor'])
            return r
        ie_result['entries'] = [
            self.process_ie_result(_fixup(r), download=download)
            for r in ie_result['entries']
        ]
        return ie_result
    else:
        raise Exception('Invalid result type: %s' % result_type)
407
def process_info(self, info_dict):
    """Process a single resolved IE result.

    info_dict is modified in place ('fulltitle', a title cut to 200
    characters, 'stitle' and a default 'format' are set). Depending on
    the options the method then prints the forced fields, writes the
    description, subtitles, metadata JSON and thumbnail files, downloads
    the video and runs the postprocessors.

    Returns None. It returns early when the entry is filtered out by
    _match_entry, in simulate mode, when no filename could be generated
    or after an error has been reported.

    Raises MaxDownloadsReached when the download counter exceeds
    'max_downloads' and UnavailableVideoError when the download fails
    with an OSError/IOError.
    """

    assert info_dict.get('_type', 'video') == 'video'
    # The download count is incremented here to match the previous behaviour.
    self.increment_downloads()

    info_dict['fulltitle'] = info_dict['title']
    if len(info_dict['title']) > 200:
        info_dict['title'] = info_dict['title'][:197] + u'...'

    # Keep for backwards compatibility
    info_dict['stitle'] = info_dict['title']

    if 'format' not in info_dict:
        info_dict['format'] = info_dict['ext']

    reason = self._match_entry(info_dict)
    if reason is not None:
        self.to_screen(u'[download] ' + reason)
        return

    max_downloads = self.params.get('max_downloads')
    if max_downloads is not None:
        if self._num_downloads > int(max_downloads):
            raise MaxDownloadsReached()

    filename = self.prepare_filename(info_dict)

    # Forced printings
    if self.params.get('forcetitle', False):
        compat_print(info_dict['title'])
    if self.params.get('forceid', False):
        compat_print(info_dict['id'])
    if self.params.get('forceurl', False):
        compat_print(info_dict['url'])
    if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
        compat_print(info_dict['thumbnail'])
    if self.params.get('forcedescription', False) and 'description' in info_dict:
        compat_print(info_dict['description'])
    if self.params.get('forcefilename', False) and filename is not None:
        compat_print(filename)
    if self.params.get('forceformat', False):
        compat_print(info_dict['format'])

    # Do nothing else if in simulate mode
    if self.params.get('simulate', False):
        return

    if filename is None:
        return

    try:
        dn = os.path.dirname(encodeFilename(filename))
        if dn != '' and not os.path.exists(dn):
            os.makedirs(dn)
    except (OSError, IOError) as err:
        self.report_error(u'unable to create directory ' + compat_str(err))
        return

    if self.params.get('writedescription', False):
        descfn = filename + u'.description'
        try:
            self.report_writedescription(descfn)
            with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
                descfile.write(info_dict['description'])
        except (OSError, IOError):
            self.report_error(u'Cannot write description file ' + descfn)
            return

    def write_subtitle(subtitle, sub_format):
        # Write one (error, language, data) subtitle triple next to the
        # video file. Returns False only when writing the file failed.
        (sub_error, sub_lang, sub) = subtitle
        if sub_error:
            self.report_warning("Some error while getting the subtitles")
            return True
        sub_filename = filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
        try:
            self.report_writesubtitles(sub_filename)
            with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
                subfile.write(sub)
        except (OSError, IOError):
            # Name the subtitles file here: the description filename that
            # was used before is unrelated and may not even be defined.
            self.report_error(u'Cannot write subtitles file ' + sub_filename)
            return False
        return True

    has_subtitles = 'subtitles' in info_dict and info_dict['subtitles']

    if (self.params.get('writesubtitles', False) or self.params.get('writeautomaticsub')) and has_subtitles:
        # subtitles download errors are already managed as troubles in relevant IE
        # that way it will silently go on when used with unsupporting IE
        if not write_subtitle(info_dict['subtitles'][0], self.params.get('subtitlesformat')):
            return

    if self.params.get('allsubtitles', False) and has_subtitles:
        sub_format = self.params.get('subtitlesformat')
        for subtitle in info_dict['subtitles']:
            if not write_subtitle(subtitle, sub_format):
                return

    if self.params.get('writeinfojson', False):
        infofn = filename + u'.info.json'
        self.report_writeinfojson(infofn)
        try:
            # 'urlhandle' is left out of the dumped metadata.
            json_info_dict = dict((k, v) for k, v in info_dict.items() if k not in ['urlhandle'])
            write_json_file(json_info_dict, encodeFilename(infofn))
        except (OSError, IOError):
            self.report_error(u'Cannot write metadata to JSON file ' + infofn)
            return

    if self.params.get('writethumbnail', False):
        if 'thumbnail' in info_dict:
            thumb_format = info_dict['thumbnail'].rpartition(u'/')[2].rpartition(u'.')[2]
            if not thumb_format:
                thumb_format = 'jpg'
            thumb_filename = filename.rpartition('.')[0] + u'.' + thumb_format
            self.to_screen(u'[%s] %s: Downloading thumbnail ...' %
                           (info_dict['extractor'], info_dict['id']))
            uf = compat_urllib_request.urlopen(info_dict['thumbnail'])
            try:
                # encodeFilename for consistency with every other file
                # written by this method.
                with open(encodeFilename(thumb_filename), 'wb') as thumbf:
                    shutil.copyfileobj(uf, thumbf)
            finally:
                # Do not leak the connection, even if writing fails.
                uf.close()
            self.to_screen(u'[%s] %s: Writing thumbnail to: %s' %
                           (info_dict['extractor'], info_dict['id'], thumb_filename))

    if not self.params.get('skip_download', False):
        if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(filename)):
            success = True
        else:
            try:
                success = self.fd._do_download(filename, info_dict)
            except (OSError, IOError):
                raise UnavailableVideoError()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self.report_error(u'unable to download video data: %s' % str(err))
                return
            except (ContentTooShortError, ) as err:
                self.report_error(u'content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                return

        if success:
            try:
                self.post_process(filename, info_dict)
            except (PostProcessingError) as err:
                self.report_error(u'postprocessing: %s' % str(err))
                return
558
def download(self, url_list):
    """Download every URL in url_list and return the accumulated return code.

    Raises SameFileError when several URLs would be written to one fixed
    output template. MaxDownloadsReached is announced and re-raised.
    """
    several_urls = len(url_list) > 1
    if several_urls and self.fixed_template():
        raise SameFileError(self.params['outtmpl'])

    for url in url_list:
        try:
            # Extracting the information also downloads the videos.
            self.extract_info(url)
        except UnavailableVideoError:
            self.report_error(u'unable to download video')
        except MaxDownloadsReached:
            self.to_screen(u'[info] Maximum number of downloaded files reached.')
            raise

    return self._download_retcode
575
def post_process(self, filename, ie_info):
    """Run every registered postprocessor on the downloaded file.

    Each postprocessor gets a copy of ie_info extended with 'filepath'.
    The original file is deleted afterwards when the postprocessors'
    verdict is False and the 'keepvideo' option is not set.
    """
    info = dict(ie_info)
    info['filepath'] = filename

    keep_video = None
    for pp in self._pps:
        try:
            wish, _ = pp.run(info)
        except PostProcessingError as e:
            self.to_stderr(u'ERROR: ' + e.msg)
            continue
        if wish is None:
            continue
        if wish:
            # A wish to keep the file always wins.
            keep_video = wish
        elif keep_video is None:
            # No clear decision yet, let IE decide
            keep_video = wish

    if keep_video is False and not self.params.get('keepvideo', False):
        try:
            self.to_screen(u'Deleting original file %s (pass -k to keep)' % filename)
            os.remove(encodeFilename(filename))
        except (IOError, OSError):
            self.report_warning(u'Unable to remove downloaded video file')