debian/control: Add recommends on aria2 | wget | curl to use external downloaders.

[youtubedl] / youtube_dl / YoutubeDL.py
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py

index 772fddd4542f6726b57878a950410c90ee1a640c..9605f8f99ac04bf72032b1cc8a64635bb6c2a8b8 100755 (executable)
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -10,6 +10,7 @@ import io
  import itertools
  import json
  import locale
+import operator
  import os
  import platform
  import re
@@ -24,6 +25,7 @@ if os.name == 'nt':
      import ctypes
  
  from .compat import (
+    compat_basestring,
      compat_cookiejar,
      compat_expanduser,
      compat_http_client,
@@ -49,11 +51,14 @@ from .utils import (
      make_HTTPS_handler,
      MaxDownloadsReached,
      PagedList,
+    parse_filesize,
      PostProcessingError,
      platform_name,
      preferredencoding,
+    render_table,
      SameFileError,
      sanitize_filename,
+    std_headers,
      subtitles_filename,
      takewhile_inclusive,
      UnavailableVideoError,
@@ -71,6 +76,7 @@ from .extractor import get_info_extractor, gen_extractors
  from .downloader import get_suitable_downloader
  from .downloader.rtmp import rtmpdump_version
  from .postprocessor import (
+    FFmpegFixupM4aPP,
      FFmpegFixupStretchedPP,
      FFmpegMergerPP,
      FFmpegPostProcessor,
@@ -132,6 +138,7 @@ class YoutubeDL(object):
      nooverwrites:      Prevent overwriting files.
      playliststart:     Playlist item to start at.
      playlistend:       Playlist item to end at.
+    playlist_items:    Specific indices of playlist to download.
      playlistreverse:   Download playlist items in reverse order.
      matchtitle:        Download only matching titles.
      rejecttitle:       Reject downloads for matching titles.
@@ -141,6 +148,7 @@ class YoutubeDL(object):
      writeinfojson:     Write the video description to a .info.json file
      writeannotations:  Write the video annotations to a .annotations.xml file
      writethumbnail:    Write the thumbnail image to a file
+    write_all_thumbnails:  Write all thumbnail formats to files
      writesubtitles:    Write the video subtitles to a file
      writeautomaticsub: Write the automatic subtitles to a file
      allsubtitles:      Downloads all the subtitles of the video
@@ -191,11 +199,12 @@ class YoutubeDL(object):
                         postprocessor.
      progress_hooks:    A list of functions that get called on download
                         progress, with a dictionary with the entries
-                       * filename: The final filename
-                       * status: One of "downloading" and "finished"
-
-                       The dict may also have some of the following entries:
+                       * status: One of "downloading" and "finished".
+                                 Check this first and ignore unknown values.
  
+                       If status is one of "downloading" or "finished", the
+                       following properties may also be present:
+                       * filename: The final filename (always present)
                         * downloaded_bytes: Bytes on disk
                         * total_bytes: Size of the whole file, None if unknown
                         * tmpfilename: The filename we're currently writing to
@@ -211,16 +220,21 @@ class YoutubeDL(object):
                         - "never": do nothing
                         - "warn": only emit a warning
                         - "detect_or_warn": check whether we can do anything
-                                           about it, warn otherwise
+                                           about it, warn otherwise (default)
      source_address:    (Experimental) Client-side IP address to bind to.
      call_home:         Boolean, true iff we are allowed to contact the
                         youtube-dl servers for debugging.
+    sleep_interval:    Number of seconds to sleep before each download.
+    external_downloader:  Executable of the external downloader to call.
+    listformats:       Print an overview of available video formats and exit.
+    list_thumbnails:   Print a table of all thumbnails and exit.
  
  
      The following parameters are not used by YoutubeDL itself, they are used by
      the FileDownloader:
      nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
-    noresizebuffer, retries, continuedl, noprogress, consoletitle
+    noresizebuffer, retries, continuedl, noprogress, consoletitle,
+    xattr_set_filesize.
  
      The following options are used by the post processors:
      prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available,
@@ -530,6 +544,11 @@ class YoutubeDL(object):
              outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
              tmpl = compat_expanduser(outtmpl)
              filename = tmpl % template_dict
+            # Temporary fix for #4787
+            # 'Treat' all problem characters by passing filename through preferredencoding
+            # to workaround encoding issues with subprocess on python2 @ Windows
+            if sys.version_info < (3, 0) and sys.platform == 'win32':
+                filename = encodeFilename(filename, True).decode(preferredencoding())
              return filename
          except ValueError as err:
              self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
@@ -693,24 +712,51 @@ class YoutubeDL(object):
              if playlistend == -1:
                  playlistend = None
  
+            playlistitems_str = self.params.get('playlist_items', None)
+            playlistitems = None
+            if playlistitems_str is not None:
+                def iter_playlistitems(spec):
+                    # Expand a comma-separated list of playlist positions and
+                    # inclusive "start-end" ranges, e.g. "1-3,7" -> 1, 2, 3, 7.
+                    # Malformed segments raise ValueError (from int() or from
+                    # unpacking the result of split('-')).
+                    for segment in spec.split(','):
+                        if '-' in segment:
+                            start, end = segment.split('-')
+                            # range() already yields ints; no re-conversion needed
+                            for index in range(int(start), int(end) + 1):
+                                yield index
+                        else:
+                            yield int(segment)
+                playlistitems = iter_playlistitems(playlistitems_str)
+
              ie_entries = ie_result['entries']
              if isinstance(ie_entries, list):
                  n_all_entries = len(ie_entries)
-                entries = ie_entries[playliststart:playlistend]
+                if playlistitems:
+                    entries = [ie_entries[i - 1] for i in playlistitems]
+                else:
+                    entries = ie_entries[playliststart:playlistend]
                  n_entries = len(entries)
                  self.to_screen(
                      "[%s] playlist %s: Collected %d video ids (downloading %d of them)" %
                      (ie_result['extractor'], playlist, n_all_entries, n_entries))
              elif isinstance(ie_entries, PagedList):
-                entries = ie_entries.getslice(
-                    playliststart, playlistend)
+                if playlistitems:
+                    entries = []
+                    for item in playlistitems:
+                        entries.extend(ie_entries.getslice(
+                            item - 1, item
+                        ))
+                else:
+                    entries = ie_entries.getslice(
+                        playliststart, playlistend)
                  n_entries = len(entries)
                  self.to_screen(
                      "[%s] playlist %s: Downloading %d videos" %
                      (ie_result['extractor'], playlist, n_entries))
              else:  # iterable
-                entries = list(itertools.islice(
-                    ie_entries, playliststart, playlistend))
+                if playlistitems:
+                    entry_list = list(ie_entries)
+                    entries = [entry_list[i - 1] for i in playlistitems]
+                else:
+                    entries = list(itertools.islice(
+                        ie_entries, playliststart, playlistend))
                  n_entries = len(entries)
                  self.to_screen(
                      "[%s] playlist %s: Downloading %d videos" %
@@ -768,7 +814,59 @@ class YoutubeDL(object):
          else:
              raise Exception('Invalid result type: %s' % result_type)
  
+    def _apply_format_filter(self, format_spec, available_formats):
+        """Apply the trailing "[key op value]" filter of format_spec.
+
+        Only the last bracketed filter is consumed per call, because the
+        pattern is anchored at the end of the string.
+
+        Returns a tuple of the remaining format_spec (the input with that
+        filter removed, or 'best' if nothing is left) and the list of
+        entries of available_formats that pass the filter.
+
+        Raises ValueError if format_spec does not end in a recognized filter
+        or if the filter value cannot be parsed.
+        """
+
+        OPERATORS = {
+            '<': operator.lt,
+            '<=': operator.le,
+            '>': operator.gt,
+            '>=': operator.ge,
+            '=': operator.eq,
+            '!=': operator.ne,
+        }
+        # One filter at the very end of the spec: a numeric field name, a
+        # comparison operator, an optional "?" and a number that may carry a
+        # size suffix.
+        operator_rex = re.compile(r'''(?x)\s*\[
+            (?P<key>width|height|tbr|abr|vbr|filesize|fps)
+            \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
+            (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
+            \]$
+            ''' % '|'.join(map(re.escape, OPERATORS.keys())))
+        m = operator_rex.search(format_spec)
+        if not m:
+            raise ValueError('Invalid format specification %r' % format_spec)
+
+        # Plain integers are used as-is; anything else is handed to
+        # parse_filesize, first verbatim and then with a "B" appended.
+        try:
+            comparison_value = int(m.group('value'))
+        except ValueError:
+            comparison_value = parse_filesize(m.group('value'))
+            if comparison_value is None:
+                comparison_value = parse_filesize(m.group('value') + 'B')
+            if comparison_value is None:
+                raise ValueError(
+                    'Invalid value %r in format specification %r' % (
+                        m.group('value'), format_spec))
+        op = OPERATORS[m.group('op')]
+
+        def _filter(f):
+            actual_value = f.get(m.group('key'))
+            if actual_value is None:
+                # Formats lacking the field pass only when "?" followed the
+                # operator (the group is None, i.e. falsy, otherwise).
+                return m.group('none_inclusive')
+            return op(actual_value, comparison_value)
+        new_formats = [f for f in available_formats if _filter(f)]
+
+        # Cut the matched filter off the end of the spec.
+        new_format_spec = format_spec[:-len(m.group(0))]
+        if not new_format_spec:
+            new_format_spec = 'best'
+
+        return (new_format_spec, new_formats)
+
      def select_format(self, format_spec, available_formats):
+        while format_spec.endswith(']'):
+            format_spec, available_formats = self._apply_format_filter(
+                format_spec, available_formats)
+        if not available_formats:
+            return None
+
          if format_spec == 'best' or format_spec is None:
              return available_formats[-1]
          elif format_spec == 'worst':
@@ -808,6 +906,42 @@ class YoutubeDL(object):
                  return matches[-1]
          return None
  
+    def _calc_headers(self, info_dict):
+        """Build the HTTP headers to use for info_dict.
+
+        Starts from a copy of std_headers, overlays info_dict's
+        'http_headers' entry (if any) and sets 'Cookie' when _calc_cookies
+        produces a non-empty value.
+        """
+        headers = std_headers.copy()
+        headers.update(info_dict.get('http_headers') or {})
+
+        cookie_header = self._calc_cookies(info_dict)
+        if cookie_header:
+            headers['Cookie'] = cookie_header
+        return headers
+
+    def _calc_cookies(self, info_dict):
+        """Return the 'Cookie' header value that self.cookiejar adds for
+        info_dict['url'], or None if it adds none."""
+        # Stand-in request object exposing only the members defined below;
+        # the cookie jar is expected to store the header through
+        # add_unredirected_header.
+        # NOTE(review): confirm this covers everything add_cookie_header
+        # reads from a request on all supported Python versions.
+        class _PseudoRequest(object):
+            def __init__(self, url):
+                self.url = url
+                self.headers = {}
+                self.unverifiable = False
+
+            def add_unredirected_header(self, k, v):
+                self.headers[k] = v
+
+            def get_full_url(self):
+                return self.url
+
+            def is_unverifiable(self):
+                return self.unverifiable
+
+            def has_header(self, h):
+                return h in self.headers
+
+        pr = _PseudoRequest(info_dict['url'])
+        self.cookiejar.add_cookie_header(pr)
+        return pr.headers.get('Cookie')
+
      def process_video_result(self, info_dict, download=True):
          assert info_dict.get('_type', 'video') == 'video'
  
@@ -822,12 +956,19 @@ class YoutubeDL(object):
              info_dict['playlist_index'] = None
  
          thumbnails = info_dict.get('thumbnails')
+        if thumbnails is None:
+            thumbnail = info_dict.get('thumbnail')
+            if thumbnail:
+                info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
          if thumbnails:
              thumbnails.sort(key=lambda t: (
-                t.get('width'), t.get('height'), t.get('url')))
-            for t in thumbnails:
+                t.get('preference'), t.get('width'), t.get('height'),
+                t.get('id'), t.get('url')))
+            for i, t in enumerate(thumbnails):
                  if 'width' in t and 'height' in t:
                      t['resolution'] = '%dx%d' % (t['width'], t['height'])
+                if t.get('id') is None:
+                    t['id'] = '%d' % i
  
          if thumbnails and 'thumbnail' not in info_dict:
              info_dict['thumbnail'] = thumbnails[-1]['url']
@@ -876,6 +1017,11 @@ class YoutubeDL(object):
              # Automatically determine file extension if missing
              if 'ext' not in format:
                  format['ext'] = determine_ext(format['url']).lower()
+            # Add HTTP headers, so that external programs can use them from the
+            # json output
+            full_format_info = info_dict.copy()
+            full_format_info.update(format)
+            format['http_headers'] = self._calc_headers(full_format_info)
  
          format_limit = self.params.get('format_limit', None)
          if format_limit:
@@ -891,9 +1037,12 @@ class YoutubeDL(object):
              # element in the 'formats' field in info_dict is info_dict itself,
              # which can't be exported to json
              info_dict['formats'] = formats
-        if self.params.get('listformats', None):
+        if self.params.get('listformats'):
              self.list_formats(info_dict)
              return
+        if self.params.get('list_thumbnails'):
+            self.list_thumbnails(info_dict)
+            return
  
          req_format = self.params.get('format')
          if req_format is None:
@@ -927,8 +1076,10 @@ class YoutubeDL(object):
                                  else self.params['merge_output_format'])
                              selected_format = {
                                  'requested_formats': formats_info,
-                                'format': rf,
-                                'ext': formats_info[0]['ext'],
+                                'format': '%s+%s' % (formats_info[0].get('format'),
+                                                     formats_info[1].get('format')),
+                                'format_id': '%s+%s' % (formats_info[0].get('format_id'),
+                                                        formats_info[1].get('format_id')),
                                  'width': formats_info[0].get('width'),
                                  'height': formats_info[0].get('height'),
                                  'resolution': formats_info[0].get('resolution'),
@@ -989,7 +1140,7 @@ class YoutubeDL(object):
  
          self._num_downloads += 1
  
-        filename = self.prepare_filename(info_dict)
+        info_dict['_filename'] = filename = self.prepare_filename(info_dict)
  
          # Forced printings
          if self.params.get('forcetitle', False):
@@ -1014,10 +1165,7 @@ class YoutubeDL(object):
          if self.params.get('forceformat', False):
              self.to_stdout(info_dict['format'])
          if self.params.get('forcejson', False):
-            info_dict['_filename'] = filename
              self.to_stdout(json.dumps(info_dict))
-        if self.params.get('dump_single_json', False):
-            info_dict['_filename'] = filename
  
          # Do nothing else if in simulate mode
          if self.params.get('simulate', False):
@@ -1100,35 +1248,18 @@ class YoutubeDL(object):
                      self.report_error('Cannot write metadata to JSON file ' + infofn)
                      return
  
-        if self.params.get('writethumbnail', False):
-            if info_dict.get('thumbnail') is not None:
-                thumb_format = determine_ext(info_dict['thumbnail'], 'jpg')
-                thumb_filename = os.path.splitext(filename)[0] + '.' + thumb_format
-                if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
-                    self.to_screen('[%s] %s: Thumbnail is already present' %
-                                   (info_dict['extractor'], info_dict['id']))
-                else:
-                    self.to_screen('[%s] %s: Downloading thumbnail ...' %
-                                   (info_dict['extractor'], info_dict['id']))
-                    try:
-                        uf = self.urlopen(info_dict['thumbnail'])
-                        with open(thumb_filename, 'wb') as thumbf:
-                            shutil.copyfileobj(uf, thumbf)
-                        self.to_screen('[%s] %s: Writing thumbnail to: %s' %
-                                       (info_dict['extractor'], info_dict['id'], thumb_filename))
-                    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-                        self.report_warning('Unable to download thumbnail "%s": %s' %
-                                            (info_dict['thumbnail'], compat_str(err)))
+        self._write_thumbnails(info_dict, filename)
  
          if not self.params.get('skip_download', False):
              try:
                  def dl(name, info):
-                    fd = get_suitable_downloader(info)(self, self.params)
+                    fd = get_suitable_downloader(info, self.params)(self, self.params)
                      for ph in self._progress_hooks:
                          fd.add_progress_hook(ph)
                      if self.params.get('verbose'):
                          self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
                      return fd.download(name, info)
+
                  if info_dict.get('requested_formats') is not None:
                      downloaded = []
                      success = True
@@ -1164,11 +1295,12 @@ class YoutubeDL(object):
  
              if success:
                  # Fixup content
+                fixup_policy = self.params.get('fixup')
+                if fixup_policy is None:
+                    fixup_policy = 'detect_or_warn'
+
                  stretched_ratio = info_dict.get('stretched_ratio')
                  if stretched_ratio is not None and stretched_ratio != 1:
-                    fixup_policy = self.params.get('fixup')
-                    if fixup_policy is None:
-                        fixup_policy = 'detect_or_warn'
                      if fixup_policy == 'warn':
                          self.report_warning('%s: Non-uniform pixel ratio (%s)' % (
                              info_dict['id'], stretched_ratio))
@@ -1182,7 +1314,23 @@ class YoutubeDL(object):
                                  '%s: Non-uniform pixel ratio (%s). Install ffmpeg or avconv to fix this automatically.' % (
                                      info_dict['id'], stretched_ratio))
                      else:
-                        assert fixup_policy == 'ignore'
+                        assert fixup_policy in ('ignore', 'never')
+
+                if info_dict.get('requested_formats') is None and info_dict.get('container') == 'm4a_dash':
+                    if fixup_policy == 'warn':
+                        self.report_warning('%s: writing DASH m4a. Only some players support this container.' % (
+                            info_dict['id']))
+                    elif fixup_policy == 'detect_or_warn':
+                        fixup_pp = FFmpegFixupM4aPP(self)
+                        if fixup_pp.available:
+                            info_dict.setdefault('__postprocessors', [])
+                            info_dict['__postprocessors'].append(fixup_pp)
+                        else:
+                            self.report_warning(
+                                '%s: writing DASH m4a. Only some players support this container. Install ffmpeg or avconv to fix this automatically.' % (
+                                    info_dict['id']))
+                    else:
+                        assert fixup_policy in ('ignore', 'never')
  
                  try:
                      self.post_process(filename, info_dict)
@@ -1384,8 +1532,26 @@ class YoutubeDL(object):
          header_line = line({
              'format_id': 'format code', 'ext': 'extension',
              'resolution': 'resolution', 'format_note': 'note'}, idlen=idlen)
-        self.to_screen('[info] Available formats for %s:\n%s\n%s' %
-                       (info_dict['id'], header_line, '\n'.join(formats_s)))
+        self.to_screen(
+            '[info] Available formats for %s:\n%s\n%s' %
+            (info_dict['id'], header_line, '\n'.join(formats_s)))
+
+    def list_thumbnails(self, info_dict):
+        """Print a table (ID, width, height, URL) of info_dict's thumbnails.
+
+        Falls back to the single 'thumbnail' URL, listed with id '0', when
+        'thumbnails' is missing or empty; prints a notice and returns if
+        neither is available.
+        """
+        entries = info_dict.get('thumbnails')
+        if not entries:
+            single_url = info_dict.get('thumbnail')
+            if not single_url:
+                self.to_screen(
+                    '[info] No thumbnails present for %s' % info_dict['id'])
+                return
+            entries = [{'id': '0', 'url': single_url}]
+
+        self.to_screen(
+            '[info] Thumbnails for %s:' % info_dict['id'])
+
+        header = ['ID', 'width', 'height', 'URL']
+        rows = []
+        for thumb in entries:
+            rows.append([
+                thumb['id'],
+                thumb.get('width', 'unknown'),
+                thumb.get('height', 'unknown'),
+                thumb['url'],
+            ])
+        self.to_screen(render_table(header, rows))
  
      def urlopen(self, req):
          """ Start an HTTP download """
@@ -1396,7 +1562,7 @@ class YoutubeDL(object):
          # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
          # To work around aforementioned issue we will replace request's original URL with
          # percent-encoded one
-        req_is_string = isinstance(req, basestring if sys.version_info < (3, 0) else compat_str)
+        req_is_string = isinstance(req, compat_basestring)
          url = req if req_is_string else req.get_full_url()
          url_escaped = escape_url(url)
  
@@ -1531,3 +1697,39 @@ class YoutubeDL(object):
          if encoding is None:
              encoding = preferredencoding()
          return encoding
+
+    def _write_thumbnails(self, info_dict, filename):
+        """Download thumbnail image(s) of info_dict next to filename.
+
+        With the 'writethumbnail' param only the last entry of
+        info_dict['thumbnails'] is fetched; otherwise, with
+        'write_all_thumbnails', every entry is.  Does nothing when neither
+        param is set or no thumbnails are listed.
+
+        Each file is named after filename with its extension replaced by the
+        thumbnail's; when several thumbnails are written, '_<id>' is
+        inserted before the extension.  With 'nooverwrites', files that
+        already exist are left alone.  Download failures are reported as
+        warnings and do not abort the remaining thumbnails.
+        """
+        if self.params.get('writethumbnail', False):
+            thumbnails = info_dict.get('thumbnails')
+            if thumbnails:
+                thumbnails = [thumbnails[-1]]
+        elif self.params.get('write_all_thumbnails', False):
+            thumbnails = info_dict.get('thumbnails')
+        else:
+            return
+
+        if not thumbnails:
+            # No thumbnails present, so return immediately
+            return
+
+        # Loop-invariant: ids are only needed to tell several files apart.
+        multiple = len(thumbnails) > 1
+        base_filename = os.path.splitext(filename)[0]
+
+        for t in thumbnails:
+            thumb_ext = determine_ext(t['url'], 'jpg')
+            suffix = '_%s' % t['id'] if multiple else ''
+            thumb_display_id = '%s ' % t['id'] if multiple else ''
+            thumb_filename = base_filename + suffix + '.' + thumb_ext
+
+            if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(thumb_filename)):
+                self.to_screen('[%s] %s: Thumbnail %sis already present' %
+                               (info_dict['extractor'], info_dict['id'], thumb_display_id))
+            else:
+                self.to_screen('[%s] %s: Downloading thumbnail %s...' %
+                               (info_dict['extractor'], info_dict['id'], thumb_display_id))
+                try:
+                    uf = self.urlopen(t['url'])
+                    try:
+                        # Encode the name the same way as the existence check
+                        # above so both refer to the same path.
+                        with open(encodeFilename(thumb_filename), 'wb') as thumbf:
+                            shutil.copyfileobj(uf, thumbf)
+                    finally:
+                        # Release the connection even if writing fails.
+                        uf.close()
+                    self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
+                                   (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))
+                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+                    self.report_warning('Unable to download thumbnail "%s": %s' %
+                                        (t['url'], compat_str(err)))