Prepare to release.

[youtubedl] / youtube_dl / YoutubeDL.py
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py

index 5036289b062f9ee05ab5c872c76ab38babcb4a54..2a405c5cac386d67b11dc0489dccf4ee68951c53 100755 (executable)
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -1,10 +1,11 @@
  #!/usr/bin/env python
-# -*- coding: utf-8 -*-
+# coding: utf-8
  
  from __future__ import absolute_import, unicode_literals
  
  import collections
  import contextlib
+import copy
  import datetime
  import errno
  import fileinput
@@ -23,14 +24,17 @@ import sys
  import time
  import tokenize
  import traceback
+import random
+
+from string import ascii_letters
  
  from .compat import (
      compat_basestring,
      compat_cookiejar,
-    compat_expanduser,
      compat_get_terminal_size,
      compat_http_client,
      compat_kwargs,
+    compat_numeric_types,
      compat_os_name,
      compat_str,
      compat_tokenize_tokenize,
@@ -51,12 +55,17 @@ from .utils import (
      encode_compat_str,
      encodeFilename,
      error_to_compat_str,
+    expand_path,
      ExtractorError,
      format_bytes,
      formatSeconds,
+    GeoRestrictedError,
+    int_or_none,
+    ISO3166Utils,
      locked_file,
      make_HTTPS_handler,
      MaxDownloadsReached,
+    orderedSet,
      PagedList,
      parse_filesize,
      PerRequestProxyHandler,
@@ -84,6 +93,7 @@ from .utils import (
  )
  from .cache import Cache
  from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER
+from .extractor.openload import PhantomJSwrapper
  from .downloader import get_suitable_downloader
  from .downloader.rtmp import rtmpdump_version
  from .postprocessor import (
@@ -130,6 +140,9 @@ class YoutubeDL(object):
      username:          Username for authentication purposes.
      password:          Password for authentication purposes.
      videopassword:     Password for accessing a video.
+    ap_mso:            Adobe Pass multiple-system operator identifier.
+    ap_username:       Multiple-system operator account username.
+    ap_password:       Multiple-system operator account password.
      usenetrc:          Use netrc for authentication instead.
      verbose:           Print additional info to stdout.
      quiet:             Do not print messages to stdout.
@@ -155,6 +168,7 @@ class YoutubeDL(object):
      playlistend:       Playlist item to end at.
      playlist_items:    Specific indices of playlist to download.
      playlistreverse:   Download playlist items in reverse order.
+    playlistrandom:    Download playlist items in random order.
      matchtitle:        Download only matching titles.
      rejecttitle:       Reject downloads for matching titles.
      logger:            Log messages to a logging.Logger instance.
@@ -196,8 +210,8 @@ class YoutubeDL(object):
      prefer_insecure:   Use HTTP instead of HTTPS to retrieve information.
                         At the moment, this is only supported by YouTube.
      proxy:             URL of the proxy server to use
-    cn_verification_proxy:  URL of the proxy to use for IP address verification
-                       on Chinese sites. (Experimental)
+    geo_verification_proxy:  URL of the proxy to use for IP address verification
+                       on geo-restricted sites.
      socket_timeout:    Time to wait for unresponsive hosts, in seconds
      bidi_workaround:   Work around buggy terminals without bidirectional text
                         support, using fridibi
@@ -245,10 +259,19 @@ class YoutubeDL(object):
                         - "warn": only emit a warning
                         - "detect_or_warn": check whether we can do anything
                                             about it, warn otherwise (default)
-    source_address:    (Experimental) Client-side IP address to bind to.
+    source_address:    Client-side IP address to bind to.
      call_home:         Boolean, true iff we are allowed to contact the
                         youtube-dl servers for debugging.
-    sleep_interval:    Number of seconds to sleep before each download.
+    sleep_interval:    Number of seconds to sleep before each download when
+                       used alone or a lower bound of a range for randomized
+                       sleep before each download (minimum possible number
+                       of seconds to sleep) when used along with
+                       max_sleep_interval.
+    max_sleep_interval:Upper bound of a range for randomized sleep before each
+                       download (maximum possible number of seconds to sleep).
+                       Must only be used along with sleep_interval.
+                       Actual sleep time will be a random float from range
+                       [sleep_interval; max_sleep_interval].
      listformats:       Print an overview of available video formats and exit.
      list_thumbnails:   Print a table of all thumbnails and exit.
      match_filter:      A function that gets called with the info_dict of
@@ -257,6 +280,15 @@ class YoutubeDL(object):
                         If it returns None, the video is downloaded.
                         match_filter_func in utils.py is one example for this.
      no_color:          Do not emit color codes in output.
+    geo_bypass:        Bypass geographic restriction via faking X-Forwarded-For
+                       HTTP header
+    geo_bypass_country:
+                       Two-letter ISO 3166-2 country code that will be used for
+                       explicit geographic restriction bypassing via faking
+                       X-Forwarded-For HTTP header
+    geo_bypass_ip_block:
+                       IP range in CIDR notation that will be used similarly to
+                       geo_bypass_country
  
      The following options determine which downloader is picked:
      external_downloader: Executable of the external downloader to call.
@@ -269,15 +301,33 @@ class YoutubeDL(object):
      the downloader (see youtube_dl/downloader/common.py):
      nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
      noresizebuffer, retries, continuedl, noprogress, consoletitle,
-    xattr_set_filesize, external_downloader_args, hls_use_mpegts.
+    xattr_set_filesize, external_downloader_args, hls_use_mpegts,
+    http_chunk_size.
  
      The following options are used by the post processors:
      prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available,
                         otherwise prefer avconv.
      postprocessor_args: A list of additional command-line arguments for the
                          postprocessor.
+
+    The following options are used by the Youtube extractor:
+    youtube_include_dash_manifest: If True (default), DASH manifests and related
+                        data will be downloaded and processed by extractor.
+                        You can reduce network I/O by disabling it if you don't
+                        care about DASH.
      """
  
+    _NUMERIC_FIELDS = set((
+        'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
+        'timestamp', 'upload_year', 'upload_month', 'upload_day',
+        'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
+        'average_rating', 'comment_count', 'age_limit',
+        'start_time', 'end_time',
+        'chapter_number', 'season_number', 'episode_number',
+        'track_number', 'disc_number', 'release_year',
+        'playlist_index',
+    ))
+
      params = None
      _ies = []
      _pps = []
@@ -304,6 +354,21 @@ class YoutubeDL(object):
          self.params.update(params)
          self.cache = Cache(self)
  
+        def check_deprecated(param, option, suggestion):
+            if self.params.get(param) is not None:
+                self.report_warning(
+                    '%s is deprecated. Use %s instead.' % (option, suggestion))
+                return True
+            return False
+
+        if check_deprecated('cn_verification_proxy', '--cn-verification-proxy', '--geo-verification-proxy'):
+            if self.params.get('geo_verification_proxy') is None:
+                self.params['geo_verification_proxy'] = self.params['cn_verification_proxy']
+
+        check_deprecated('autonumber_size', '--autonumber-size', 'output template with %(autonumber)0Nd, where N in the number of digits')
+        check_deprecated('autonumber', '--auto-number', '-o "%(autonumber)s-%(title)s.%(ext)s"')
+        check_deprecated('usetitle', '--title', '-o "%(title)s-%(id)s.%(ext)s"')
+
          if params.get('bidi_workaround', False):
              try:
                  import pty
@@ -331,10 +396,10 @@ class YoutubeDL(object):
                  else:
                      raise
  
-        if (sys.version_info >= (3,) and sys.platform != 'win32' and
+        if (sys.platform != 'win32' and
                  sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and
                  not params.get('restrictfilenames', False)):
-            # On Python 3, the Unicode filesystem API will throw errors (#1474)
+            # Unicode filesystem API will throw errors (#1474, #13027)
              self.report_warning(
                  'Assuming --restrict-filenames since file system encoding '
                  'cannot encode all characters. '
@@ -459,24 +524,29 @@ class YoutubeDL(object):
      def to_console_title(self, message):
          if not self.params.get('consoletitle', False):
              return
-        if compat_os_name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
-            # c_wchar_p() might not be necessary if `message` is
-            # already of type unicode()
-            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
+        if compat_os_name == 'nt':
+            if ctypes.windll.kernel32.GetConsoleWindow():
+                # c_wchar_p() might not be necessary if `message` is
+                # already of type unicode()
+                ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
          elif 'TERM' in os.environ:
              self._write_string('\033]0;%s\007' % message, self._screen_file)
  
      def save_console_title(self):
          if not self.params.get('consoletitle', False):
              return
-        if 'TERM' in os.environ:
+        if self.params.get('simulate', False):
+            return
+        if compat_os_name != 'nt' and 'TERM' in os.environ:
              # Save the title on stack
              self._write_string('\033[22;0t', self._screen_file)
  
      def restore_console_title(self):
          if not self.params.get('consoletitle', False):
              return
-        if 'TERM' in os.environ:
+        if self.params.get('simulate', False):
+            return
+        if compat_os_name != 'nt' and 'TERM' in os.environ:
              # Restore the title from stack
              self._write_string('\033[23;0t', self._screen_file)
  
@@ -565,10 +635,7 @@ class YoutubeDL(object):
              autonumber_size = self.params.get('autonumber_size')
              if autonumber_size is None:
                  autonumber_size = 5
-            autonumber_templ = '%0' + str(autonumber_size) + 'd'
-            template_dict['autonumber'] = autonumber_templ % self._num_downloads
-            if template_dict.get('playlist_index') is not None:
-                template_dict['playlist_index'] = '%0*d' % (len(str(template_dict['n_entries'])), template_dict['playlist_index'])
+            template_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
              if template_dict.get('resolution') is None:
                  if template_dict.get('width') and template_dict.get('height'):
                      template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
@@ -580,15 +647,64 @@ class YoutubeDL(object):
              sanitize = lambda k, v: sanitize_filename(
                  compat_str(v),
                  restricted=self.params.get('restrictfilenames'),
-                is_id=(k == 'id'))
-            template_dict = dict((k, sanitize(k, v))
+                is_id=(k == 'id' or k.endswith('_id')))
+            template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v))
                                   for k, v in template_dict.items()
                                   if v is not None and not isinstance(v, (list, tuple, dict)))
              template_dict = collections.defaultdict(lambda: 'NA', template_dict)
  
              outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
-            tmpl = compat_expanduser(outtmpl)
-            filename = tmpl % template_dict
+
+            # For fields playlist_index and autonumber convert all occurrences
+            # of %(field)s to %(field)0Nd for backward compatibility
+            field_size_compat_map = {
+                'playlist_index': len(str(template_dict['n_entries'])),
+                'autonumber': autonumber_size,
+            }
+            FIELD_SIZE_COMPAT_RE = r'(?<!%)%\((?P<field>autonumber|playlist_index)\)s'
+            mobj = re.search(FIELD_SIZE_COMPAT_RE, outtmpl)
+            if mobj:
+                outtmpl = re.sub(
+                    FIELD_SIZE_COMPAT_RE,
+                    r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')],
+                    outtmpl)
+
+            # Missing numeric fields used together with integer presentation types
+            # in format specification will break the argument substitution since
+            # string 'NA' is returned for missing fields. We will patch output
+            # template for missing fields to meet string presentation type.
+            for numeric_field in self._NUMERIC_FIELDS:
+                if numeric_field not in template_dict:
+                    # As of [1] format syntax is:
+                    #  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
+                    # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
+                    FORMAT_RE = r'''(?x)
+                        (?<!%)
+                        %
+                        \({0}\)  # mapping key
+                        (?:[#0\-+ ]+)?  # conversion flags (optional)
+                        (?:\d+)?  # minimum field width (optional)
+                        (?:\.\d+)?  # precision (optional)
+                        [hlL]?  # length modifier (optional)
+                        [diouxXeEfFgGcrs%]  # conversion type
+                    '''
+                    outtmpl = re.sub(
+                        FORMAT_RE.format(numeric_field),
+                        r'%({0})s'.format(numeric_field), outtmpl)
+
+            # expand_path translates '%%' into '%' and '$$' into '$'
+            # correspondingly that is not what we want since we need to keep
+            # '%%' intact for template dict substitution step. Working around
+            # with boundary-alike separator hack.
+            sep = ''.join([random.choice(ascii_letters) for _ in range(32)])
+            outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep))
+
+            # outtmpl should be expand_path'ed before template dict substitution
+            # because meta fields may contain env variables we don't want to
+            # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and
+            # title "Hello $PATH", we don't want `$PATH` to be expanded.
+            filename = expand_path(outtmpl).replace(sep, '') % template_dict
+
              # Temporary fix for #4787
              # 'Treat' all problem characters by passing filename through preferredencoding
              # to workaround encoding issues with subprocess on python2 @ Windows
@@ -687,6 +803,14 @@ class YoutubeDL(object):
                      return self.process_ie_result(ie_result, download, extra_info)
                  else:
                      return ie_result
+            except GeoRestrictedError as e:
+                msg = e.msg
+                if e.countries:
+                    msg += '\nThis video is available in %s.' % ', '.join(
+                        map(ISO3166Utils.short2full, e.countries))
+                msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.'
+                self.report_error(msg)
+                break
              except ExtractorError as e:  # An error we somewhat expected
                  self.report_error(compat_str(e), e.format_traceback())
                  break
@@ -744,19 +868,32 @@ class YoutubeDL(object):
                  ie_result['url'], ie_key=ie_result.get('ie_key'),
                  extra_info=extra_info, download=False, process=False)
  
+            # extract_info may return None when ignoreerrors is enabled and
+            # extraction failed with an error, don't crash and return early
+            # in this case
+            if not info:
+                return info
+
              force_properties = dict(
                  (k, v) for k, v in ie_result.items() if v is not None)
-            for f in ('_type', 'url', 'ie_key'):
+            for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'):
                  if f in force_properties:
                      del force_properties[f]
              new_result = info.copy()
              new_result.update(force_properties)
  
-            assert new_result.get('_type') != 'url_transparent'
+            # Extracted info may not be a video result (i.e.
+            # info.get('_type', 'video') != video) but rather an url or
+            # url_transparent. In such cases outer metadata (from ie_result)
+            # should be propagated to inner one (info). For this to happen
+            # _type of info should be overridden with url_transparent. This
+            # fixes issue from https://github.com/rg3/youtube-dl/pull/11163.
+            if new_result.get('_type') == 'url':
+                new_result['_type'] = 'url_transparent'
  
              return self.process_ie_result(
                  new_result, download=download, extra_info=extra_info)
-        elif result_type == 'playlist' or result_type == 'multi_video':
+        elif result_type in ('playlist', 'multi_video'):
              # We process each entry in the playlist
              playlist = ie_result.get('title') or ie_result.get('id')
              self.to_screen('[download] Downloading playlist: %s' % playlist)
@@ -780,15 +917,25 @@ class YoutubeDL(object):
                                  yield int(item)
                          else:
                              yield int(string_segment)
-                playlistitems = iter_playlistitems(playlistitems_str)
+                playlistitems = orderedSet(iter_playlistitems(playlistitems_str))
  
              ie_entries = ie_result['entries']
+
+            def make_playlistitems_entries(list_ie_entries):
+                num_entries = len(list_ie_entries)
+                return [
+                    list_ie_entries[i - 1] for i in playlistitems
+                    if -num_entries <= i - 1 < num_entries]
+
+            def report_download(num_entries):
+                self.to_screen(
+                    '[%s] playlist %s: Downloading %d videos' %
+                    (ie_result['extractor'], playlist, num_entries))
+
              if isinstance(ie_entries, list):
                  n_all_entries = len(ie_entries)
                  if playlistitems:
-                    entries = [
-                        ie_entries[i - 1] for i in playlistitems
-                        if -n_all_entries <= i - 1 < n_all_entries]
+                    entries = make_playlistitems_entries(ie_entries)
                  else:
                      entries = ie_entries[playliststart:playlistend]
                  n_entries = len(entries)
@@ -806,31 +953,38 @@ class YoutubeDL(object):
                      entries = ie_entries.getslice(
                          playliststart, playlistend)
                  n_entries = len(entries)
-                self.to_screen(
-                    '[%s] playlist %s: Downloading %d videos' %
-                    (ie_result['extractor'], playlist, n_entries))
+                report_download(n_entries)
              else:  # iterable
                  if playlistitems:
-                    entry_list = list(ie_entries)
-                    entries = [entry_list[i - 1] for i in playlistitems]
+                    entries = make_playlistitems_entries(list(itertools.islice(
+                        ie_entries, 0, max(playlistitems))))
                  else:
                      entries = list(itertools.islice(
                          ie_entries, playliststart, playlistend))
                  n_entries = len(entries)
-                self.to_screen(
-                    '[%s] playlist %s: Downloading %d videos' %
-                    (ie_result['extractor'], playlist, n_entries))
+                report_download(n_entries)
  
              if self.params.get('playlistreverse', False):
                  entries = entries[::-1]
  
+            if self.params.get('playlistrandom', False):
+                random.shuffle(entries)
+
+            x_forwarded_for = ie_result.get('__x_forwarded_for_ip')
+
              for i, entry in enumerate(entries, 1):
                  self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
+                # This __x_forwarded_for_ip thing is a bit ugly but requires
+                # minimal changes
+                if x_forwarded_for:
+                    entry['__x_forwarded_for_ip'] = x_forwarded_for
                  extra = {
                      'n_entries': n_entries,
                      'playlist': playlist,
                      'playlist_id': ie_result.get('id'),
                      'playlist_title': ie_result.get('title'),
+                    'playlist_uploader': ie_result.get('uploader'),
+                    'playlist_uploader_id': ie_result.get('uploader_id'),
                      'playlist_index': i + playliststart,
                      'extractor': ie_result['extractor'],
                      'webpage_url': ie_result['webpage_url'],
@@ -886,7 +1040,7 @@ class YoutubeDL(object):
              '!=': operator.ne,
          }
          operator_rex = re.compile(r'''(?x)\s*
-            (?P<key>width|height|tbr|abr|vbr|asr|filesize|fps)
+            (?P<key>width|height|tbr|abr|vbr|asr|filesize|filesize_approx|fps)
              \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
              (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
              $
@@ -934,6 +1088,30 @@ class YoutubeDL(object):
              return op(actual_value, comparison_value)
          return _filter
  
+    def _default_format_spec(self, info_dict, download=True):
+
+        def can_merge():
+            merger = FFmpegMergerPP(self)
+            return merger.available and merger.can_merge()
+
+        def prefer_best():
+            if self.params.get('simulate', False):
+                return False
+            if not download:
+                return False
+            if self.params.get('outtmpl', DEFAULT_OUTTMPL) == '-':
+                return True
+            if info_dict.get('is_live'):
+                return True
+            if not can_merge():
+                return True
+            return False
+
+        req_format_list = ['bestvideo+bestaudio', 'best']
+        if prefer_best():
+            req_format_list.reverse()
+        return '/'.join(req_format_list)
+
      def build_format_selector(self, format_spec):
          def syntax_error(note, start):
              message = (
@@ -1046,9 +1224,9 @@ class YoutubeDL(object):
              if isinstance(selector, list):
                  fs = [_build_selector_function(s) for s in selector]
  
-                def selector_function(formats):
+                def selector_function(ctx):
                      for f in fs:
-                        for format in f(formats):
+                        for format in f(ctx):
                              yield format
                  return selector_function
              elif selector.type == GROUP:
@@ -1056,17 +1234,17 @@ class YoutubeDL(object):
              elif selector.type == PICKFIRST:
                  fs = [_build_selector_function(s) for s in selector.selector]
  
-                def selector_function(formats):
+                def selector_function(ctx):
                      for f in fs:
-                        picked_formats = list(f(formats))
+                        picked_formats = list(f(ctx))
                          if picked_formats:
                              return picked_formats
                      return []
              elif selector.type == SINGLE:
                  format_spec = selector.selector
  
-                def selector_function(formats):
-                    formats = list(formats)
+                def selector_function(ctx):
+                    formats = list(ctx['formats'])
                      if not formats:
                          return
                      if format_spec == 'all':
@@ -1079,9 +1257,10 @@ class YoutubeDL(object):
                              if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
                          if audiovideo_formats:
                              yield audiovideo_formats[format_idx]
-                        # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format
-                        elif (all(f.get('acodec') != 'none' for f in formats) or
-                              all(f.get('vcodec') != 'none' for f in formats)):
+                        # for extractors with incomplete formats (audio only (soundcloud)
+                        # or video only (imgur)) we will fallback to best/worst
+                        # {video,audio}-only format
+                        elif ctx['incomplete_formats']:
                              yield formats[format_idx]
                      elif format_spec == 'bestaudio':
                          audio_formats = [
@@ -1155,17 +1334,18 @@ class YoutubeDL(object):
                      }
                  video_selector, audio_selector = map(_build_selector_function, selector.selector)
  
-                def selector_function(formats):
-                    formats = list(formats)
-                    for pair in itertools.product(video_selector(formats), audio_selector(formats)):
+                def selector_function(ctx):
+                    for pair in itertools.product(
+                            video_selector(copy.deepcopy(ctx)), audio_selector(copy.deepcopy(ctx))):
                          yield _merge(pair)
  
              filters = [self._build_format_filter(f) for f in selector.filters]
  
-            def final_selector(formats):
+            def final_selector(ctx):
+                ctx_copy = copy.deepcopy(ctx)
                  for _filter in filters:
-                    formats = list(filter(_filter, formats))
-                return selector_function(formats)
+                    ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
+                return selector_function(ctx_copy)
              return final_selector
  
          stream = io.BytesIO(format_spec.encode('utf-8'))
@@ -1208,6 +1388,11 @@ class YoutubeDL(object):
          if cookies:
              res['Cookie'] = cookies
  
+        if 'X-Forwarded-For' not in res:
+            x_forwarded_for_ip = info_dict.get('__x_forwarded_for_ip')
+            if x_forwarded_for_ip:
+                res['X-Forwarded-For'] = x_forwarded_for_ip
+
          return res
  
      def _calc_cookies(self, info_dict):
@@ -1223,9 +1408,28 @@ class YoutubeDL(object):
          if 'title' not in info_dict:
              raise ExtractorError('Missing "title" field in extractor result')
  
-        if not isinstance(info_dict['id'], compat_str):
-            self.report_warning('"id" field is not a string - forcing string conversion')
-            info_dict['id'] = compat_str(info_dict['id'])
+        def report_force_conversion(field, field_not, conversion):
+            self.report_warning(
+                '"%s" field is not %s - forcing %s conversion, there is an error in extractor'
+                % (field, field_not, conversion))
+
+        def sanitize_string_field(info, string_field):
+            field = info.get(string_field)
+            if field is None or isinstance(field, compat_str):
+                return
+            report_force_conversion(string_field, 'a string', 'string')
+            info[string_field] = compat_str(field)
+
+        def sanitize_numeric_fields(info):
+            for numeric_field in self._NUMERIC_FIELDS:
+                field = info.get(numeric_field)
+                if field is None or isinstance(field, compat_numeric_types):
+                    continue
+                report_force_conversion(numeric_field, 'numeric', 'int')
+                info[numeric_field] = int_or_none(field)
+
+        sanitize_string_field(info_dict, 'id')
+        sanitize_numeric_fields(info_dict)
  
          if 'playlist' not in info_dict:
              # It isn't part of a playlist
@@ -1239,8 +1443,10 @@ class YoutubeDL(object):
                  info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
          if thumbnails:
              thumbnails.sort(key=lambda t: (
-                t.get('preference'), t.get('width'), t.get('height'),
-                t.get('id'), t.get('url')))
+                t.get('preference') if t.get('preference') is not None else -1,
+                t.get('width') if t.get('width') is not None else -1,
+                t.get('height') if t.get('height') is not None else -1,
+                t.get('id') if t.get('id') is not None else '', t.get('url')))
              for i, t in enumerate(thumbnails):
                  t['url'] = sanitize_url(t['url'])
                  if t.get('width') and t.get('height'):
@@ -1276,23 +1482,28 @@ class YoutubeDL(object):
              if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
                  info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
  
+        for cc_kind in ('subtitles', 'automatic_captions'):
+            cc = info_dict.get(cc_kind)
+            if cc:
+                for _, subtitle in cc.items():
+                    for subtitle_format in subtitle:
+                        if subtitle_format.get('url'):
+                            subtitle_format['url'] = sanitize_url(subtitle_format['url'])
+                        if subtitle_format.get('ext') is None:
+                            subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()
+
+        automatic_captions = info_dict.get('automatic_captions')
          subtitles = info_dict.get('subtitles')
-        if subtitles:
-            for _, subtitle in subtitles.items():
-                for subtitle_format in subtitle:
-                    if subtitle_format.get('url'):
-                        subtitle_format['url'] = sanitize_url(subtitle_format['url'])
-                    if 'ext' not in subtitle_format:
-                        subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()
  
          if self.params.get('listsubtitles', False):
              if 'automatic_captions' in info_dict:
-                self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions')
+                self.list_subtitles(
+                    info_dict['id'], automatic_captions, 'automatic captions')
              self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
              return
+
          info_dict['requested_subtitles'] = self.process_subtitles(
-            info_dict['id'], subtitles,
-            info_dict.get('automatic_captions'))
+            info_dict['id'], subtitles, automatic_captions)
  
          # We now pick which formats have to be downloaded
          if info_dict.get('formats') is None:
@@ -1304,20 +1515,32 @@ class YoutubeDL(object):
          if not formats:
              raise ExtractorError('No video formats found!')
  
+        def is_wellformed(f):
+            url = f.get('url')
+            if not url:
+                self.report_warning(
+                    '"url" field is missing or empty - skipping format, '
+                    'there is an error in extractor')
+                return False
+            if isinstance(url, bytes):
+                sanitize_string_field(f, 'url')
+            return True
+
+        # Filter out malformed formats for better extraction robustness
+        formats = list(filter(is_wellformed, formats))
+
          formats_dict = {}
  
          # We check that all the formats have the format and format_id fields
          for i, format in enumerate(formats):
-            if 'url' not in format:
-                raise ExtractorError('Missing "url" key in result (index %d)' % i)
-
+            sanitize_string_field(format, 'format_id')
+            sanitize_numeric_fields(format)
              format['url'] = sanitize_url(format['url'])
-
-            if format.get('format_id') is None:
+            if not format.get('format_id'):
                  format['format_id'] = compat_str(i)
              else:
                  # Sanitize format_id from characters used in format selector expression
-                format['format_id'] = re.sub('[\s,/+\[\]()]', '_', format['format_id'])
+                format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
              format_id = format['format_id']
              if format_id not in formats_dict:
                  formats_dict[format_id] = []
@@ -1337,17 +1560,20 @@ class YoutubeDL(object):
                      note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
                  )
              # Automatically determine file extension if missing
-            if 'ext' not in format:
+            if format.get('ext') is None:
                  format['ext'] = determine_ext(format['url']).lower()
              # Automatically determine protocol if missing (useful for format
              # selection purposes)
-            if 'protocol' not in format:
+            if format.get('protocol') is None:
                  format['protocol'] = determine_protocol(format)
              # Add HTTP headers, so that external programs can use them from the
              # json output
              full_format_info = info_dict.copy()
              full_format_info.update(format)
              format['http_headers'] = self._calc_headers(full_format_info)
+        # Remove private housekeeping stuff
+        if '__x_forwarded_for_ip' in info_dict:
+            del info_dict['__x_forwarded_for_ip']
  
          # TODO Central sorting goes here
  
@@ -1363,16 +1589,39 @@ class YoutubeDL(object):
  
          req_format = self.params.get('format')
          if req_format is None:
-            req_format_list = []
-            if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and
-                    not info_dict.get('is_live')):
-                merger = FFmpegMergerPP(self)
-                if merger.available and merger.can_merge():
-                    req_format_list.append('bestvideo+bestaudio')
-            req_format_list.append('best')
-            req_format = '/'.join(req_format_list)
+            req_format = self._default_format_spec(info_dict, download=download)
+            if self.params.get('verbose'):
+                self.to_stdout('[debug] Default format spec: %s' % req_format)
+
          format_selector = self.build_format_selector(req_format)
-        formats_to_download = list(format_selector(formats))
+
+        # While in format selection we may need to have an access to the original
+        # format set in order to calculate some metrics or do some processing.
+        # For now we need to be able to guess whether original formats provided
+        # by extractor are incomplete or not (i.e. whether extractor provides only
+        # video-only or audio-only formats) for proper formats selection for
+        # extractors with such incomplete formats (see
+        # https://github.com/rg3/youtube-dl/pull/5556).
+        # Since formats may be filtered during format selection and may not match
+        # the original formats the results may be incorrect. Thus original formats
+        # or pre-calculated metrics should be passed to format selection routines
+        # as well.
+        # We will pass a context object containing all necessary additional data
+        # instead of just formats.
+        # This fixes incorrect format selection issue (see
+        # https://github.com/rg3/youtube-dl/issues/10083).
+        incomplete_formats = (
+            # All formats are video-only or
+            all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) or
+            # all formats are audio-only
+            all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))
+
+        ctx = {
+            'formats': formats,
+            'incomplete_formats': incomplete_formats,
+        }
+
+        formats_to_download = list(format_selector(ctx))
          if not formats_to_download:
              raise ExtractorError('requested format not available',
                                   expected=True)
@@ -1495,12 +1744,17 @@ class YoutubeDL(object):
          if filename is None:
              return
  
-        try:
-            dn = os.path.dirname(sanitize_path(encodeFilename(filename)))
-            if dn and not os.path.exists(dn):
-                os.makedirs(dn)
-        except (OSError, IOError) as err:
-            self.report_error('unable to create directory ' + error_to_compat_str(err))
+        def ensure_dir_exists(path):
+            try:
+                dn = os.path.dirname(path)
+                if dn and not os.path.exists(dn):
+                    os.makedirs(dn)
+                return True
+            except (OSError, IOError) as err:
+                self.report_error('unable to create directory ' + error_to_compat_str(err))
+                return False
+
+        if not ensure_dir_exists(sanitize_path(encodeFilename(filename))):
              return
  
          if self.params.get('writedescription', False):
@@ -1543,27 +1797,30 @@ class YoutubeDL(object):
              ie = self.get_info_extractor(info_dict['extractor_key'])
              for sub_lang, sub_info in subtitles.items():
                  sub_format = sub_info['ext']
-                if sub_info.get('data') is not None:
-                    sub_data = sub_info['data']
+                sub_filename = subtitles_filename(filename, sub_lang, sub_format)
+                if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
+                    self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format))
                  else:
-                    try:
-                        sub_data = ie._download_webpage(
-                            sub_info['url'], info_dict['id'], note=False)
-                    except ExtractorError as err:
-                        self.report_warning('Unable to download subtitle for "%s": %s' %
-                                            (sub_lang, error_to_compat_str(err.cause)))
-                        continue
-                try:
-                    sub_filename = subtitles_filename(filename, sub_lang, sub_format)
-                    if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
-                        self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
+                    self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
+                    if sub_info.get('data') is not None:
+                        try:
+                            # Use newline='' to prevent conversion of newline characters
+                            # See https://github.com/rg3/youtube-dl/issues/10268
+                            with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile:
+                                subfile.write(sub_info['data'])
+                        except (OSError, IOError):
+                            self.report_error('Cannot write subtitles file ' + sub_filename)
+                            return
                      else:
-                        self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
-                        with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
-                            subfile.write(sub_data)
-                except (OSError, IOError):
-                    self.report_error('Cannot write subtitles file ' + sub_filename)
-                    return
+                        try:
+                            sub_data = ie._request_webpage(
+                                sub_info['url'], info_dict['id'], note=False).read()
+                            with io.open(encodeFilename(sub_filename), 'wb') as subfile:
+                                subfile.write(sub_data)
+                        except (ExtractorError, IOError, OSError, ValueError) as err:
+                            self.report_warning('Unable to download subtitle for "%s": %s' %
+                                                (sub_lang, error_to_compat_str(err)))
+                            continue
  
          if self.params.get('writeinfojson', False):
              infofn = replace_extension(filename, 'info.json', info_dict.get('ext'))
@@ -1604,10 +1861,10 @@ class YoutubeDL(object):
                      def compatible_formats(formats):
                          video, audio = formats
                          # Check extension
-                        video_ext, audio_ext = audio.get('ext'), video.get('ext')
+                        video_ext, audio_ext = video.get('ext'), audio.get('ext')
                          if video_ext and audio_ext:
                              COMPATIBLE_EXTS = (
-                                ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v'),
+                                ('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma'),
                                  ('webm')
                              )
                              for exts in COMPATIBLE_EXTS:
@@ -1636,8 +1893,11 @@ class YoutubeDL(object):
                          for f in requested_formats:
                              new_info = dict(info_dict)
                              new_info.update(f)
-                            fname = self.prepare_filename(new_info)
-                            fname = prepend_extension(fname, 'f%s' % f['format_id'], new_info['ext'])
+                            fname = prepend_extension(
+                                self.prepare_filename(new_info),
+                                'f%s' % f['format_id'], new_info['ext'])
+                            if not ensure_dir_exists(fname):
+                                return
                              downloaded.append(fname)
                              partial_success = dl(fname, new_info)
                              success = success and partial_success
@@ -1704,7 +1964,7 @@ class YoutubeDL(object):
                          info_dict.get('protocol') == 'm3u8' and
                          self.params.get('hls_prefer_native')):
                      if fixup_policy == 'warn':
-                        self.report_warning('%s: malformated aac bitstream.' % (
+                        self.report_warning('%s: malformed AAC bitstream detected.' % (
                              info_dict['id']))
                      elif fixup_policy == 'detect_or_warn':
                          fixup_pp = FFmpegFixupM3u8PP(self)
@@ -1713,7 +1973,7 @@ class YoutubeDL(object):
                              info_dict['__postprocessors'].append(fixup_pp)
                          else:
                              self.report_warning(
-                                '%s: malformated aac bitstream. %s'
+                                '%s: malformed AAC bitstream detected. %s'
                                  % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
                      else:
                          assert fixup_policy in ('ignore', 'never')
@@ -1729,6 +1989,7 @@ class YoutubeDL(object):
          """Download a given list of URLs."""
          outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
          if (len(url_list) > 1 and
+                outtmpl != '-' and
                  '%' not in outtmpl and
                  self.params.get('max_downloads') != 1):
              raise SameFileError(outtmpl)
@@ -1985,11 +2246,20 @@ class YoutubeDL(object):
                  sys.exc_clear()
              except Exception:
                  pass
-        self._write_string('[debug] Python version %s - %s\n' % (
-            platform.python_version(), platform_name()))
+
+        def python_implementation():
+            impl_name = platform.python_implementation()
+            if impl_name == 'PyPy' and hasattr(sys, 'pypy_version_info'):
+                return impl_name + ' version %d.%d.%d' % sys.pypy_version_info[:3]
+            return impl_name
+
+        self._write_string('[debug] Python version %s (%s) - %s\n' % (
+            platform.python_version(), python_implementation(),
+            platform_name()))
  
          exe_versions = FFmpegPostProcessor.get_versions(self)
          exe_versions['rtmpdump'] = rtmpdump_version()
+        exe_versions['phantomjs'] = PhantomJSwrapper._version()
          exe_str = ', '.join(
              '%s %s' % (exe, v)
              for exe, v in sorted(exe_versions.items())
@@ -2026,7 +2296,7 @@ class YoutubeDL(object):
          if opts_cookiefile is None:
              self.cookiejar = compat_cookiejar.CookieJar()
          else:
-            opts_cookiefile = compat_expanduser(opts_cookiefile)
+            opts_cookiefile = expand_path(opts_cookiefile)
              self.cookiejar = compat_cookiejar.MozillaCookieJar(
                  opts_cookiefile)
              if os.access(opts_cookiefile, os.R_OK):