def _build_format_filter(self, filter_spec):
    """Compile one filter expression into a predicate over format dicts.

    Two grammars are tried in turn:

    * numeric: ``<key> <op>[?] <number>[unit]`` with the operators
      ``<``, ``<=``, ``>``, ``>=``, ``=`` and ``!=``;
    * string: ``<key> <op>[?] <text>`` with the operators ``=``, ``!=``,
      ``^=`` (starts with), ``$=`` (ends with) and ``*=`` (contains).

    A ``?`` after the operator makes the predicate truthy for formats in
    which the key is missing or None; without it such formats are rejected.

    Raises ValueError when the expression matches neither grammar or when
    the numeric value cannot be parsed.
    """
    numeric_ops = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    numeric_rex = re.compile(r'''(?x)\s*
        (?P<key>width|height|tbr|abr|vbr|asr|filesize|filesize_approx|fps)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?P<value>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)
        $
        ''' % '|'.join(map(re.escape, numeric_ops.keys())))

    match = numeric_rex.search(filter_spec)
    if match:
        raw_value = match.group('value')
        try:
            wanted = int(raw_value)
        except ValueError:
            # Not a plain integer: try it as a size, first as written and
            # then with an explicit byte suffix appended.
            wanted = parse_filesize(raw_value)
            if wanted is None:
                wanted = parse_filesize(raw_value + 'B')
            if wanted is None:
                raise ValueError(
                    'Invalid value %r in format specification %r' % (
                        raw_value, filter_spec))
        compare = numeric_ops[match.group('op')]
    else:
        string_ops = {
            '=': operator.eq,
            '!=': operator.ne,
            '^=': lambda actual, expected: actual.startswith(expected),
            '$=': lambda actual, expected: actual.endswith(expected),
            '*=': lambda actual, expected: expected in actual,
        }
        string_rex = re.compile(r'''(?x)
            \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id)
            \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
            \s*(?P<value>[a-zA-Z0-9._-]+)
            \s*$
            ''' % '|'.join(map(re.escape, string_ops.keys())))
        match = string_rex.search(filter_spec)
        if not match:
            raise ValueError('Invalid filter specification %r' % filter_spec)
        wanted = match.group('value')
        compare = string_ops[match.group('op')]

    field_name = match.group('key')
    # The matched '?' text (truthy) or None; returned as-is for formats
    # that do not carry the field.
    when_missing = match.group('none_inclusive')

    def _filter(f):
        actual = f.get(field_name)
        if actual is None:
            return when_missing
        return compare(actual, wanted)
    return _filter
+
def _default_format_spec(self, info_dict, download=True):
    """Return the format spec used when the 'format' parameter is not set.

    The result is 'bestvideo+bestaudio/best' unless a single pre-muxed file
    is preferable, in which case the two alternatives are swapped.
    """

    def merger_usable():
        pp = FFmpegMergerPP(self)
        return pp.available and pp.can_merge()

    def single_file_preferred():
        # Nothing is written when simulating or when not downloading, so
        # the merged pair stays the first choice.
        if self.params.get('simulate', False) or not download:
            return False
        # Checked in this order: output template is '-', live stream,
        # merger unusable. The merger is only probed when the cheaper
        # checks did not already decide.
        return (
            self.params.get('outtmpl', DEFAULT_OUTTMPL) == '-'
            or bool(info_dict.get('is_live'))
            or not merger_usable())

    if single_file_preferred():
        return 'best/bestvideo+bestaudio'
    return 'bestvideo+bestaudio/best'
+
def build_format_selector(self, format_spec):
    """Compile a format specification string into a selector function.

    The spec is split into tokens with the Python tokenizer, parsed into a
    tree of FormatSelector tuples and finally turned into nested closures.

    Operators understood by the parser: ',' (several selections), '/'
    (first alternative that yields anything), '+' (merge of two
    selections), '(...)' (grouping) and '[...]' (a filter expression that
    is compiled through self._build_format_filter while the selector is
    being built).

    Returns a callable that takes a ctx dict (the keys read here are
    'formats' and 'incomplete_formats') and produces an iterable of the
    selected formats.

    Raises SyntaxError for a malformed specification.
    """
    def syntax_error(note, start):
        """Build (not raise) a SyntaxError with a caret under column start[1]."""
        message = (
            'Invalid format specification: '
            '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1]))
        return SyntaxError(message)

    # Node kinds stored in FormatSelector.type
    PICKFIRST = 'PICKFIRST'
    MERGE = 'MERGE'
    SINGLE = 'SINGLE'
    GROUP = 'GROUP'
    # type: one of the kinds above; selector: a name, a pair of selectors or
    # a list of selectors depending on the kind; filters: raw filter strings
    FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])

    def _parse_filter(tokens):
        """Consume tokens up to the closing ']' and return their text joined.

        Falls off the end (returning None) if the token stream is exhausted
        before a ']' is seen.
        """
        filter_parts = []
        for type, string, start, _, _ in tokens:
            if type == tokenize.OP and string == ']':
                return ''.join(filter_parts)
            else:
                filter_parts.append(string)

    def _remove_unused_ops(tokens):
        # Remove operators that we don't use and join them with the surrounding strings
        # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
        ALLOWED_OPS = ('/', '+', ',', '(', ')')
        # Pending merged name and its position. NOTE: last_line is never
        # reassigned below, so merged NAME tokens carry None as their line.
        last_string, last_start, last_end, last_line = None, None, None, None
        for type, string, start, end, line in tokens:
            if type == tokenize.OP and string == '[':
                # Flush the pending name before the filter starts
                if last_string:
                    yield tokenize.NAME, last_string, last_start, last_end, last_line
                    last_string = None
                yield type, string, start, end, line
                # everything inside brackets will be handled by _parse_filter
                # (the inner loop advances the same iterator as the outer one)
                for type, string, start, end, line in tokens:
                    yield type, string, start, end, line
                    if type == tokenize.OP and string == ']':
                        break
            elif type == tokenize.OP and string in ALLOWED_OPS:
                if last_string:
                    yield tokenize.NAME, last_string, last_start, last_end, last_line
                    last_string = None
                yield type, string, start, end, line
            elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
                # Names, numbers and any other operator are glued together
                if not last_string:
                    last_string = string
                    last_start = start
                    last_end = end
                else:
                    last_string += string
        if last_string:
            yield tokenize.NAME, last_string, last_start, last_end, last_line

    def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
        """Parse tokens into a list of FormatSelector nodes.

        The inside_* flags tell a nested call which operators end it; such
        a terminating token is pushed back with tokens.restore_last_token()
        so that the caller sees it again. tokens must therefore provide
        that method (see TokenIterator below).
        """
        selectors = []
        current_selector = None
        for type, string, start, _, _ in tokens:
            # ENCODING is only defined in python 3.x
            if type == getattr(tokenize, 'ENCODING', None):
                continue
            elif type in [tokenize.NAME, tokenize.NUMBER]:
                current_selector = FormatSelector(SINGLE, string, [])
            elif type == tokenize.OP:
                if string == ')':
                    if not inside_group:
                        # ')' will be handled by the parentheses group
                        tokens.restore_last_token()
                    break
                elif inside_merge and string in ['/', ',']:
                    tokens.restore_last_token()
                    break
                elif inside_choice and string == ',':
                    tokens.restore_last_token()
                    break
                elif string == ',':
                    if not current_selector:
                        raise syntax_error('"," must follow a format selector', start)
                    selectors.append(current_selector)
                    current_selector = None
                elif string == '/':
                    if not current_selector:
                        raise syntax_error('"/" must follow a format selector', start)
                    first_choice = current_selector
                    # second_choice is a list of selectors
                    second_choice = _parse_format_selection(tokens, inside_choice=True)
                    current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
                elif string == '[':
                    # A filter with nothing in front of it applies to 'best'
                    if not current_selector:
                        current_selector = FormatSelector(SINGLE, 'best', [])
                    format_filter = _parse_filter(tokens)
                    current_selector.filters.append(format_filter)
                elif string == '(':
                    if current_selector:
                        raise syntax_error('Unexpected "("', start)
                    group = _parse_format_selection(tokens, inside_group=True)
                    current_selector = FormatSelector(GROUP, group, [])
                elif string == '+':
                    video_selector = current_selector
                    # audio_selector is a list of selectors
                    audio_selector = _parse_format_selection(tokens, inside_merge=True)
                    if not video_selector or not audio_selector:
                        raise syntax_error('"+" must be between two format selectors', start)
                    current_selector = FormatSelector(MERGE, (video_selector, audio_selector), [])
                else:
                    raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
            elif type == tokenize.ENDMARKER:
                break
        if current_selector:
            selectors.append(current_selector)
        return selectors

    def _build_selector_function(selector):
        """Turn a parsed selector (a node or a list of nodes) into a callable over ctx."""
        if isinstance(selector, list):
            fs = [_build_selector_function(s) for s in selector]

            # A list yields the results of each member in turn; lists carry
            # no filters of their own, hence the early return.
            def selector_function(ctx):
                for f in fs:
                    for format in f(ctx):
                        yield format
            return selector_function
        elif selector.type == GROUP:
            selector_function = _build_selector_function(selector.selector)
        elif selector.type == PICKFIRST:
            fs = [_build_selector_function(s) for s in selector.selector]

            # Returns the results of the first alternative that produced any
            def selector_function(ctx):
                for f in fs:
                    picked_formats = list(f(ctx))
                    if picked_formats:
                        return picked_formats
                return []
        elif selector.type == SINGLE:
            format_spec = selector.selector

            # Generator yielding zero or one format (or all of them for 'all').
            # "best" is taken from the end of the list and "worst" from the
            # start - presumably the caller orders formats worst to best;
            # verify against the caller.
            def selector_function(ctx):
                formats = list(ctx['formats'])
                if not formats:
                    return
                if format_spec == 'all':
                    for f in formats:
                        yield f
                elif format_spec in ['best', 'worst', None]:
                    format_idx = 0 if format_spec == 'worst' else -1
                    # Formats where neither vcodec nor acodec is 'none'
                    audiovideo_formats = [
                        f for f in formats
                        if f.get('vcodec') != 'none' and f.get('acodec') != 'none']
                    if audiovideo_formats:
                        yield audiovideo_formats[format_idx]
                    # for extractors with incomplete formats (audio only (soundcloud)
                    # or video only (imgur)) we will fallback to best/worst
                    # {video,audio}-only format
                    elif ctx['incomplete_formats']:
                        yield formats[format_idx]
                elif format_spec == 'bestaudio':
                    audio_formats = [
                        f for f in formats
                        if f.get('vcodec') == 'none']
                    if audio_formats:
                        yield audio_formats[-1]
                elif format_spec == 'worstaudio':
                    audio_formats = [
                        f for f in formats
                        if f.get('vcodec') == 'none']
                    if audio_formats:
                        yield audio_formats[0]
                elif format_spec == 'bestvideo':
                    video_formats = [
                        f for f in formats
                        if f.get('acodec') == 'none']
                    if video_formats:
                        yield video_formats[-1]
                elif format_spec == 'worstvideo':
                    video_formats = [
                        f for f in formats
                        if f.get('acodec') == 'none']
                    if video_formats:
                        yield video_formats[0]
                else:
                    # Anything else is either one of the known extensions
                    # or an exact format_id; the last match wins
                    extensions = ['mp4', 'flv', 'webm', '3gp', 'm4a', 'mp3', 'ogg', 'aac', 'wav']
                    if format_spec in extensions:
                        filter_f = lambda f: f['ext'] == format_spec
                    else:
                        filter_f = lambda f: f['format_id'] == format_spec
                    matches = list(filter(filter_f, formats))
                    if matches:
                        yield matches[-1]
        elif selector.type == MERGE:
            def _merge(formats_info):
                """Combine a (video, audio) pair into one format dict.

                Returns None after calling self.report_error when the pair
                is not usable.
                """
                format_1, format_2 = [f['format_id'] for f in formats_info]
                # The first format must contain the video and the
                # second the audio
                if formats_info[0].get('vcodec') == 'none':
                    self.report_error('The first format must '
                                      'contain the video, try using '
                                      '"-f %s+%s"' % (format_2, format_1))
                    return
                # Formats must be opposite (video+audio)
                if formats_info[0].get('acodec') == 'none' and formats_info[1].get('acodec') == 'none':
                    self.report_error(
                        'Both formats %s and %s are video-only, you must specify "-f video+audio"'
                        % (format_1, format_2))
                    return
                # The 'merge_output_format' parameter, when set, overrides
                # the extension of the video part
                output_ext = (
                    formats_info[0]['ext']
                    if self.params.get('merge_output_format') is None
                    else self.params['merge_output_format'])
                # Video-related fields come from the first format,
                # audio-related ones from the second
                return {
                    'requested_formats': formats_info,
                    'format': '%s+%s' % (formats_info[0].get('format'),
                                         formats_info[1].get('format')),
                    'format_id': '%s+%s' % (formats_info[0].get('format_id'),
                                            formats_info[1].get('format_id')),
                    'width': formats_info[0].get('width'),
                    'height': formats_info[0].get('height'),
                    'resolution': formats_info[0].get('resolution'),
                    'fps': formats_info[0].get('fps'),
                    'vcodec': formats_info[0].get('vcodec'),
                    'vbr': formats_info[0].get('vbr'),
                    'stretched_ratio': formats_info[0].get('stretched_ratio'),
                    'acodec': formats_info[1].get('acodec'),
                    'abr': formats_info[1].get('abr'),
                    'ext': output_ext,
                }
            video_selector, audio_selector = map(_build_selector_function, selector.selector)

            # Yields one merged entry per (video, audio) combination; each
            # side works on its own deep copy of ctx. The result of _merge
            # is yielded as-is, including None for a rejected pair.
            def selector_function(ctx):
                for pair in itertools.product(
                        video_selector(copy.deepcopy(ctx)), audio_selector(copy.deepcopy(ctx))):
                    yield _merge(pair)

        # Filters attached to this node are compiled once, here
        filters = [self._build_format_filter(f) for f in selector.filters]

        def final_selector(ctx):
            """Apply this node's filters to a deep copy of ctx, then select from it."""
            ctx_copy = copy.deepcopy(ctx)
            for _filter in filters:
                ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats']))
            return selector_function(ctx_copy)
        return final_selector

    # The tokenizer is fed the UTF-8 encoded spec through a readline callable
    stream = io.BytesIO(format_spec.encode('utf-8'))
    try:
        tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
    except tokenize.TokenError:
        raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))

    class TokenIterator(object):
        """Iterator over a token list that can step back by one token."""

        def __init__(self, tokens):
            self.tokens = tokens
            self.counter = 0

        def __iter__(self):
            return self

        def __next__(self):
            if self.counter >= len(self.tokens):
                raise StopIteration()
            value = self.tokens[self.counter]
            self.counter += 1
            return value

        # Alias, presumably so that Python 2's next() protocol works as well
        next = __next__

        def restore_last_token(self):
            self.counter -= 1

    parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
    return _build_selector_function(parsed_selector)
+
def _calc_headers(self, info_dict):
    """Assemble the HTTP headers for the entry described by info_dict.

    Starts from a copy of std_headers, overlays info_dict['http_headers'],
    then sets 'Cookie' from self._calc_cookies() when that returns
    something, and finally fills in 'X-Forwarded-For' from
    info_dict['__x_forwarded_for_ip'] unless the header is already present.
    """
    headers = std_headers.copy()
    headers.update(info_dict.get('http_headers') or {})

    cookie_header = self._calc_cookies(info_dict)
    if cookie_header:
        headers['Cookie'] = cookie_header

    forwarded_ip = info_dict.get('__x_forwarded_for_ip')
    if forwarded_ip and 'X-Forwarded-For' not in headers:
        headers['X-Forwarded-For'] = forwarded_ip

    return headers
+
def _calc_cookies(self, info_dict):
    """Return the 'Cookie' header that self.cookiejar adds for info_dict['url']."""
    # A throwaway request object is built only so that the cookie jar can
    # attach its header to it.
    probe_request = sanitized_Request(info_dict['url'])
    self.cookiejar.add_cookie_header(probe_request)
    return probe_request.get_header('Cookie')
+
def process_video_result(self, info_dict, download=True):
    """Sanitize a single-video extractor result and select its formats.

    info_dict is modified in place: field types are coerced, thumbnails,
    subtitles and formats are normalized, and the chosen format is merged
    into it at the end. When download is true every selected format is
    additionally handed to self.process_info().

    Returns info_dict, or None when one of the listing parameters
    ('list_thumbnails', 'listsubtitles', 'listformats') ends processing
    early.

    Raises ExtractorError when 'id' or 'title' is missing, when there is no
    usable format, or when the requested format is not available.
    """
    assert info_dict.get('_type', 'video') == 'video'

    if 'id' not in info_dict:
        raise ExtractorError('Missing "id" field in extractor result')
    if 'title' not in info_dict:
        raise ExtractorError('Missing "title" field in extractor result')

    def report_force_conversion(field, field_not, conversion):
        self.report_warning(
            '"%s" field is not %s - forcing %s conversion, there is an error in extractor'
            % (field, field_not, conversion))

    def sanitize_string_field(info, string_field):
        # Coerce a non-string value to compat_str, with a warning
        field = info.get(string_field)
        if field is None or isinstance(field, compat_str):
            return
        report_force_conversion(string_field, 'a string', 'string')
        info[string_field] = compat_str(field)

    def sanitize_numeric_fields(info):
        # Coerce every field named in self._NUMERIC_FIELDS, with a warning
        for numeric_field in self._NUMERIC_FIELDS:
            field = info.get(numeric_field)
            if field is None or isinstance(field, compat_numeric_types):
                continue
            report_force_conversion(numeric_field, 'numeric', 'int')
            info[numeric_field] = int_or_none(field)

    sanitize_string_field(info_dict, 'id')
    sanitize_numeric_fields(info_dict)

    if 'playlist' not in info_dict:
        # It isn't part of a playlist
        info_dict['playlist'] = None
        info_dict['playlist_index'] = None

    # A lone 'thumbnail' URL is promoted to a one-element 'thumbnails' list
    thumbnails = info_dict.get('thumbnails')
    if thumbnails is None:
        thumbnail = info_dict.get('thumbnail')
        if thumbnail:
            info_dict['thumbnails'] = thumbnails = [{'url': thumbnail}]
    if thumbnails:
        # Ascending sort; missing values sort first
        thumbnails.sort(key=lambda t: (
            t.get('preference') if t.get('preference') is not None else -1,
            t.get('width') if t.get('width') is not None else -1,
            t.get('height') if t.get('height') is not None else -1,
            t.get('id') if t.get('id') is not None else '', t.get('url')))
        for i, t in enumerate(thumbnails):
            t['url'] = sanitize_url(t['url'])
            if t.get('width') and t.get('height'):
                t['resolution'] = '%dx%d' % (t['width'], t['height'])
            if t.get('id') is None:
                t['id'] = '%d' % i

    if self.params.get('list_thumbnails'):
        self.list_thumbnails(info_dict)
        return

    # Without an explicit 'thumbnail' the last entry of the sorted list is used
    thumbnail = info_dict.get('thumbnail')
    if thumbnail:
        info_dict['thumbnail'] = sanitize_url(thumbnail)
    elif thumbnails:
        info_dict['thumbnail'] = thumbnails[-1]['url']

    if 'display_id' not in info_dict and 'id' in info_dict:
        info_dict['display_id'] = info_dict['id']

    if info_dict.get('upload_date') is None and info_dict.get('timestamp') is not None:
        # Working around out-of-range timestamp values (e.g. negative ones on Windows,
        # see http://bugs.python.org/issue1646728)
        try:
            upload_date = datetime.datetime.utcfromtimestamp(info_dict['timestamp'])
            info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
        except (ValueError, OverflowError, OSError):
            pass

    # Auto generate title fields corresponding to the *_number fields when missing
    # in order to always have clean titles. This is very common for TV series.
    for field in ('chapter', 'season', 'episode'):
        if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
            info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])

    # Sanitize subtitle URLs and derive a missing 'ext' from the URL
    for cc_kind in ('subtitles', 'automatic_captions'):
        cc = info_dict.get(cc_kind)
        if cc:
            for _, subtitle in cc.items():
                for subtitle_format in subtitle:
                    if subtitle_format.get('url'):
                        subtitle_format['url'] = sanitize_url(subtitle_format['url'])
                    if subtitle_format.get('ext') is None:
                        subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()

    automatic_captions = info_dict.get('automatic_captions')
    subtitles = info_dict.get('subtitles')

    if self.params.get('listsubtitles', False):
        if 'automatic_captions' in info_dict:
            self.list_subtitles(
                info_dict['id'], automatic_captions, 'automatic captions')
        self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
        return

    info_dict['requested_subtitles'] = self.process_subtitles(
        info_dict['id'], subtitles, automatic_captions)

    # We now pick which formats have to be downloaded
    if info_dict.get('formats') is None:
        # There's only one format available
        formats = [info_dict]
    else:
        formats = info_dict['formats']

    if not formats:
        raise ExtractorError('No video formats found!')

    def is_wellformed(f):
        url = f.get('url')
        if not url:
            self.report_warning(
                '"url" field is missing or empty - skipping format, '
                'there is an error in extractor')
            return False
        if isinstance(url, bytes):
            sanitize_string_field(f, 'url')
        return True

    # Filter out malformed formats for better extraction robustness
    formats = list(filter(is_wellformed, formats))

    # Every format may have been rejected just above; without this second
    # check the formats[0] access further down fails with an IndexError
    # instead of a proper extractor error.
    if not formats:
        raise ExtractorError('No video formats found!')

    formats_dict = {}

    # We check that all the formats have the format and format_id fields
    for i, format in enumerate(formats):
        sanitize_string_field(format, 'format_id')
        sanitize_numeric_fields(format)
        format['url'] = sanitize_url(format['url'])
        if not format.get('format_id'):
            format['format_id'] = compat_str(i)
        else:
            # Sanitize format_id from characters used in format selector expression
            format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id'])
        format_id = format['format_id']
        if format_id not in formats_dict:
            formats_dict[format_id] = []
        formats_dict[format_id].append(format)

    # Make sure all formats have unique format_id
    for format_id, ambiguous_formats in formats_dict.items():
        if len(ambiguous_formats) > 1:
            for i, format in enumerate(ambiguous_formats):
                format['format_id'] = '%s-%d' % (format_id, i)

    for i, format in enumerate(formats):
        if format.get('format') is None:
            format['format'] = '{id} - {res}{note}'.format(
                id=format['format_id'],
                res=self.format_resolution(format),
                note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
            )
        # Automatically determine file extension if missing
        if format.get('ext') is None:
            format['ext'] = determine_ext(format['url']).lower()
        # Automatically determine protocol if missing (useful for format
        # selection purposes)
        if format.get('protocol') is None:
            format['protocol'] = determine_protocol(format)
        # Add HTTP headers, so that external programs can use them from the
        # json output
        full_format_info = info_dict.copy()
        full_format_info.update(format)
        format['http_headers'] = self._calc_headers(full_format_info)
    # Remove private housekeeping stuff
    if '__x_forwarded_for_ip' in info_dict:
        del info_dict['__x_forwarded_for_ip']

    # TODO Central sorting goes here

    if formats[0] is not info_dict:
        # only set the 'formats' fields if the original info_dict list them
        # otherwise we end up with a circular reference, the first (and unique)
        # element in the 'formats' field in info_dict is info_dict itself,
        # which can't be exported to json
        info_dict['formats'] = formats
    if self.params.get('listformats'):
        self.list_formats(info_dict)
        return

    req_format = self.params.get('format')
    if req_format is None:
        req_format = self._default_format_spec(info_dict, download=download)
        if self.params.get('verbose'):
            self.to_stdout('[debug] Default format spec: %s' % req_format)

    format_selector = self.build_format_selector(req_format)

    # While in format selection we may need to have an access to the original
    # format set in order to calculate some metrics or do some processing.
    # For now we need to be able to guess whether original formats provided
    # by extractor are incomplete or not (i.e. whether extractor provides only
    # video-only or audio-only formats) for proper formats selection for
    # extractors with such incomplete formats (see
    # https://github.com/rg3/youtube-dl/pull/5556).
    # Since formats may be filtered during format selection and may not match
    # the original formats the results may be incorrect. Thus original formats
    # or pre-calculated metrics should be passed to format selection routines
    # as well.
    # We will pass a context object containing all necessary additional data
    # instead of just formats.
    # This fixes incorrect format selection issue (see
    # https://github.com/rg3/youtube-dl/issues/10083).
    incomplete_formats = (
        # All formats are video-only or
        all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) or
        # all formats are audio-only
        all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))

    ctx = {
        'formats': formats,
        'incomplete_formats': incomplete_formats,
    }

    formats_to_download = list(format_selector(ctx))
    if not formats_to_download:
        raise ExtractorError('requested format not available',
                             expected=True)

    if download:
        if len(formats_to_download) > 1:
            self.to_screen('[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
        for format in formats_to_download:
            # Each selected format is processed on its own copy of info_dict
            new_info = dict(info_dict)
            new_info.update(format)
            self.process_info(new_info)
    # We update the info dict with the best quality format (backwards compatibility)
    info_dict.update(formats_to_download[-1])
    return info_dict
+
def process_subtitles(self, video_id, normal_subtitles, automatic_captions):
    """Select the requested subtitles and their format.

    normal_subtitles and automatic_captions map a language code to a list
    of subtitle format dicts (each is read for its 'ext' key); either may
    be None. Regular subtitles are considered only when the
    'writesubtitles' parameter is set, automatic captions only when
    'writeautomaticsub' is set, and a regular track wins over an automatic
    one for the same language.

    Languages come from 'allsubtitles' or 'subtitleslangs'; otherwise 'en'
    is used when available, else the first available language. The format
    is chosen by walking the '/'-separated 'subtitlesformat' preference
    ('best' means the last entry of the list); if nothing matches, the
    last entry is used and a warning is reported.

    Returns a dict mapping each selected language to its chosen format
    dict, or None when nothing is available. Languages that are missing or
    have no formats are skipped with a warning.
    """
    available_subs = {}
    if normal_subtitles and self.params.get('writesubtitles'):
        available_subs.update(normal_subtitles)
    if automatic_captions and self.params.get('writeautomaticsub'):
        for lang, cap_info in automatic_captions.items():
            # Never override a regular subtitle track
            available_subs.setdefault(lang, cap_info)

    # Both sources above are gated on their parameter, so this is empty
    # whenever neither parameter is set.
    if not available_subs:
        return None

    if self.params.get('allsubtitles', False):
        requested_langs = list(available_subs)
    elif self.params.get('subtitleslangs', False):
        requested_langs = self.params.get('subtitleslangs')
    elif 'en' in available_subs:
        requested_langs = ['en']
    else:
        requested_langs = [next(iter(available_subs))]

    formats_query = self.params.get('subtitlesformat', 'best')
    formats_preference = formats_query.split('/') if formats_query else []
    subs = {}
    for lang in requested_langs:
        formats = available_subs.get(lang)
        # An empty list is treated like a missing language; indexing it
        # below would otherwise raise an IndexError.
        if not formats:
            self.report_warning('%s subtitles not available for %s' % (lang, video_id))
            continue
        for ext in formats_preference:
            if ext == 'best':
                f = formats[-1]
                break
            matches = [sub for sub in formats if sub['ext'] == ext]
            if matches:
                f = matches[-1]
                break
        else:
            # No preference matched (or none was given)
            f = formats[-1]
            self.report_warning(
                'No subtitle format found matching "%s" for language %s, '
                'using %s' % (formats_query, lang, f['ext']))
        subs[lang] = f
    return subs
+