+ def _get_tfa_info(self):
+ """
+ Get the two-factor authentication info
+ TODO - asking the user will be required for sms/phone verify
+ currently just uses the command line option
+ If there's no info available, return None
+ """
+ if self._downloader is None:
+ return None
+ downloader_params = self._downloader.params
+
+ if downloader_params.get('twofactor', None) is not None:
+ return downloader_params['twofactor']
+
+ return None
+
+ # Helper functions for extracting OpenGraph info
+ @staticmethod
+ def _og_regexes(prop):
+ content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
+ property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
+ template = r'<meta[^>]+?%s[^>]+?%s'
+ return [
+ template % (property_re, content_re),
+ template % (content_re, property_re),
+ ]
+
+ def _og_search_property(self, prop, html, name=None, **kargs):
+ if name is None:
+ name = 'OpenGraph %s' % prop
+ escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
+ if escaped is None:
+ return None
+ return unescapeHTML(escaped)
+
+ def _og_search_thumbnail(self, html, **kargs):
+ return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
+
+ def _og_search_description(self, html, **kargs):
+ return self._og_search_property('description', html, fatal=False, **kargs)
+
+ def _og_search_title(self, html, **kargs):
+ return self._og_search_property('title', html, **kargs)
+
+ def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
+ regexes = self._og_regexes('video') + self._og_regexes('video:url')
+ if secure:
+ regexes = self._og_regexes('video:secure_url') + regexes
+ return self._html_search_regex(regexes, html, name, **kargs)
+
+ def _og_search_url(self, html, **kargs):
+ return self._og_search_property('url', html, **kargs)
+
+ def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
+ if display_name is None:
+ display_name = name
+ return self._html_search_regex(
+ r'''(?isx)<meta
+ (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
+ [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name),
+ html, display_name, fatal=fatal, group='content', **kwargs)
+
+ def _dc_search_uploader(self, html):
+ return self._html_search_meta('dc.creator', html, 'uploader')
+
+ def _rta_search(self, html):
+ # See http://www.rtalabel.org/index.php?content=howtofaq#single
+ if re.search(r'(?ix)<meta\s+name="rating"\s+'
+ r' content="RTA-5042-1996-1400-1577-RTA"',
+ html):
+ return 18
+ return 0
+
+ def _media_rating_search(self, html):
+ # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
+ rating = self._html_search_meta('rating', html)
+
+ if not rating:
+ return None
+
+ RATING_TABLE = {
+ 'safe for kids': 0,
+ 'general': 8,
+ '14 years': 14,
+ 'mature': 17,
+ 'restricted': 19,
+ }
+ return RATING_TABLE.get(rating.lower(), None)
+
+ def _family_friendly_search(self, html):
+ # See http://schema.org/VideoObject
+ family_friendly = self._html_search_meta('isFamilyFriendly', html)
+
+ if not family_friendly:
+ return None
+
+ RATING_TABLE = {
+ '1': 0,
+ 'true': 0,
+ '0': 18,
+ 'false': 18,
+ }
+ return RATING_TABLE.get(family_friendly.lower(), None)
+
+ def _twitter_search_player(self, html):
+ return self._html_search_meta('twitter:player', html,
+ 'twitter card player')
+
+ @staticmethod
+ def _hidden_inputs(html):
+ return dict([
+ (input.group('name'), input.group('value')) for input in re.finditer(
+ r'''(?x)
+ <input\s+
+ type=(?P<q_hidden>["\'])hidden(?P=q_hidden)\s+
+ name=(?P<q_name>["\'])(?P<name>.+?)(?P=q_name)\s+
+ (?:id=(?P<q_id>["\']).+?(?P=q_id)\s+)?
+ value=(?P<q_value>["\'])(?P<value>.*?)(?P=q_value)
+ ''', html)
+ ])
+
+ def _form_hidden_inputs(self, form_id, html):
+ form = self._search_regex(
+ r'(?s)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
+ html, '%s form' % form_id, group='form')
+ return self._hidden_inputs(form)
+
+ def _sort_formats(self, formats, field_preference=None):
+ if not formats:
+ raise ExtractorError('No video formats found')
+
+ def _formats_key(f):
+ # TODO remove the following workaround
+ from ..utils import determine_ext
+ if not f.get('ext') and 'url' in f:
+ f['ext'] = determine_ext(f['url'])
+
+ if isinstance(field_preference, (list, tuple)):
+ return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
+
+ preference = f.get('preference')
+ if preference is None:
+ proto = f.get('protocol')
+ if proto is None:
+ proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
+
+ preference = 0 if proto in ['http', 'https'] else -0.1
+ if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
+ preference -= 0.5
+
+ if f.get('vcodec') == 'none': # audio only
+ if self._downloader.params.get('prefer_free_formats'):
+ ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
+ else:
+ ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
+ ext_preference = 0
+ try:
+ audio_ext_preference = ORDER.index(f['ext'])
+ except ValueError:
+ audio_ext_preference = -1
+ else:
+ if self._downloader.params.get('prefer_free_formats'):
+ ORDER = ['flv', 'mp4', 'webm']
+ else:
+ ORDER = ['webm', 'flv', 'mp4']
+ try:
+ ext_preference = ORDER.index(f['ext'])
+ except ValueError:
+ ext_preference = -1
+ audio_ext_preference = 0
+
+ return (
+ preference,
+ f.get('language_preference') if f.get('language_preference') is not None else -1,
+ f.get('quality') if f.get('quality') is not None else -1,
+ f.get('tbr') if f.get('tbr') is not None else -1,
+ f.get('filesize') if f.get('filesize') is not None else -1,
+ f.get('vbr') if f.get('vbr') is not None else -1,
+ f.get('height') if f.get('height') is not None else -1,
+ f.get('width') if f.get('width') is not None else -1,
+ ext_preference,
+ f.get('abr') if f.get('abr') is not None else -1,
+ audio_ext_preference,
+ f.get('fps') if f.get('fps') is not None else -1,
+ f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
+ f.get('source_preference') if f.get('source_preference') is not None else -1,
+ f.get('format_id') if f.get('format_id') is not None else '',
+ )
+ formats.sort(key=_formats_key)
+
+ def _check_formats(self, formats, video_id):
+ if formats:
+ formats[:] = filter(
+ lambda f: self._is_valid_url(
+ f['url'], video_id,
+ item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
+ formats)
+
+ def _is_valid_url(self, url, video_id, item='video'):
+ url = self._proto_relative_url(url, scheme='http:')
+ # For now assume non HTTP(S) URLs always valid
+ if not (url.startswith('http://') or url.startswith('https://')):
+ return True
+ try:
+ self._request_webpage(url, video_id, 'Checking %s URL' % item)
+ return True
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError):
+ self.to_screen(
+ '%s: %s URL is invalid, skipping' % (video_id, item))
+ return False
+ raise
+
+ def http_scheme(self):
+ """ Either "http:" or "https:", depending on the user's preferences """
+ return (
+ 'http:'
+ if self._downloader.params.get('prefer_insecure', False)
+ else 'https:')
+
+ def _proto_relative_url(self, url, scheme=None):
+ if url is None:
+ return url
+ if url.startswith('//'):
+ if scheme is None:
+ scheme = self.http_scheme()
+ return scheme + url
+ else:
+ return url
+
+ def _sleep(self, timeout, video_id, msg_template=None):
+ if msg_template is None:
+ msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
+ msg = msg_template % {'video_id': video_id, 'timeout': timeout}
+ self.to_screen(msg)
+ time.sleep(timeout)
+
+ def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
+ transform_source=lambda s: fix_xml_ampersands(s).strip()):
+ manifest = self._download_xml(
+ manifest_url, video_id, 'Downloading f4m manifest',
+ 'Unable to download f4m manifest',
+ # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
+ # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
+ transform_source=transform_source)
+
+ formats = []
+ manifest_version = '1.0'
+ media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
+ if not media_nodes:
+ manifest_version = '2.0'
+ media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
+ for i, media_el in enumerate(media_nodes):
+ if manifest_version == '2.0':
+ media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
+ if not media_url:
+ continue
+ manifest_url = (
+ media_url if media_url.startswith('http://') or media_url.startswith('https://')
+ else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url))
+ # If media_url is itself a f4m manifest do the recursive extraction
+ # since bitrates in parent manifest (this one) and media_url manifest
+ # may differ leading to inability to resolve the format by requested
+ # bitrate in f4m downloader
+ if determine_ext(manifest_url) == 'f4m':
+ formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id))
+ continue
+ tbr = int_or_none(media_el.attrib.get('bitrate'))
+ formats.append({
+ 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
+ 'url': manifest_url,
+ 'ext': 'flv',
+ 'tbr': tbr,
+ 'width': int_or_none(media_el.attrib.get('width')),
+ 'height': int_or_none(media_el.attrib.get('height')),
+ 'preference': preference,
+ })
+ self._sort_formats(formats)
+
+ return formats
+
+ def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
+ entry_protocol='m3u8', preference=None,
+ m3u8_id=None, note=None, errnote=None,
+ fatal=True):
+
+ formats = [{
+ 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
+ 'url': m3u8_url,
+ 'ext': ext,
+ 'protocol': 'm3u8',
+ 'preference': preference - 1 if preference else -1,
+ 'resolution': 'multiple',
+ 'format_note': 'Quality selection URL',
+ }]
+
+ format_url = lambda u: (
+ u
+ if re.match(r'^https?://', u)
+ else compat_urlparse.urljoin(m3u8_url, u))
+
+ m3u8_doc = self._download_webpage(
+ m3u8_url, video_id,
+ note=note or 'Downloading m3u8 information',
+ errnote=errnote or 'Failed to download m3u8 information',
+ fatal=fatal)
+ if m3u8_doc is False:
+ return m3u8_doc
+ last_info = None
+ last_media = None
+ kv_rex = re.compile(
+ r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
+ for line in m3u8_doc.splitlines():
+ if line.startswith('#EXT-X-STREAM-INF:'):
+ last_info = {}
+ for m in kv_rex.finditer(line):
+ v = m.group('val')
+ if v.startswith('"'):
+ v = v[1:-1]
+ last_info[m.group('key')] = v
+ elif line.startswith('#EXT-X-MEDIA:'):
+ last_media = {}
+ for m in kv_rex.finditer(line):
+ v = m.group('val')
+ if v.startswith('"'):
+ v = v[1:-1]
+ last_media[m.group('key')] = v
+ elif line.startswith('#') or not line.strip():
+ continue
+ else:
+ if last_info is None:
+ formats.append({'url': format_url(line)})
+ continue
+ tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
+ format_id = []
+ if m3u8_id:
+ format_id.append(m3u8_id)
+ last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
+ format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
+ f = {
+ 'format_id': '-'.join(format_id),
+ 'url': format_url(line.strip()),
+ 'tbr': tbr,
+ 'ext': ext,
+ 'protocol': entry_protocol,
+ 'preference': preference,
+ }
+ codecs = last_info.get('CODECS')
+ if codecs:
+ # TODO: looks like video codec is not always necessarily goes first
+ va_codecs = codecs.split(',')
+ if va_codecs[0]:
+ f['vcodec'] = va_codecs[0].partition('.')[0]
+ if len(va_codecs) > 1 and va_codecs[1]:
+ f['acodec'] = va_codecs[1].partition('.')[0]
+ resolution = last_info.get('RESOLUTION')
+ if resolution:
+ width_str, height_str = resolution.split('x')
+ f['width'] = int(width_str)
+ f['height'] = int(height_str)
+ if last_media is not None:
+ f['m3u8_media'] = last_media
+ last_media = None
+ formats.append(f)
+ last_info = {}
+ self._sort_formats(formats)
+ return formats
+
+ # TODO: improve extraction
+ def _extract_smil_formats(self, smil_url, video_id, fatal=True):
+ smil = self._download_xml(
+ smil_url, video_id, 'Downloading SMIL file',
+ 'Unable to download SMIL file', fatal=fatal)
+ if smil is False:
+ assert not fatal
+ return []
+
+ base = smil.find('./head/meta').get('base')
+
+ formats = []
+ rtmp_count = 0
+ if smil.findall('./body/seq/video'):
+ video = smil.findall('./body/seq/video')[0]
+ fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count)
+ formats.extend(fmts)
+ else:
+ for video in smil.findall('./body/switch/video'):
+ fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count)
+ formats.extend(fmts)
+
+ self._sort_formats(formats)
+
+ return formats
+
+ def _parse_smil_video(self, video, video_id, base, rtmp_count):
+ src = video.get('src')
+ if not src:
+ return [], rtmp_count
+ bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
+ width = int_or_none(video.get('width'))
+ height = int_or_none(video.get('height'))
+ proto = video.get('proto')
+ if not proto:
+ if base:
+ if base.startswith('rtmp'):
+ proto = 'rtmp'
+ elif base.startswith('http'):
+ proto = 'http'
+ ext = video.get('ext')
+ if proto == 'm3u8':
+ return self._extract_m3u8_formats(src, video_id, ext), rtmp_count
+ elif proto == 'rtmp':
+ rtmp_count += 1
+ streamer = video.get('streamer') or base
+ return ([{
+ 'url': streamer,
+ 'play_path': src,
+ 'ext': 'flv',
+ 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
+ 'tbr': bitrate,
+ 'width': width,
+ 'height': height,
+ }], rtmp_count)
+ elif proto.startswith('http'):
+ return ([{
+ 'url': base + src,
+ 'ext': ext or 'flv',
+ 'tbr': bitrate,
+ 'width': width,
+ 'height': height,
+ }], rtmp_count)
+
+ def _live_title(self, name):
+ """ Generate the title for a live video """
+ now = datetime.datetime.now()
+ now_str = now.strftime("%Y-%m-%d %H:%M")
+ return name + ' ' + now_str
+
+ def _int(self, v, name, fatal=False, **kwargs):
+ res = int_or_none(v, **kwargs)
+ if 'get_attr' in kwargs:
+ print(getattr(v, kwargs['get_attr']))
+ if res is None:
+ msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
+ if fatal:
+ raise ExtractorError(msg)
+ else:
+ self._downloader.report_warning(msg)
+ return res
+
+ def _float(self, v, name, fatal=False, **kwargs):
+ res = float_or_none(v, **kwargs)
+ if res is None:
+ msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
+ if fatal:
+ raise ExtractorError(msg)
+ else:
+ self._downloader.report_warning(msg)
+ return res
+
+ def _set_cookie(self, domain, name, value, expire_time=None):
+ cookie = compat_cookiejar.Cookie(
+ 0, name, value, None, None, domain, None,
+ None, '/', True, False, expire_time, '', None, None, None)
+ self._downloader.cookiejar.set_cookie(cookie)
+
+ def get_testcases(self, include_onlymatching=False):
+ t = getattr(self, '_TEST', None)
+ if t:
+ assert not hasattr(self, '_TESTS'), \
+ '%s has _TEST and _TESTS' % type(self).__name__
+ tests = [t]
+ else:
+ tests = getattr(self, '_TESTS', [])
+ for t in tests:
+ if not include_onlymatching and t.get('only_matching', False):
+ continue
+ t['name'] = type(self).__name__[:-len('IE')]
+ yield t
+
+ def is_suitable(self, age_limit):
+ """ Test whether the extractor is generally suitable for the given
+ age limit (i.e. pornographic sites are not, all others usually are) """
+
+ any_restricted = False
+ for tc in self.get_testcases(include_onlymatching=False):
+ if 'playlist' in tc:
+ tc = tc['playlist'][0]
+ is_restricted = age_restricted(
+ tc.get('info_dict', {}).get('age_limit'), age_limit)
+ if not is_restricted:
+ return True
+ any_restricted = any_restricted or is_restricted
+ return not any_restricted
+
+ def extract_subtitles(self, *args, **kwargs):
+ if (self._downloader.params.get('writesubtitles', False) or
+ self._downloader.params.get('listsubtitles')):
+ return self._get_subtitles(*args, **kwargs)
+ return {}
+
+ def _get_subtitles(self, *args, **kwargs):
+ raise NotImplementedError("This method must be implemented by subclasses")
+
+ def extract_automatic_captions(self, *args, **kwargs):
+ if (self._downloader.params.get('writeautomaticsub', False) or
+ self._downloader.params.get('listsubtitles')):
+ return self._get_automatic_captions(*args, **kwargs)
+ return {}
+
+ def _get_automatic_captions(self, *args, **kwargs):
+ raise NotImplementedError("This method must be implemented by subclasses")
+
+