+ manifest_url = (
+ url_or_none(try_get(
+ player_response,
+ lambda x: x['streamingData']['hlsManifestUrl'],
+ compat_str))
+ or url_or_none(try_get(
+ video_info, lambda x: x['hlsvp'][0], compat_str)))
+ if manifest_url:
+ formats = []
+ m3u8_formats = self._extract_m3u8_formats(
+ manifest_url, video_id, 'mp4', fatal=False)
+ for a_format in m3u8_formats:
+ itag = self._search_regex(
+ r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
+ if itag:
+ a_format['format_id'] = itag
+ if itag in self._formats:
+ dct = self._formats[itag].copy()
+ dct.update(a_format)
+ a_format = dct
+ a_format['player_url'] = player_url
+ # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
+ a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
+ formats.append(a_format)
+ else:
+ error_message = clean_html(video_info.get('reason', [None])[0])
+ if not error_message:
+ error_message = extract_unavailable_message()
+ if error_message:
+ raise ExtractorError(error_message, expected=True)
+ raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
+
+ # uploader
+ video_uploader = try_get(
+ video_info, lambda x: x['author'][0],
+ compat_str) or str_or_none(video_details.get('author'))
+ if video_uploader:
+ video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
+ else:
+ self._downloader.report_warning('unable to extract uploader name')
+
+ # uploader_id
+ video_uploader_id = None
+ video_uploader_url = None
+ mobj = re.search(
+ r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">',
+ video_webpage)
+ if mobj is not None:
+ video_uploader_id = mobj.group('uploader_id')
+ video_uploader_url = mobj.group('uploader_url')
+ else:
+ self._downloader.report_warning('unable to extract uploader nickname')
+
+ channel_id = (
+ str_or_none(video_details.get('channelId'))
+ or self._html_search_meta(
+ 'channelId', video_webpage, 'channel id', default=None)
+ or self._search_regex(
+ r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1',
+ video_webpage, 'channel id', default=None, group='id'))
+ channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
+
+ # thumbnail image
+ # We try first to get a high quality image:
+ m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
+ video_webpage, re.DOTALL)
+ if m_thumb is not None:
+ video_thumbnail = m_thumb.group(1)
+ elif 'thumbnail_url' not in video_info:
+ self._downloader.report_warning('unable to extract video thumbnail')
+ video_thumbnail = None
+ else: # don't panic if we can't find it
+ video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])
+
+ # upload date
+ upload_date = self._html_search_meta(
+ 'datePublished', video_webpage, 'upload date', default=None)
+ if not upload_date:
+ upload_date = self._search_regex(
+ [r'(?s)id="eow-date.*?>(.*?)</span>',
+ r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
+ video_webpage, 'upload date', default=None)
+ upload_date = unified_strdate(upload_date)
+
+ video_license = self._html_search_regex(
+ r'<h4[^>]+class="title"[^>]*>\s*License\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li',
+ video_webpage, 'license', default=None)
+
+ m_music = re.search(
+ r'''(?x)
+ <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
+ <ul[^>]*>\s*
+ <li>(?P<title>.+?)
+ by (?P<creator>.+?)
+ (?:
+ \(.+?\)|
+ <a[^>]*
+ (?:
+ \bhref=["\']/red[^>]*>| # drop possible
+ >\s*Listen ad-free with YouTube Red # YouTube Red ad
+ )
+ .*?
+ )?</li
+ ''',
+ video_webpage)
+ if m_music:
+ video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
+ video_creator = clean_html(m_music.group('creator'))
+ else:
+ video_alt_title = video_creator = None
+
+ def extract_meta(field):
+ return self._html_search_regex(
+ r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
+ video_webpage, field, default=None)
+
+ track = extract_meta('Song')
+ artist = extract_meta('Artist')
+ album = extract_meta('Album')
+
+ # Youtube Music Auto-generated description
+ release_date = release_year = None
+ if video_description:
+ mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description)
+ if mobj:
+ if not track:
+ track = mobj.group('track').strip()
+ if not artist:
+ artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·'))
+ if not album:
+ album = mobj.group('album'.strip())
+ release_year = mobj.group('release_year')
+ release_date = mobj.group('release_date')
+ if release_date:
+ release_date = release_date.replace('-', '')
+ if not release_year:
+ release_year = int(release_date[:4])
+ if release_year:
+ release_year = int(release_year)
+
+ m_episode = re.search(
+ r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
+ video_webpage)
+ if m_episode:
+ series = unescapeHTML(m_episode.group('series'))
+ season_number = int(m_episode.group('season'))
+ episode_number = int(m_episode.group('episode'))
+ else:
+ series = season_number = episode_number = None
+
+ m_cat_container = self._search_regex(
+ r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
+ video_webpage, 'categories', default=None)
+ if m_cat_container:
+ category = self._html_search_regex(
+ r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
+ default=None)
+ video_categories = None if category is None else [category]
+ else:
+ video_categories = None
+
+ video_tags = [
+ unescapeHTML(m.group('content'))
+ for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
+
+ def _extract_count(count_name):
+ return str_to_int(self._search_regex(
+ r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>'
+ % re.escape(count_name),
+ video_webpage, count_name, default=None))
+
+ like_count = _extract_count('like')
+ dislike_count = _extract_count('dislike')
+
+ if view_count is None:
+ view_count = str_to_int(self._search_regex(
+ r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage,
+ 'view count', default=None))
+
+ average_rating = (
+ float_or_none(video_details.get('averageRating'))
+ or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0])))
+
+ # subtitles
+ video_subtitles = self.extract_subtitles(video_id, video_webpage)
+ automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
+
+ video_duration = try_get(
+ video_info, lambda x: int_or_none(x['length_seconds'][0]))
+ if not video_duration:
+ video_duration = int_or_none(video_details.get('lengthSeconds'))
+ if not video_duration:
+ video_duration = parse_duration(self._html_search_meta(
+ 'duration', video_webpage, 'video duration'))
+
+ # annotations
+ video_annotations = None
+ if self._downloader.params.get('writeannotations', False):
+ video_annotations = self._extract_annotations(video_id)
+
+ chapters = self._extract_chapters(description_original, video_duration)