+ # Playlist IDs: one of the known prefixes followed by at least ten
+ # characters drawn from letters, digits, '-' and '_'.
_PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}'
+ # Extra HTTP headers passed to the 'more' (continuation) page requests.
+ # NOTE(review): presumably these identify the request as coming from a
+ # specific YouTube client name/version -- confirm the exact meaning.
+ _YOUTUBE_CLIENT_HEADERS = {
+ 'x-youtube-client-name': '1',
+ 'x-youtube-client-version': '1.20200609.04.02',
+ }
+
def _set_language(self):
+ """Set the PREF cookie on .youtube.com, expiring in about two months."""
+ # NOTE(review): 'hl=en' presumably requests English pages; the meaning
+ # of the f1/f6 flags is not visible from here -- confirm.
self._set_cookie(
- '.youtube.com', 'PREF', 'f1=50000000&hl=en',
+ '.youtube.com', 'PREF', 'f1=50000000&f6=8&hl=en',
# YouTube sets the expire time to about two months
expire_time=time.time() + 2 * 30 * 24 * 3600)
# Downloading page may result in intermittent 5xx HTTP error
# that is usually worked around with a retry
more = self._download_json(
- 'https://youtube.com/%s' % mobj.group('more'), playlist_id,
+ 'https://www.youtube.com/%s' % mobj.group('more'), playlist_id,
'Downloading page #%s%s'
% (page_num, ' (retry #%d)' % count if count else ''),
- transform_source=uppercase_escape)
+ transform_source=uppercase_escape,
+ headers=self._YOUTUBE_CLIENT_HEADERS)
break
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503):
(?:www\.)?invidious\.drycat\.fr/|
(?:www\.)?tube\.poal\.co/|
(?:www\.)?vid\.wxzm\.sx/|
+ (?:www\.)?yewtu\.be/|
(?:www\.)?yt\.elukerio\.org/|
(?:www\.)?yt\.lelux\.fi/|
+ (?:www\.)?invidious\.ggc-project\.de/|
+ (?:www\.)?yt\.maisputain\.ovh/|
+ (?:www\.)?invidious\.13ad\.de/|
+ (?:www\.)?invidious\.toot\.koeln/|
+ (?:www\.)?invidious\.fdn\.fr/|
+ (?:www\.)?watch\.nettohikari\.com/|
(?:www\.)?kgg2m7yk5aybusll\.onion/|
(?:www\.)?qklhadlycap4cnod\.onion/|
(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
+ (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
(?:.*?\#/)? # handle anchor (#/) redirect urls
(?: # the various things that can precede the ID:
'params': {
'skip_download': True,
},
- }
+ },
+ {
+ # empty description results in an empty string
+ 'url': 'https://www.youtube.com/watch?v=x41yOUIvK2k',
+ 'info_dict': {
+ 'id': 'x41yOUIvK2k',
+ 'ext': 'mp4',
+ 'title': 'IMG 3456',
+ 'description': '',
+ 'upload_date': '20170613',
+ 'uploader_id': 'ElevageOrVert',
+ 'uploader': 'ElevageOrVert',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
]
def __init__(self, *args, **kwargs):
funcname = self._search_regex(
(r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P<sig>[a-zA-Z0-9$]+)\(',
- r'\b(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
+ r'(?:\b|[^a-zA-Z0-9$])(?P<sig>[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
r'(?P<sig>[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)',
# Obsolete patterns
r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
video_id = mobj.group(2)
return video_id
+ def _extract_chapters_from_json(self, webpage, video_id, duration):
+ """Build a chapter list from the player JSON embedded in the watch page.
+
+ Looks up the RELATED_PLAYER_ARGS object in `webpage`, decodes its
+ 'watch_next_response' string as JSON and reads the chapter entries
+ from the chaptered player bar renderer.
+
+ Returns a list of dicts with 'start_time', 'end_time' and 'title'
+ keys, or None when the page or any intermediate structure is missing.
+ `duration` is used as the end time of the last chapter.
+ """
+ if not webpage:
+ return
+ # Missing player args fall back to '{}', which parses to an empty
+ # dict and triggers the early return below.
+ player = self._parse_json(
+ self._search_regex(
+ r'RELATED_PLAYER_ARGS["\']\s*:\s*({.+})\s*,?\s*\n', webpage,
+ 'player args', default='{}'),
+ video_id, fatal=False)
+ if not player or not isinstance(player, dict):
+ return
+ # 'watch_next_response' is itself a JSON document stored as a string,
+ # so it needs a second parsing pass.
+ watch_next_response = player.get('watch_next_response')
+ if not isinstance(watch_next_response, compat_str):
+ return
+ response = self._parse_json(watch_next_response, video_id, fatal=False)
+ if not response or not isinstance(response, dict):
+ return
+ # NOTE(review): 'decoratedPlayerBarRenderer' appears twice in the path;
+ # presumably this mirrors the nesting of the response -- confirm.
+ chapters_list = try_get(
+ response,
+ lambda x: x['playerOverlays']
+ ['playerOverlayRenderer']
+ ['decoratedPlayerBarRenderer']
+ ['decoratedPlayerBarRenderer']
+ ['playerBar']
+ ['chapteredPlayerBarRenderer']
+ ['chapters'],
+ list)
+ if not chapters_list:
+ return
+
+ # Start of a chapter taken from its 'timeRangeStartMillis' value with
+ # scale=1000 (presumably yielding seconds, and None when the value is
+ # absent or not an int -- confirm float_or_none/try_get semantics).
+ def chapter_time(chapter):
+ return float_or_none(
+ try_get(
+ chapter,
+ lambda x: x['chapterRenderer']['timeRangeStartMillis'],
+ int),
+ scale=1000)
+ chapters = []
+ # Enumeration starts at 1, so next_num is the list index of the
+ # chapter that follows the current one.
+ for next_num, chapter in enumerate(chapters_list, start=1):
+ start_time = chapter_time(chapter)
+ if start_time is None:
+ continue
+ # A chapter ends where the next one starts; the last chapter ends
+ # at `duration`.
+ end_time = (chapter_time(chapters_list[next_num])
+ if next_num < len(chapters_list) else duration)
+ # Chapters without a usable end time are dropped.
+ if end_time is None:
+ continue
+ title = try_get(
+ chapter, lambda x: x['chapterRenderer']['title']['simpleText'],
+ compat_str)
+ chapters.append({
+ 'start_time': start_time,
+ 'end_time': end_time,
+ 'title': title,
+ })
+ return chapters
+
@staticmethod
- def _extract_chapters(description, duration):
+ def _extract_chapters_from_description(description, duration):
if not description:
return None
chapter_lines = re.findall(
})
return chapters
+ def _extract_chapters(self, webpage, description, video_id, duration):
+ """Return chapters from the page JSON, falling back to the description.
+
+ The description-based extractor is only consulted when the JSON-based
+ one returns a falsy result (None or an empty list).
+ """
+ return (self._extract_chapters_from_json(webpage, video_id, duration)
+ or self._extract_chapters_from_description(description, duration))
+
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
# Get video info
video_info = {}
embed_webpage = None
- if re.search(r'player-age-gate-content">', video_webpage) is not None:
+ if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+'
+ or re.search(r'player-age-gate-content">', video_webpage) is not None):
age_gate = True
# We simulate the access to the video from www.youtube.com/v/{video_id}
# this can be viewed without login into Youtube
video_details = try_get(
player_response, lambda x: x['videoDetails'], dict) or {}
+ microformat = try_get(
+ player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {}
+
video_title = video_info.get('title', [None])[0] or video_details.get('title')
if not video_title:
self._downloader.report_warning('Unable to extract video title')
''', replace_url, video_description)
video_description = clean_html(video_description)
else:
- video_description = self._html_search_meta('description', video_webpage) or video_details.get('shortDescription')
+ video_description = video_details.get('shortDescription')
+ if video_description is None:
+ video_description = self._html_search_meta('description', video_webpage)
if not smuggled_data.get('force_singlefeed', False):
if not self._downloader.params.get('noplaylist'):
view_count = extract_view_count(video_info)
if view_count is None and video_details:
view_count = int_or_none(video_details.get('viewCount'))
+ if view_count is None and microformat:
+ view_count = int_or_none(microformat.get('viewCount'))
if is_live is None:
is_live = bool_or_none(video_details.get('isLive'))
video_uploader_id = mobj.group('uploader_id')
video_uploader_url = mobj.group('uploader_url')
else:
- self._downloader.report_warning('unable to extract uploader nickname')
+ owner_profile_url = url_or_none(microformat.get('ownerProfileUrl'))
+ if owner_profile_url:
+ video_uploader_id = self._search_regex(
+ r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id',
+ default=None)
+ video_uploader_url = owner_profile_url
channel_id = (
str_or_none(video_details.get('channelId'))
video_webpage, 'channel id', default=None, group='id'))
channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
- # thumbnail image
- # We try first to get a high quality image:
- m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
- video_webpage, re.DOTALL)
- if m_thumb is not None:
- video_thumbnail = m_thumb.group(1)
- elif 'thumbnail_url' not in video_info:
- self._downloader.report_warning('unable to extract video thumbnail')
+ thumbnails = []
+ thumbnails_list = try_get(
+ video_details, lambda x: x['thumbnail']['thumbnails'], list) or []
+ for t in thumbnails_list:
+ if not isinstance(t, dict):
+ continue
+ thumbnail_url = url_or_none(t.get('url'))
+ if not thumbnail_url:
+ continue
+ thumbnails.append({
+ 'url': thumbnail_url,
+ 'width': int_or_none(t.get('width')),
+ 'height': int_or_none(t.get('height')),
+ })
+
+ if not thumbnails:
video_thumbnail = None
- else: # don't panic if we can't find it
- video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])
+ # We try first to get a high quality image:
+ m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
+ video_webpage, re.DOTALL)
+ if m_thumb is not None:
+ video_thumbnail = m_thumb.group(1)
+ thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str)
+ if thumbnail_url:
+ video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url)
+ if video_thumbnail:
+ thumbnails.append({'url': video_thumbnail})
# upload date
upload_date = self._html_search_meta(
[r'(?s)id="eow-date.*?>(.*?)</span>',
r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
video_webpage, 'upload date', default=None)
+ if not upload_date:
+ upload_date = microformat.get('publishDate') or microformat.get('uploadDate')
upload_date = unified_strdate(upload_date)
video_license = self._html_search_regex(
m_cat_container = self._search_regex(
r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
video_webpage, 'categories', default=None)
+ category = None
if m_cat_container:
category = self._html_search_regex(
r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
default=None)
- video_categories = None if category is None else [category]
- else:
- video_categories = None
+ if not category:
+ category = try_get(
+ microformat, lambda x: x['category'], compat_str)
+ video_categories = None if category is None else [category]
video_tags = [
unescapeHTML(m.group('content'))
for m in re.finditer(self._meta_regex('og:video:tag'), video_webpage)]
+ if not video_tags:
+ video_tags = try_get(video_details, lambda x: x['keywords'], list)
def _extract_count(count_name):
return str_to_int(self._search_regex(
errnote='Unable to download video annotations', fatal=False,
data=urlencode_postdata({xsrf_field_name: xsrf_token}))
- chapters = self._extract_chapters(description_original, video_duration)
+ chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)
# Look for the DASH manifest
if self._downloader.params.get('youtube_include_dash_manifest', True):
'creator': video_creator or artist,
'title': video_title,
'alt_title': video_alt_title or track,
- 'thumbnail': video_thumbnail,
+ 'thumbnails': thumbnails,
'description': video_description,
'categories': video_categories,
'tags': video_tags,
ids = []
last_id = playlist_id[-11:]
for n in itertools.count(1):
- url = 'https://youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id)
+ url = 'https://www.youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id)
webpage = self._download_webpage(
url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n))
new_ids = orderedSet(re.findall(
class YoutubeUserIE(YoutubeChannelIE):
IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
- _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
+ _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9%-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_%-]+)'
_TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'
IE_NAME = 'youtube:user'
}, {
'url': 'https://www.youtube.com/c/gametrailers',
'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/c/Pawe%C5%82Zadro%C5%BCniak',
+ 'only_matching': True,
}, {
'url': 'https://www.youtube.com/gametrailers',
'only_matching': True,
class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
IE_DESC = 'YouTube.com user/channel playlists'
- _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel)/(?P<id>[^/]+)/playlists'
+ _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel|c)/(?P<id>[^/]+)/playlists'
IE_NAME = 'youtube:playlists'
_TESTS = [{
'title': 'Chem Player',
},
'skip': 'Blocked',
+ }, {
+ 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
+ 'only_matching': True,
}]
break
more = self._download_json(
- 'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
+ 'https://www.youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
'Downloading page #%s' % page_num,
- transform_source=uppercase_escape)
+ transform_source=uppercase_escape,
+ headers=self._YOUTUBE_CLIENT_HEADERS)
content_html = more['content_html']
more_widget_html = more['load_more_widget_html']