+version 2020.06.16
+
+Extractors
+* [youtube] Fix uploader id and uploader URL extraction
+* [youtube] Improve view count extraction
+* [youtube] Fix upload date extraction (#25677)
+* [youtube] Fix thumbnails extraction (#25676)
+* [youtube] Fix playlist and feed extraction (#25675)
++ [facebook] Add support for single-video ID links
++ [youtube] Extract chapters from JSON (#24819)
++ [kaltura] Add support for multiple embeds on a webpage (#25523)
+
+
+version 2020.06.06
+
+Extractors
+* [tele5] Bypass geo restriction
++ [jwplatform] Add support for geo restriction bypass
+* [tele5] Prefer jwplatform over nexx (#25533)
+* [twitch:stream] Expect 400 and 410 HTTP errors from API
+* [twitch:stream] Fix extraction (#25528)
+* [twitch] Fix thumbnails extraction (#25531)
++ [twitch] Pass v5 Accept HTTP header (#25531)
+* [brightcove] Fix subtitles extraction (#25540)
++ [malltv] Add support for sk.mall.tv (#25445)
+* [periscope] Fix untitled broadcasts (#25482)
+* [jwplatform] Improve embeds extraction (#25467)
+
+
+version 2020.05.29
+
+Core
+* [postprocessor/ffmpeg] Embed series metadata with --add-metadata
+* [utils] Fix file permissions in write_json_file (#12471, #25122)
+
+Extractors
+* [ard:beta] Extend URL regular expression (#25405)
++ [youtube] Add support for more invidious instances (#25417)
+* [giantbomb] Extend URL regular expression (#25222)
+* [ard] Improve URL regular expression (#25134, #25198)
+* [redtube] Improve formats extraction and extract m3u8 formats (#25311,
+ #25321)
+* [indavideo] Switch to HTTPS for API request (#25191)
+* [redtube] Improve title extraction (#25208)
+* [vimeo] Improve format extraction and sorting (#25285)
+* [soundcloud] Reduce API playlist page limit (#25274)
++ [youtube] Add support for yewtu.be (#25226)
+* [mailru] Fix extraction (#24530, #25239)
+* [bellator] Fix mgid extraction (#25195)
+
+
version 2020.05.08
Core
5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/extractors.py).
6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with an `only_matching` key in the test's dict only check URL matching and are not counted in this numbering (see the sketch after this list).
7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/ytdl-org/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303). Add tests and code for as many as you want.
-8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](http://flake8.pycqa.org/en/latest/index.html#quickstart):
+8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://flake8.pycqa.org/en/latest/index.html#quickstart):
$ flake8 youtube_dl/extractor/yourextractor.py
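
For orientation, a minimal sketch of the `_TESTS` list from step 6 with a second, `only_matching` entry added. The site, URLs and field values are placeholders from the extractor skeleton, not a real extractor:

        class YourExtractorIE(InfoExtractor):
            _VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P<id>[0-9]+)'
            _TESTS = [{
                'url': 'https://yourextractor.com/watch/42',
                'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)',
                'info_dict': {
                    'id': '42',
                    'ext': 'mp4',
                    'title': 'Video title goes here',
                },
            }, {
                # Checked against _VALID_URL only; nothing is downloaded
                'url': 'https://yourextractor.com/embed/42',
                'only_matching': True,
            }]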
for description, duration, expected_chapters in self._TEST_CASES:
ie = YoutubeIE()
expect_value(
- self, ie._extract_chapters(description, duration),
+ self, ie._extract_chapters_from_description(description, duration),
expected_chapters, None)
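        # An illustrative shape for a self._TEST_CASES entry (hypothetical,
        # not from the suite): a (description, duration, expected_chapters)
        # tuple such as
        #   ('0:00 Intro\n4:30 Main topic', 600,
        #    [{'start_time': 0, 'end_time': 270, 'title': 'Intro'},
        #     {'start_time': 270, 'end_time': 600, 'title': 'Main topic'}])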
.IP " 8." 4
Make sure your code follows youtube\-dl coding conventions and check the
code with
-flake8 (http://flake8.pycqa.org/en/latest/index.html#quickstart):
+flake8 (https://flake8.pycqa.org/en/latest/index.html#quickstart):
.RS 4
.IP
.nf
class ARDIE(InfoExtractor):
- _VALID_URL = r'(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html'
+ _VALID_URL = r'(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos(?:extern)?/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html'
_TESTS = [{
# available till 14.02.2019
'url': 'http://www.daserste.de/information/talk/maischberger/videos/das-groko-drama-zerlegen-sich-die-volksparteien-video-102.html',
'upload_date': '20180214',
'thumbnail': r're:^https?://.*\.jpg$',
},
+ }, {
+ 'url': 'https://www.daserste.de/information/reportage-dokumentation/erlebnis-erde/videosextern/woelfe-und-herdenschutzhunde-ungleiche-brueder-102.html',
+ 'only_matching': True,
}, {
'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html',
'only_matching': True,
class ARDBetaMediathekIE(ARDMediathekBaseIE):
- _VALID_URL = r'https://(?:beta|www)\.ardmediathek\.de/(?P<client>[^/]+)/(?:player|live)/(?P<video_id>[a-zA-Z0-9]+)(?:/(?P<display_id>[^/?#]+))?'
+ _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P<client>[^/]+)/(?:player|live|video)/(?P<display_id>(?:[^/]+/)*)(?P<video_id>[a-zA-Z0-9]+)'
_TESTS = [{
- 'url': 'https://beta.ardmediathek.de/ard/player/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE/die-robuste-roswita',
+ 'url': 'https://ardmediathek.de/ard/video/die-robuste-roswita/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
'md5': 'dfdc87d2e7e09d073d5a80770a9ce88f',
'info_dict': {
'display_id': 'die-robuste-roswita',
'upload_date': '20191222',
'ext': 'mp4',
},
+ }, {
+ 'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://ardmediathek.de/ard/video/saartalk/saartalk-gesellschaftsgift-haltung-gegen-hass/sr-fernsehen/Y3JpZDovL3NyLW9ubGluZS5kZS9TVF84MTY4MA/',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.ardmediathek.de/ard/video/trailer/private-eyes-s01-e01/one/Y3JpZDovL3dkci5kZS9CZWl0cmFnLTE1MTgwYzczLWNiMTEtNGNkMS1iMjUyLTg5MGYzOWQxZmQ1YQ/',
+ 'only_matching': True,
}, {
'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3N3ci5kZS9hZXgvbzEwNzE5MTU/',
'only_matching': True,
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('video_id')
- display_id = mobj.group('display_id') or video_id
+ display_id = mobj.group('display_id')
+ if display_id:
+ display_id = display_id.rstrip('/')
+ if not display_id:
+ display_id = video_id
player_page = self._download_json(
'https://api.ardmediathek.de/public-gateway',
def get_programme_id(item):
def get_from_attributes(item):
- for p in('identifier', 'group'):
+ for p in ('identifier', 'group'):
value = item.get(p)
if value and re.match(r'^[pb][\da-z]{7}$', value):
return value
import re
import struct
-from .common import InfoExtractor
from .adobepass import AdobePassIE
+from .common import InfoExtractor
from ..compat import (
compat_etree_fromstring,
+ compat_HTTPError,
compat_parse_qs,
compat_urllib_parse_urlparse,
compat_urlparse,
compat_xml_parse_error,
- compat_HTTPError,
)
from ..utils import (
- ExtractorError,
+ clean_html,
extract_attributes,
+ ExtractorError,
find_xpath_attr,
fix_xml_ampersands,
float_or_none,
- js_to_json,
int_or_none,
+ js_to_json,
+ mimetype2ext,
parse_iso8601,
smuggle_url,
+ str_or_none,
unescapeHTML,
unsmuggle_url,
- update_url_query,
- clean_html,
- mimetype2ext,
UnsupportedError,
+ update_url_query,
+ url_or_none,
)
subtitles = {}
for text_track in json_data.get('text_tracks', []):
- if text_track.get('src'):
- subtitles.setdefault(text_track.get('srclang'), []).append({
- 'url': text_track['src'],
- })
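+               # Keep only caption tracks that carry a valid src URL; the
+               # language falls back to the track label, then to 'en', when
+               # srclang is missing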
+ if text_track.get('kind') != 'captions':
+ continue
+ text_track_url = url_or_none(text_track.get('src'))
+ if not text_track_url:
+ continue
+ lang = (str_or_none(text_track.get('srclang'))
+ or str_or_none(text_track.get('label')) or 'en').lower()
+ subtitles.setdefault(lang, []).append({
+ 'url': text_track_url,
+ })
is_live = False
duration = float_or_none(json_data.get('duration'), 1000)
return info_dict
if '/posts/' in url:
- entries = [
- self.url_result('facebook:%s' % vid, FacebookIE.ie_key())
- for vid in self._parse_json(
- self._search_regex(
- r'(["\'])video_ids\1\s*:\s*(?P<ids>\[.+?\])',
- webpage, 'video ids', group='ids'),
- video_id)]
-
- return self.playlist_result(entries, video_id)
+ video_id_json = self._search_regex(
+ r'(["\'])video_ids\1\s*:\s*(?P<ids>\[.+?\])', webpage, 'video ids', group='ids',
+ default='')
+ if video_id_json:
+ entries = [
+ self.url_result('facebook:%s' % vid, FacebookIE.ie_key())
+ for vid in self._parse_json(video_id_json, video_id)]
+ return self.playlist_result(entries, video_id)
+
+ # Single Video?
+ video_id = self._search_regex(r'video_id:\s*"([0-9]+)"', webpage, 'single video id')
+ return self.url_result('facebook:%s' % video_id, FacebookIE.ie_key())
else:
_, info_dict = self._extract_from_url(
self._VIDEO_PAGE_TEMPLATE % video_id,
},
'add_ie': ['Kaltura'],
},
+ {
+ # multiple kaltura embeds, nsfw
+ 'url': 'https://www.quartier-rouge.be/prive/femmes/kamila-avec-video-jaime-sadomie.html',
+ 'info_dict': {
+ 'id': 'kamila-avec-video-jaime-sadomie',
+ 'title': "Kamila avec vídeo “J'aime sadomie”",
+ },
+ 'playlist_count': 8,
+ },
{
# Non-standard Vimeo embed
'url': 'https://openclassrooms.com/courses/understanding-the-web',
return self.url_result(mobj.group('url'), 'Zapiks')
# Look for Kaltura embeds
- kaltura_url = KalturaIE._extract_url(webpage)
- if kaltura_url:
- return self.url_result(smuggle_url(kaltura_url, {'source_url': url}), KalturaIE.ie_key())
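+       # Multiple embeds now yield a single playlist; the embedding page URL
+       # is smuggled along for the Kaltura extractor's later use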
+ kaltura_urls = KalturaIE._extract_urls(webpage)
+ if kaltura_urls:
+ return self.playlist_from_matches(
+ kaltura_urls, video_id, video_title,
+ getter=lambda x: smuggle_url(x, {'source_url': url}),
+ ie=KalturaIE.ie_key())
# Look for EaglePlatform embeds
eagleplatform_url = EaglePlatformIE._extract_url(webpage)
class GiantBombIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?giantbomb\.com/videos/(?P<display_id>[^/]+)/(?P<id>\d+-\d+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?giantbomb\.com/(?:videos|shows)/(?P<display_id>[^/]+)/(?P<id>\d+-\d+)'
+ _TESTS = [{
'url': 'http://www.giantbomb.com/videos/quick-look-destiny-the-dark-below/2300-9782/',
- 'md5': 'c8ea694254a59246a42831155dec57ac',
+ 'md5': '132f5a803e7e0ab0e274d84bda1e77ae',
'info_dict': {
'id': '2300-9782',
'display_id': 'quick-look-destiny-the-dark-below',
'duration': 2399,
'thumbnail': r're:^https?://.*\.jpg$',
}
- }
+ }, {
+ 'url': 'https://www.giantbomb.com/shows/ben-stranding/2970-20212',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = self._match_id(url)
video = self._download_json(
- 'http://amfphp.indavideo.hu/SYm0json.php/player.playerHandler.getVideoData/%s' % video_id,
+ 'https://amfphp.indavideo.hu/SYm0json.php/player.playerHandler.getVideoData/%s' % video_id,
video_id)['data']
title = video['title']
import re
from .common import InfoExtractor
+from ..utils import unsmuggle_url
class JWPlatformIE(InfoExtractor):
@staticmethod
def _extract_urls(webpage):
return re.findall(
- r'<(?:script|iframe)[^>]+?src=["\']((?:https?:)?//content\.jwplatform\.com/players/[a-zA-Z0-9]{8})',
+ r'<(?:script|iframe)[^>]+?src=["\']((?:https?:)?//(?:content\.jwplatform|cdn\.jwplayer)\.com/players/[a-zA-Z0-9]{8})',
webpage)
def _real_extract(self, url):
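+       # A referring extractor (e.g. tele5) can smuggle its geo countries
+       # into the URL so the same geo bypass is applied here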
+ url, smuggled_data = unsmuggle_url(url, {})
+ self._initialize_geo_bypass({
+ 'countries': smuggled_data.get('geo_countries'),
+ })
video_id = self._match_id(url)
json_data = self._download_json('https://cdn.jwplayer.com/v2/media/' + video_id, video_id)
return self._parse_jwplayer_data(json_data, video_id)
@staticmethod
def _extract_url(webpage):
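+       # Kept for backwards compatibility: callers expecting a single embed
+       # URL get the first match, if any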
+ urls = KalturaIE._extract_urls(webpage)
+ return urls[0] if urls else None
+
+ @staticmethod
+ def _extract_urls(webpage):
# Embed codes: https://knowledge.kaltura.com/embedding-kaltura-media-players-your-site
- mobj = (
- re.search(
+ finditer = (
+ re.finditer(
r"""(?xs)
kWidget\.(?:thumb)?[Ee]mbed\(
\{.*?
(?P<q3>['"])entry_?[Ii]d(?P=q3)\s*:\s*
(?P<q4>['"])(?P<id>(?:(?!(?P=q4)).)+)(?P=q4)(?:,|\s*\})
""", webpage)
- or re.search(
+ or re.finditer(
r'''(?xs)
(?P<q1>["'])
(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com(?::\d+)?/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P<partner_id>\d+)(?:(?!(?P=q1)).)*
)
(?P<q3>["'])(?P<id>(?:(?!(?P=q3)).)+)(?P=q3)
''', webpage)
- or re.search(
+ or re.finditer(
r'''(?xs)
<(?:iframe[^>]+src|meta[^>]+\bcontent)=(?P<q1>["'])
(?:https?:)?//(?:(?:www|cdnapi(?:sec)?)\.)?kaltura\.com/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P<partner_id>\d+)
(?P=q1)
''', webpage)
)
- if mobj:
+ urls = []
+ for mobj in finditer:
embed_info = mobj.groupdict()
for k, v in embed_info.items():
if v:
webpage)
if service_mobj:
url = smuggle_url(url, {'service_url': service_mobj.group('id')})
- return url
+ urls.append(url)
+ return urls
def _kaltura_api_call(self, video_id, actions, service_url=None, *args, **kwargs):
params = actions[0]
'http://api.video.mail.ru/videos/%s.json?new=1' % video_id,
video_id, 'Downloading video JSON')
+ headers = {}
+
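+       # Forward the video_key session cookie to the format URLs explicitly;
+       # the CDN hosts serving the formats would not receive it otherwise
+       # (see #24530, #25239)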
+ video_key = self._get_cookies('https://my.mail.ru').get('video_key')
+ if video_key:
+ headers['Cookie'] = 'video_key=%s' % video_key.value
+
formats = []
for f in video_data['videos']:
video_url = f.get('url')
'url': video_url,
'format_id': format_id,
'height': height,
+ 'http_headers': headers,
})
self._sort_formats(formats)
class MallTVIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?mall\.tv/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+ _VALID_URL = r'https?://(?:(?:www|sk)\.)?mall\.tv/(?:[^/]+/)*(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://www.mall.tv/18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice',
'md5': '1c4a37f080e1f3023103a7b43458e518',
}, {
'url': 'https://www.mall.tv/kdo-to-plati/18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice',
'only_matching': True,
+ }, {
+ 'url': 'https://sk.mall.tv/gejmhaus/reklamacia-nehreje-vyrobnik-tepla-alebo-spekacka',
+ 'only_matching': True,
}]
def _real_extract(self, url):
item_id, query=query)
def _parse_broadcast_data(self, broadcast, video_id):
- title = broadcast['status']
+ title = broadcast.get('status') or 'Periscope Broadcast'
uploader = broadcast.get('user_display_name') or broadcast.get('username')
title = '%s - %s' % (uploader, title) if uploader else title
        is_live = (broadcast.get('state') or '').lower() == 'running'
from .common import InfoExtractor
from ..utils import (
+ determine_ext,
ExtractorError,
int_or_none,
merge_dicts,
if not info.get('title'):
info['title'] = self._html_search_regex(
- (r'<h(\d)[^>]+class="(?:video_title_text|videoTitle)[^"]*">(?P<title>(?:(?!\1).)+)</h\1>',
+ (r'<h(\d)[^>]+class="(?:video_title_text|videoTitle|video_title)[^"]*">(?P<title>(?:(?!\1).)+)</h\1>',
r'(?:videoTitle|title)\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1',),
webpage, 'title', group='title',
default=None) or self._og_search_title(webpage)
})
medias = self._parse_json(
self._search_regex(
- r'mediaDefinition\s*:\s*(\[.+?\])', webpage,
+ r'mediaDefinition["\']?\s*:\s*(\[.+?}\s*\])', webpage,
'media definitions', default='{}'),
video_id, fatal=False)
if medias and isinstance(medias, list):
format_url = url_or_none(media.get('videoUrl'))
if not format_url:
continue
+ if media.get('format') == 'hls' or determine_ext(format_url) == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ format_url, video_id, 'mp4',
+ entry_protocol='m3u8_native', m3u8_id='hls',
+ fatal=False))
+ continue
format_id = media.get('quality')
formats.append({
'url': format_url,
class SoundcloudPagedPlaylistBaseIE(SoundcloudIE):
def _extract_playlist(self, base_url, playlist_id, playlist_title):
COMMON_QUERY = {
- 'limit': 2000000000,
+ 'limit': 80000,
'linked_partitioning': '1',
}
_TESTS = [{
'url': 'http://www.bellator.com/fight/atwr7k/bellator-158-michael-page-vs-evangelista-cyborg',
'info_dict': {
- 'id': 'b55e434e-fde1-4a98-b7cc-92003a034de4',
- 'ext': 'mp4',
- 'title': 'Douglas Lima vs. Paul Daley - Round 1',
- 'description': 'md5:805a8dd29310fd611d32baba2f767885',
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
+ 'title': 'Michael Page vs. Evangelista Cyborg',
+ 'description': 'md5:0d917fc00ffd72dd92814963fc6cbb05',
},
+ 'playlist_count': 3,
}, {
'url': 'http://www.bellator.com/video-clips/bw6k7n/bellator-158-foundations-michael-venom-page',
'only_matching': True,
_FEED_URL = 'http://www.bellator.com/feeds/mrss/'
_GEO_COUNTRIES = ['US']
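+   # Take the mgid from the page's triforce data; the default extraction
+   # path no longer works for Bellator pages (see #25195)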
+ def _extract_mgid(self, webpage):
+ return self._extract_triforce_mgid(webpage)
+
class ParamountNetworkIE(MTVServicesInfoExtractor):
_VALID_URL = r'https?://(?:www\.)?paramountnetwork\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)'
from .common import InfoExtractor
from .jwplatform import JWPlatformIE
from .nexx import NexxIE
-from ..compat import (
- compat_str,
- compat_urlparse,
-)
+from ..compat import compat_urlparse
from ..utils import (
NO_DEFAULT,
- try_get,
+ smuggle_url,
)
class Tele5IE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?tele5\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+ _GEO_COUNTRIES = ['DE']
_TESTS = [{
'url': 'https://www.tele5.de/mediathek/filme-online/videos?vid=1549416',
'info_dict': {
'params': {
'skip_download': True,
},
+ }, {
+ # jwplatform, nexx unavailable
+ 'url': 'https://www.tele5.de/filme/ghoul-das-geheimnis-des-friedhofmonsters/',
+ 'info_dict': {
+ 'id': 'WJuiOlUp',
+ 'ext': 'mp4',
+ 'upload_date': '20200603',
+ 'timestamp': 1591214400,
+ 'title': 'Ghoul - Das Geheimnis des Friedhofmonsters',
+ 'description': 'md5:42002af1d887ff3d5b2b3ca1f8137d97',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'add_ie': [JWPlatformIE.ie_key()],
}, {
'url': 'https://www.tele5.de/kalkofes-mattscheibe/video-clips/politik-und-gesellschaft?ve_id=1551191',
'only_matching': True,
if not jwplatform_id:
jwplatform_id = extract_id(JWPLATFORM_ID_RE, 'jwplatform id')
- media = self._download_json(
- 'https://cdn.jwplayer.com/v2/media/' + jwplatform_id,
- display_id)
- nexx_id = try_get(
- media, lambda x: x['playlist'][0]['nexx_id'], compat_str)
-
- if nexx_id:
- return nexx_result(nexx_id)
-
return self.url_result(
- 'jwplatform:%s' % jwplatform_id, ie=JWPlatformIE.ie_key(),
- video_id=jwplatform_id)
+ smuggle_url(
+ 'jwplatform:%s' % jwplatform_id,
+ {'geo_countries': self._GEO_COUNTRIES}),
+ ie=JWPlatformIE.ie_key(), video_id=jwplatform_id)
orderedSet,
parse_duration,
parse_iso8601,
+ qualities,
+ str_or_none,
try_get,
unified_timestamp,
update_url_query,
def _call_api(self, path, item_id, *args, **kwargs):
headers = kwargs.get('headers', {}).copy()
- headers['Client-ID'] = self._CLIENT_ID
- kwargs['headers'] = headers
+ headers.update({
+ 'Accept': 'application/vnd.twitchtv.v5+json; charset=UTF-8',
+ 'Client-ID': self._CLIENT_ID,
+ })
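+       # 400 and 410 responses are expected from the v5 API (e.g. offline or
+       # deleted channels) and are handled by callers instead of being
+       # raised as HTTP errors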
+ kwargs.update({
+ 'headers': headers,
+ 'expected_status': (400, 410),
+ })
response = self._download_json(
'%s/%s' % (self._API_BASE, path), item_id,
*args, **compat_kwargs(kwargs))
is_live = False
else:
is_live = None
+ _QUALITIES = ('small', 'medium', 'large')
+ quality_key = qualities(_QUALITIES)
+ thumbnails = []
+ preview = info.get('preview')
+ if isinstance(preview, dict):
+ for thumbnail_id, thumbnail_url in preview.items():
+ thumbnail_url = url_or_none(thumbnail_url)
+ if not thumbnail_url:
+ continue
+ if thumbnail_id not in _QUALITIES:
+ continue
+ thumbnails.append({
+ 'url': thumbnail_url,
+ 'preference': quality_key(thumbnail_id),
+ })
return {
'id': info['_id'],
'title': info.get('title') or 'Untitled Broadcast',
'description': info.get('description'),
'duration': int_or_none(info.get('length')),
- 'thumbnail': info.get('preview'),
+ 'thumbnails': thumbnails,
'uploader': info.get('channel', {}).get('display_name'),
'uploader_id': info.get('channel', {}).get('name'),
'timestamp': parse_iso8601(info.get('recorded_at')),
else super(TwitchStreamIE, cls).suitable(url))
def _real_extract(self, url):
- channel_id = self._match_id(url)
+ channel_name = self._match_id(url)
+
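+       # The access token JSON embeds the numeric channel_id, which the
+       # kraken streams endpoint expects instead of the login name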
+ access_token = self._call_api(
+ 'api/channels/%s/access_token' % channel_name, channel_name,
+ 'Downloading access token JSON')
+
+ token = access_token['token']
+ channel_id = compat_str(self._parse_json(
+ token, channel_name)['channel_id'])
stream = self._call_api(
- 'kraken/streams/%s?stream_type=all' % channel_id.lower(),
+ 'kraken/streams/%s?stream_type=all' % channel_id,
channel_id, 'Downloading stream JSON').get('stream')
if not stream:
# (e.g. http://www.twitch.tv/TWITCHPLAYSPOKEMON) that will lead to constructing
# an invalid m3u8 URL. Working around by use of original channel name from stream
# JSON and fallback to lowercase if it's not available.
- channel_id = stream.get('channel', {}).get('name') or channel_id.lower()
-
- access_token = self._call_api(
- 'api/channels/%s/access_token' % channel_id, channel_id,
- 'Downloading channel access token')
+ channel_name = try_get(
+ stream, lambda x: x['channel']['name'],
+ compat_str) or channel_name.lower()
query = {
'allow_source': 'true',
'playlist_include_framerate': 'true',
'segment_preference': '4',
'sig': access_token['sig'].encode('utf-8'),
- 'token': access_token['token'].encode('utf-8'),
+ 'token': token.encode('utf-8'),
}
formats = self._extract_m3u8_formats(
'%s/api/channel/hls/%s.m3u8?%s'
- % (self._USHER_BASE, channel_id, compat_urllib_parse_urlencode(query)),
+ % (self._USHER_BASE, channel_name, compat_urllib_parse_urlencode(query)),
channel_id, 'mp4')
self._prefer_source(formats)
})
return {
- 'id': compat_str(stream['_id']),
- 'display_id': channel_id,
+ 'id': str_or_none(stream.get('_id')) or channel_id,
+ 'display_id': channel_name,
'title': title,
'description': description,
'thumbnails': thumbnails,
IE_NAME = 'twitter:broadcast'
_VALID_URL = TwitterBaseIE._BASE_REGEX + r'i/broadcasts/(?P<id>[0-9a-zA-Z]{13})'
+ _TEST = {
+ # untitled Periscope video
+ 'url': 'https://twitter.com/i/broadcasts/1yNGaQLWpejGj',
+ 'info_dict': {
+ 'id': '1yNGaQLWpejGj',
+ 'ext': 'mp4',
+ 'title': 'Andrea May Sahouri - Periscope Broadcast',
+ 'uploader': 'Andrea May Sahouri',
+ 'uploader_id': '1PXEdBZWpGwKe',
+ },
+ }
+
def _real_extract(self, url):
broadcast_id = self._match_id(url)
broadcast = self._call_api(
})
# TODO: fix handling of 308 status code returned for live archive manifest requests
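+        # When a CDN URL points at the separate-audio (sep/video) manifest,
+        # derive the combined variant as well; separated formats get a
+        # '_sep' suffix on their format_id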
+ sep_pattern = r'/sep/video/'
for files_type in ('hls', 'dash'):
for cdn_name, cdn_data in config_files.get(files_type, {}).get('cdns', {}).items():
manifest_url = cdn_data.get('url')
if not manifest_url:
continue
format_id = '%s-%s' % (files_type, cdn_name)
- if files_type == 'hls':
- formats.extend(self._extract_m3u8_formats(
- manifest_url, video_id, 'mp4',
- 'm3u8' if is_live else 'm3u8_native', m3u8_id=format_id,
- note='Downloading %s m3u8 information' % cdn_name,
- fatal=False))
- elif files_type == 'dash':
- mpd_pattern = r'/%s/(?:sep/)?video/' % video_id
- mpd_manifest_urls = []
- if re.search(mpd_pattern, manifest_url):
- for suffix, repl in (('', 'video'), ('_sep', 'sep/video')):
- mpd_manifest_urls.append((format_id + suffix, re.sub(
- mpd_pattern, '/%s/%s/' % (video_id, repl), manifest_url)))
- else:
- mpd_manifest_urls = [(format_id, manifest_url)]
- for f_id, m_url in mpd_manifest_urls:
+ sep_manifest_urls = []
+ if re.search(sep_pattern, manifest_url):
+ for suffix, repl in (('', 'video'), ('_sep', 'sep/video')):
+ sep_manifest_urls.append((format_id + suffix, re.sub(
+ sep_pattern, '/%s/' % repl, manifest_url)))
+ else:
+ sep_manifest_urls = [(format_id, manifest_url)]
+ for f_id, m_url in sep_manifest_urls:
+ if files_type == 'hls':
+ formats.extend(self._extract_m3u8_formats(
+ m_url, video_id, 'mp4',
+ 'm3u8' if is_live else 'm3u8_native', m3u8_id=f_id,
+ note='Downloading %s m3u8 information' % cdn_name,
+ fatal=False))
+ elif files_type == 'dash':
if 'json=1' in m_url:
real_m_url = (self._download_json(m_url, video_id, fatal=False) or {}).get('url')
if real_m_url:
m_url.replace('/master.json', '/master.mpd'), video_id, f_id,
'Downloading %s MPD information' % cdn_name,
fatal=False)
- for f in mpd_formats:
- if f.get('vcodec') == 'none':
- f['preference'] = -50
- elif f.get('acodec') == 'none':
- f['preference'] = -40
formats.extend(mpd_formats)
live_archive = live_event.get('archive') or {}
'preference': 1,
})
+ for f in formats:
+ if f.get('vcodec') == 'none':
+ f['preference'] = -50
+ elif f.get('acodec') == 'none':
+ f['preference'] = -40
+
subtitles = {}
text_tracks = config['request'].get('text_tracks')
if text_tracks:
_PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}'
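+   # Client identification headers that youtube's AJAX continuation
+   # endpoints expect when paging playlists and feeds (see #25675)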
+ _YOUTUBE_CLIENT_HEADERS = {
+ 'x-youtube-client-name': '1',
+ 'x-youtube-client-version': '1.20200609.04.02',
+ }
+
def _set_language(self):
self._set_cookie(
'.youtube.com', 'PREF', 'f1=50000000&hl=en',
'https://youtube.com/%s' % mobj.group('more'), playlist_id,
'Downloading page #%s%s'
% (page_num, ' (retry #%d)' % count if count else ''),
- transform_source=uppercase_escape)
+ transform_source=uppercase_escape,
+ headers=self._YOUTUBE_CLIENT_HEADERS)
break
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503):
(?:www\.)?invidious\.drycat\.fr/|
(?:www\.)?tube\.poal\.co/|
(?:www\.)?vid\.wxzm\.sx/|
+ (?:www\.)?yewtu\.be/|
(?:www\.)?yt\.elukerio\.org/|
(?:www\.)?yt\.lelux\.fi/|
+ (?:www\.)?invidious\.ggc-project\.de/|
+ (?:www\.)?yt\.maisputain\.ovh/|
+ (?:www\.)?invidious\.13ad\.de/|
+ (?:www\.)?invidious\.toot\.koeln/|
+ (?:www\.)?invidious\.fdn\.fr/|
+ (?:www\.)?watch\.nettohikari\.com/|
(?:www\.)?kgg2m7yk5aybusll\.onion/|
(?:www\.)?qklhadlycap4cnod\.onion/|
(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/|
(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/|
(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/|
(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p/|
+ (?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion/|
youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
(?:.*?\#/)? # handle anchor (#/) redirect urls
(?: # the various things that can precede the ID:
video_id = mobj.group(2)
return video_id
+ def _extract_chapters_from_json(self, webpage, video_id, duration):
+ if not webpage:
+ return
+ player = self._parse_json(
+ self._search_regex(
+ r'RELATED_PLAYER_ARGS["\']\s*:\s*({.+})\s*,?\s*\n', webpage,
+ 'player args', default='{}'),
+ video_id, fatal=False)
+ if not player or not isinstance(player, dict):
+ return
+ watch_next_response = player.get('watch_next_response')
+ if not isinstance(watch_next_response, compat_str):
+ return
+ response = self._parse_json(watch_next_response, video_id, fatal=False)
+ if not response or not isinstance(response, dict):
+ return
+ chapters_list = try_get(
+ response,
+ lambda x: x['playerOverlays']
+ ['playerOverlayRenderer']
+ ['decoratedPlayerBarRenderer']
+ ['decoratedPlayerBarRenderer']
+ ['playerBar']
+ ['chapteredPlayerBarRenderer']
+ ['chapters'],
+ list)
+ if not chapters_list:
+ return
+
+ def chapter_time(chapter):
+ return float_or_none(
+ try_get(
+ chapter,
+ lambda x: x['chapterRenderer']['timeRangeStartMillis'],
+ int),
+ scale=1000)
+ chapters = []
+ for next_num, chapter in enumerate(chapters_list, start=1):
+ start_time = chapter_time(chapter)
+ if start_time is None:
+ continue
+ end_time = (chapter_time(chapters_list[next_num])
+ if next_num < len(chapters_list) else duration)
+ if end_time is None:
+ continue
+ title = try_get(
+ chapter, lambda x: x['chapterRenderer']['title']['simpleText'],
+ compat_str)
+ chapters.append({
+ 'start_time': start_time,
+ 'end_time': end_time,
+ 'title': title,
+ })
+ return chapters
+
@staticmethod
- def _extract_chapters(description, duration):
+ def _extract_chapters_from_description(description, duration):
if not description:
return None
chapter_lines = re.findall(
})
return chapters
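+   # Prefer the structured chapter data embedded in the watch page JSON and
+   # fall back to parsing timestamps out of the video description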
+ def _extract_chapters(self, webpage, description, video_id, duration):
+ return (self._extract_chapters_from_json(webpage, video_id, duration)
+ or self._extract_chapters_from_description(description, duration))
+
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
video_details = try_get(
player_response, lambda x: x['videoDetails'], dict) or {}
+ microformat = try_get(
+ player_response, lambda x: x['microformat']['playerMicroformatRenderer'], dict) or {}
+
video_title = video_info.get('title', [None])[0] or video_details.get('title')
if not video_title:
self._downloader.report_warning('Unable to extract video title')
view_count = extract_view_count(video_info)
if view_count is None and video_details:
view_count = int_or_none(video_details.get('viewCount'))
+ if view_count is None and microformat:
+ view_count = int_or_none(microformat.get('viewCount'))
if is_live is None:
is_live = bool_or_none(video_details.get('isLive'))
video_uploader_id = mobj.group('uploader_id')
video_uploader_url = mobj.group('uploader_url')
else:
- self._downloader.report_warning('unable to extract uploader nickname')
+ owner_profile_url = url_or_none(microformat.get('ownerProfileUrl'))
+ if owner_profile_url:
+ video_uploader_id = self._search_regex(
+ r'(?:user|channel)/([^/]+)', owner_profile_url, 'uploader id',
+ default=None)
+ video_uploader_url = owner_profile_url
channel_id = (
str_or_none(video_details.get('channelId'))
video_webpage, 'channel id', default=None, group='id'))
channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None
- # thumbnail image
- # We try first to get a high quality image:
- m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
- video_webpage, re.DOTALL)
- if m_thumb is not None:
- video_thumbnail = m_thumb.group(1)
- elif 'thumbnail_url' not in video_info:
- self._downloader.report_warning('unable to extract video thumbnail')
+ thumbnails = []
+ thumbnails_list = try_get(
+ video_details, lambda x: x['thumbnail']['thumbnails'], list) or []
+ for t in thumbnails_list:
+ if not isinstance(t, dict):
+ continue
+ thumbnail_url = url_or_none(t.get('url'))
+ if not thumbnail_url:
+ continue
+ thumbnails.append({
+ 'url': thumbnail_url,
+ 'width': int_or_none(t.get('width')),
+ 'height': int_or_none(t.get('height')),
+ })
+
+ if not thumbnails:
video_thumbnail = None
- else: # don't panic if we can't find it
- video_thumbnail = compat_urllib_parse_unquote_plus(video_info['thumbnail_url'][0])
+ # We try first to get a high quality image:
+ m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
+ video_webpage, re.DOTALL)
+ if m_thumb is not None:
+ video_thumbnail = m_thumb.group(1)
+ thumbnail_url = try_get(video_info, lambda x: x['thumbnail_url'][0], compat_str)
+ if thumbnail_url:
+ video_thumbnail = compat_urllib_parse_unquote_plus(thumbnail_url)
+ if video_thumbnail:
+ thumbnails.append({'url': video_thumbnail})
# upload date
upload_date = self._html_search_meta(
[r'(?s)id="eow-date.*?>(.*?)</span>',
r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
video_webpage, 'upload date', default=None)
+ if not upload_date:
+ upload_date = microformat.get('publishDate') or microformat.get('uploadDate')
upload_date = unified_strdate(upload_date)
video_license = self._html_search_regex(
errnote='Unable to download video annotations', fatal=False,
data=urlencode_postdata({xsrf_field_name: xsrf_token}))
- chapters = self._extract_chapters(description_original, video_duration)
+ chapters = self._extract_chapters(video_webpage, description_original, video_id, video_duration)
# Look for the DASH manifest
if self._downloader.params.get('youtube_include_dash_manifest', True):
'creator': video_creator or artist,
'title': video_title,
'alt_title': video_alt_title or track,
- 'thumbnail': video_thumbnail,
+ 'thumbnails': thumbnails,
'description': video_description,
'categories': video_categories,
'tags': video_tags,
more = self._download_json(
'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
'Downloading page #%s' % page_num,
- transform_source=uppercase_escape)
+ transform_source=uppercase_escape,
+ headers=self._YOUTUBE_CLIENT_HEADERS)
content_html = more['content_html']
more_widget_html = more['load_more_widget_html']
metadata[meta_f] = info[info_f]
break
+ # See [1-4] for some info on media metadata/metadata supported
+ # by ffmpeg.
+ # 1. https://kdenlive.org/en/project/adding-meta-data-to-mp4-video/
+ # 2. https://wiki.multimedia.cx/index.php/FFmpeg_Metadata
+ # 3. https://kodi.wiki/view/Video_file_tagging
+ # 4. http://atomicparsley.sourceforge.net/mpeg-4files.html
+
add('title', ('track', 'title'))
add('date', 'upload_date')
add(('description', 'comment'), 'description')
add('album')
add('album_artist')
add('disc', 'disc_number')
+ add('show', 'series')
+ add('season_number')
+ add('episode_id', ('episode', 'episode_id'))
+ add('episode_sort', 'episode_number')
if not metadata:
self._downloader.to_screen('[ffmpeg] There isn\'t any metadata to add')
os.unlink(fn)
except OSError:
pass
+ try:
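+           # os.umask can only be read by setting it, so set it to 0 and
+           # restore it immediately, then widen the temporary file's 0o600
+           # mode to the conventional 0o666 minus the process umask (#12471)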
+ mask = os.umask(0)
+ os.umask(mask)
+ os.chmod(tf.name, 0o666 & ~mask)
+ except OSError:
+ pass
os.rename(tf.name, fn)
except Exception:
try:
from __future__ import unicode_literals
-__version__ = '2020.05.08'
+__version__ = '2020.06.16'