Surya Oktafendri
TingPing
Alexandre Macabies
+Bastian de Groot
+Niklas Haas
+András Veres-Szentkirályi
+Enes Solak
+Nathan Rossi
+Thomas van der Berg
+Luca Cherubin
+version 2018.09.10
+
+Core
++ [utils] Properly recognize AV1 codec (#17506)
+
+Extractors
++ [iprima] Add support for prima.iprima.cz (#17514)
++ [tele5] Add support for tele5.de (#7805, #7922, #17331, #17414)
+* [nbc] Fix extraction of percent-encoded URLs (#17374)
+
+
+version 2018.09.08
+
+Extractors
+* [youtube] Fix extraction (#17457, #17464)
++ [pornhub:uservideos] Add support for new URLs (#17388)
+* [iprima] Confirm adult check (#17437)
+* [slideslive] Make check for video service name case-insensitive (#17429)
+* [radiojavan] Fix extraction (#17151)
+* [generic] Skip unsuccessful jwplayer extraction (#16735)
+
+
+version 2018.09.01
+
+Core
+* [utils] Skip remote IP addresses not matching the source address' IP version
+ when creating a connection (#13422, #17362)
+
+Extractors
++ [ard] Add support for one.ard.de (#17397)
+* [niconico] Fix extraction on Python 3 (#17393, #17407)
+* [ard] Extract f4m formats
+* [crunchyroll] Parse vilos media data (#17343)
++ [ard] Add support for Beta ARD Mediathek
++ [bandcamp] Extract more metadata (#13197)
+* [internazionale] Fix extraction of videos not available abroad (#17386)
+
+
+version 2018.08.28
+
+Extractors
++ [youtube:playlist] Add support for music album playlists (OLAK5uy_ prefix)
+ (#17361)
+* [bitchute] Fix extraction by passing a custom User-Agent (#17360)
+* [webofstories:playlist] Fix extraction (#16914)
++ [tvplayhome] Add support for new tvplay URLs (#17344)
++ [generic] Allow relative src for videojs embeds (#17324)
++ [xfileshare] Add support for vidto.se (#17317)
++ [vidzi] Add support for vidzi.nu (#17316)
++ [nova:embed] Add support for media.cms.nova.cz (#17282)
+
+
+version 2018.08.22
+
+Core
+* [utils] Use pure browser header for User-Agent (#17236)
+
+Extractors
++ [kinopoisk] Add support for kinopoisk.ru (#17283)
++ [yourporn] Add support for yourporn.sexy (#17298)
++ [go] Add support for disneynow.go.com (#16299, #17264)
++ [6play] Add support for play.rtl.hr (#17249)
+* [anvato] Fall back to generic API key for access-key-to-API-key lookup
+ (#16788, #17254)
+* [lci] Fix extraction (#17274)
+* [bbccouk] Extend id URL regular expression (#17270)
+* [cwtv] Fix extraction (#17256)
+* [nova] Fix extraction (#17241)
++ [generic] Add support for expressen embeds
+* [raywenderlich] Adapt to site redesign (#17225)
++ [redbulltv] Add support for redbull.com tv URLs (#17218)
++ [bitchute] Add support for bitchute.com (#14052)
++ [clyp] Add support for token protected media (#17184)
+* [imdb] Fix extension extraction (#17167)
+
+
+version 2018.08.04
+
+Extractors
+* [funk:channel] Improve byChannelAlias extraction (#17142)
+* [twitch] Fix authentication (#17024, #17126)
+* [twitch:vod] Improve URL regular expression (#17135)
+* [watchbox] Fix extraction (#17107)
+* [pbs] Fix extraction (#17109)
+* [theplatform] Relax URL regular expression (#16181, #17097)
++ [viqeo] Add support for viqeo.tv (#17066)
+
+
+version 2018.07.29
+
+Extractors
+* [crunchyroll:playlist] Restrict URL regular expression (#17069, #17076)
++ [pornhub] Add support for subtitles (#16924, #17088)
+* [ceskatelevize] Use https for API call (#16997, #16999)
+* [dailymotion:playlist] Fix extraction (#16894)
+* [ted] Improve extraction
+* [ted] Fix extraction for videos without nativeDownloads (#16756, #17085)
+* [telecinco] Fix extraction (#17080)
+* [mitele] Reduce number of requests
+* [rai] Return non-HTTP relinker URL intact (#17055)
+* [vk] Fix extraction for inline-only videos (#16923)
+* [streamcloud] Fix extraction (#17054)
+* [facebook] Fix tahoe player extraction with authentication (#16655)
++ [puhutv] Add support for puhutv.com (#12712, #16010, #16269)
+
+
+version 2018.07.21
+
+Core
++ [utils] Introduce url_or_none
+* [utils] Allow JSONP without function name (#17028)
++ [extractor/common] Extract DASH and MSS formats from SMIL manifests
+
+Extractors
++ [bbc] Add support for BBC Radio Play pages (#17022)
+* [iwara] Fix download URLs (#17026)
+* [vrtnu] Relax title extraction and extract JSON-LD (#17018)
++ [viu] Pass Referer and Origin headers and area id (#16992)
++ [vimeo] Add another config regular expression (#17013)
++ [facebook] Extract view count (#16942)
+* [dailymotion] Improve description extraction (#16984)
+* [slutload] Fix and improve extraction (#17001)
+* [mediaset] Fix extraction (#16977)
++ [theplatform] Add support for theplatform TLD customization (#16977)
+* [imgur] Relax URL regular expression (#16987)
+* [pornhub] Improve extraction and extract all formats (#12166, #15891, #16262,
+ #16959)
+
+
+version 2018.07.10
+
+Core
+* [utils] Share JSON-LD regular expression
+* [downloader/dash] Improve error handling (#16927)
+
+Extractors
++ [nrktv] Add support for new season and serie URL schema
++ [nrktv] Add support for new episode URL schema (#16909)
++ [frontendmasters] Add support for frontendmasters.com (#3661, #16328)
+* [funk] Fix extraction (#16918)
+* [watchbox] Fix extraction (#16904)
+* [dplayit] Sort formats
+* [dplayit] Fix extraction (#16901)
+* [youtube] Improve login error handling (#13822)
+
+
+version 2018.07.04
+
+Core
+* [extractor/common] Properly escape % in MPD templates (#16867)
+* [extractor/common] Use source URL as Referer for HTML5 entries (#16849)
+* Prefer ffmpeg over avconv by default (#8622)
+
+Extractors
+* [pluralsight] Switch to graphql (#16889, #16895, #16896, #16899)
+* [lynda] Simplify login and improve error capturing (#16891)
++ [go90] Add support for embed URLs (#16873)
+* [go90] Detect geo restriction error and pass geo verification headers
+ (#16874)
+* [vlive] Fix live streams extraction (#16871)
+* [npo] Fix typo (#16872)
++ [mediaset] Add support for new videos and extract all formats (#16568)
+* [dctptv] Restore extraction based on REST API (#16850)
+* [svt] Improve extraction and add support for pages (#16802)
+* [porncom] Fix extraction (#16808)
+
+
+version 2018.06.25
+
+Extractors
+* [joj] Relax URL regular expression (#16771)
+* [brightcove] Workaround sonyliv DRM protected videos (#16807)
+* [motherless] Fix extraction (#16786)
+* [itv] Make SOAP request non-fatal and extract metadata from webpage (#16780)
+- [foxnews:insider] Remove extractor (#15810)
++ [foxnews] Add support for iframe embeds (#15810, #16711)
+
+
+version 2018.06.19
+
+Core
++ [extractor/common] Introduce expected_status in _download_* methods for
+ conveniently accepting HTTP responses with non-2xx status codes
++ [compat] Introduce compat_integer_types
+
+Extractors
+* [peertube] Improve generic support (#16733)
++ [6play] Use geo verification headers
+* [rtbf] Fix extraction for Python 3.2
+* [vgtv] Improve HLS formats extraction
++ [vgtv] Add support for www.aftonbladet.se/tv URLs
+* [bbccouk] Use expected_status
+* [markiza] Expect 500 HTTP status code
+* [tvnow] Try all clear manifest URLs (#15361)
+
+
version 2018.06.18
Core
# INSTALLATION
-To install it right away for all UNIX users (Linux, OS X, etc.), type:
+To install it right away for all UNIX users (Linux, macOS, etc.), type:
sudo curl -L https://yt-dl.org/downloads/latest/youtube-dl -o /usr/local/bin/youtube-dl
sudo chmod a+rx /usr/local/bin/youtube-dl
This command will update youtube-dl if you have already installed it. See the [pypi page](https://pypi.python.org/pypi/youtube_dl) for more information.
-OS X users can install youtube-dl with [Homebrew](https://brew.sh/):
+macOS users can install youtube-dl with [Homebrew](https://brew.sh/):
brew install youtube-dl
default; fix file if we can, warn
otherwise)
--prefer-avconv Prefer avconv over ffmpeg for running the
- postprocessors (default)
- --prefer-ffmpeg Prefer ffmpeg over avconv for running the
postprocessors
+ --prefer-ffmpeg Prefer ffmpeg over avconv for running the
+ postprocessors (default)
--ffmpeg-location PATH Location of the ffmpeg/avconv binary;
either the path to the binary or its
containing directory.
# CONFIGURATION
-You can configure youtube-dl by placing any supported command line option to a configuration file. On Linux and OS X, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\<user name>\youtube-dl.conf`. Note that by default configuration file may not exist so you may need to create it yourself.
+You can configure youtube-dl by placing any supported command line option in a configuration file. On Linux and macOS, the system wide configuration file is located at `/etc/youtube-dl.conf` and the user wide configuration file at `~/.config/youtube-dl/config`. On Windows, the user wide configuration file locations are `%APPDATA%\youtube-dl\config.txt` or `C:\Users\<user name>\youtube-dl.conf`. Note that by default the configuration file may not exist, so you may need to create it yourself.
For example, with the following configuration file youtube-dl will always extract the audio, not copy the mtime, use a proxy and save all videos under `Movies` directory in your home directory:
```
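# Lines starting with # are comments

# Always extract audio
-x

# Do not copy the mtime
--no-mtime

# Use this proxy
--proxy 127.0.0.1:3128

# Save all videos under Movies directory in your home directory
-o ~/Movies/%(title)s.%(ext)s
```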
Use the `--cookies` option, for example `--cookies /path/to/cookies/file.txt`.
-In order to extract cookies from browser use any conforming browser extension for exporting cookies. For example, [cookies.txt](https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg) (for Chrome) or [Export Cookies](https://addons.mozilla.org/en-US/firefox/addon/export-cookies/) (for Firefox).
+In order to extract cookies from your browser, use any conforming browser extension for exporting cookies. For example, [cookies.txt](https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg) (for Chrome) or [cookies.txt](https://addons.mozilla.org/en-US/firefox/addon/cookies-txt/) (for Firefox).
Note that the cookies file must be in Mozilla/Netscape format and the first line of the cookies file must be either `# HTTP Cookie File` or `# Netscape HTTP Cookie File`. Make sure you have correct [newline format](https://en.wikipedia.org/wiki/Newline) in the cookies file and convert newlines if necessary to correspond with your OS, namely `CRLF` (`\r\n`) for Windows and `LF` (`\n`) for Unix and Unix-like systems (Linux, macOS, etc.). `HTTP Error 400: Bad Request` when using `--cookies` is a good sign of invalid newline format.
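An illustrative cookies file in that format (tab-separated fields: domain, include-subdomains flag, path, secure flag, expiry, name, value; the values below are made up):
```
# Netscape HTTP Cookie File
.example.com	TRUE	/	FALSE	1609459200	session_id	abc123
```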
INSTALLATION
-To install it right away for all UNIX users (Linux, OS X, etc.), type:
+To install it right away for all UNIX users (Linux, macOS, etc.), type:
sudo curl -L https://yt-dl.org/downloads/latest/youtube-dl -o /usr/local/bin/youtube-dl
sudo chmod a+rx /usr/local/bin/youtube-dl
This command will update youtube-dl if you have already installed it.
See the pypi page for more information.
-OS X users can install youtube-dl with Homebrew:
+macOS users can install youtube-dl with Homebrew:
brew install youtube-dl
default; fix file if we can, warn
otherwise)
--prefer-avconv Prefer avconv over ffmpeg for running the
- postprocessors (default)
- --prefer-ffmpeg Prefer ffmpeg over avconv for running the
postprocessors
+ --prefer-ffmpeg Prefer ffmpeg over avconv for running the
+ postprocessors (default)
--ffmpeg-location PATH Location of the ffmpeg/avconv binary;
either the path to the binary or its
containing directory.
You can configure youtube-dl by placing any supported command line
-option to a configuration file. On Linux and OS X, the system wide
+option in a configuration file. On Linux and macOS, the system wide
configuration file is located at /etc/youtube-dl.conf and the user wide
configuration file at ~/.config/youtube-dl/config. On Windows, the user
wide configuration file locations are %APPDATA%\youtube-dl\config.txt or
In order to extract cookies from browser use any conforming browser
extension for exporting cookies. For example, cookies.txt (for Chrome)
-or Export Cookies (for Firefox).
+or cookies.txt (for Firefox).
Note that the cookies file must be in Mozilla/Netscape format and the
first line of the cookies file must be either # HTTP Cookie File or
- **archive.org**: archive.org videos
- **ARD**
- **ARD:mediathek**
+ - **ARDBetaMediathek**
- **Arkena**
- **arte.tv**
- **arte.tv:+7**
- **BiliBili**
- **BioBioChileTV**
- **BIQLE**
+ - **BitChute**
+ - **BitChuteChannel**
- **BleacherReport**
- **BleacherReportCMS**
- **blinkx**
- **Crackle**
- **Criterion**
- **CrooksAndLiars**
- - **Crunchyroll**
+ - **crunchyroll**
- **crunchyroll:playlist**
- **CSNNE**
- **CSpan**: C-SPAN
- **Foxgay**
- **foxnews**: Fox News and Fox Business Video
- **foxnews:article**
- - **foxnews:insider**
- **FoxSports**
- **france2.fr:generation-what**
- **FranceCulture**
- **Freesound**
- **freespeech.org**
- **FreshLive**
+ - **FrontendMasters**
+ - **FrontendMastersCourse**
+ - **FrontendMastersLesson**
- **Funimation**
- **FunkChannel**
- **FunkMix**
- **Ketnet**
- **KhanAcademy**
- **KickStarter**
+ - **KinoPoisk**
- **KonserthusetPlay**
- **kontrtube**: KontrTube.ru - Труба зовёт
- **KrasView**: Красвью
- **Normalboots**
- **NosVideo**
- **Nova**: TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz
+ - **NovaEmbed**
- **nowness**
- **nowness:playlist**
- **nowness:series**
- **NRKSkole**: NRK Skole
- **NRKTV**: NRK TV and NRK Radio
- **NRKTVDirekte**: NRK TV Direkte and NRK Radio Direkte
+ - **NRKTVEpisode**
- **NRKTVEpisodes**
+ - **NRKTVSeason**
- **NRKTVSeries**
- **ntv.ru**
- **Nuvid**
- **PrimeShareTV**
- **PromptFile**
- **prosiebensat1**: ProSiebenSat.1 Digital
+ - **puhutv**
+ - **puhutv:serie**
- **Puls4**
- **Pyvideo**
- **qqmusic**: QQ音乐
- **RaiPlayLive**
- **RaiPlayPlaylist**
- **RayWenderlich**
+ - **RayWenderlichCourse**
- **RBMARadio**
- **RDS**: RDS.ca
- **RedBullTV**
- **StretchInternet**
- **SunPorno**
- **SVT**
+ - **SVTPage**
- **SVTPlay**: SVT Play and Öppet arkiv
- **SVTSeries**
- **SWRMediathek**
- **techtv.mit.edu**
- **ted**
- **Tele13**
+ - **Tele5**
- **TeleBruxelles**
- **Telecinco**: telecinco.es, cuatro.com and mediaset.es
- **Telegraaf**
- **tvp:embed**: Telewizja Polska
- **tvp:series**
- **TVPlayer**
+ - **TVPlayHome**
- **Tweakers**
- **twitch:chapter**
- **twitch:clips**
- **Vimple**: Vimple - one-click video hosting
- **Vine**
- **vine:user**
+ - **Viqeo**
- **Viu**
- **viu:ott**
- **viu:playlist**
- **YouNowLive**
- **YouNowMoment**
- **YouPorn**
+ - **YourPorn**
- **YourUpload**
- **youtube**: YouTube.com
- **youtube:channel**: YouTube.com channels
uppercase_escape,
lowercase_escape,
url_basename,
+ url_or_none,
base_url,
urljoin,
urlencode_postdata,
self.assertEqual(urljoin('http://foo.de/', ['foobar']), None)
self.assertEqual(urljoin('http://foo.de/a/b/c.txt', '.././../d.txt'), 'http://foo.de/d.txt')
+ def test_url_or_none(self):
+ self.assertEqual(url_or_none(None), None)
+ self.assertEqual(url_or_none(''), None)
+ self.assertEqual(url_or_none('foo'), None)
+ self.assertEqual(url_or_none('http://foo.de'), 'http://foo.de')
+ self.assertEqual(url_or_none('https://foo.de'), 'https://foo.de')
+ self.assertEqual(url_or_none('http$://foo.de'), None)
+ self.assertEqual(url_or_none('http://foo.de'), 'http://foo.de')
+ self.assertEqual(url_or_none('//foo.de'), '//foo.de')
+
def test_parse_age_limit(self):
self.assertEqual(parse_age_limit(None), None)
self.assertEqual(parse_age_limit(False), None)
d = json.loads(stripped)
self.assertEqual(d, {'status': 'success'})
+ stripped = strip_jsonp('({"status": "success"});')
+ d = json.loads(stripped)
+ self.assertEqual(d, {'status': 'success'})
+
def test_uppercase_escape(self):
self.assertEqual(uppercase_escape('aä'), 'aä')
self.assertEqual(uppercase_escape('\\U0001d550'), '𝕐')
'vcodec': 'h264',
'acodec': 'aac',
})
+ self.assertEqual(parse_codecs('av01.0.05M.08'), {
+ 'vcodec': 'av01.0.05M.08',
+ 'acodec': 'none',
+ })
def test_escape_rfc3986(self):
reserved = "!*'();:@&=+$,/?#[]"
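The `test_url_or_none` assertions above pin down the new helper's contract: non-strings, empty strings and non-URLs map to `None`, while http(s) and protocol-relative URLs pass through unchanged. A minimal sketch consistent with those assertions (the exact regex is an assumption, not necessarily the shipped implementation):
```
import re

from youtube_dl.compat import compat_str


def url_or_none(url):
    # Reject non-string and empty values outright
    if not url or not isinstance(url, compat_str):
        return None
    url = url.strip()
    # Accept scheme://... and protocol-relative //... URLs only
    return url if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url) else None
```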
.RE
.TP
.B \-\-prefer\-avconv
-Prefer avconv over ffmpeg for running the postprocessors (default)
+Prefer avconv over ffmpeg for running the postprocessors
.RS
.RE
.TP
.B \-\-prefer\-ffmpeg
-Prefer ffmpeg over avconv for running the postprocessors
+Prefer ffmpeg over avconv for running the postprocessors (default)
.RS
.RE
.TP
.PP
You can configure youtube\-dl by placing any supported command line
option to a configuration file.
-On Linux and OS X, the system wide configuration file is located at
+On Linux and macOS, the system wide configuration file is located at
\f[C]/etc/youtube\-dl.conf\f[] and the user wide configuration file at
\f[C]~/.config/youtube\-dl/config\f[].
On Windows, the user wide configuration file locations are
extension for exporting cookies.
For example,
cookies.txt (https://chrome.google.com/webstore/detail/cookiestxt/njabckikapfpffapmjgojcnbfjonfjfg)
-(for Chrome) or Export
-Cookies (https://addons.mozilla.org/en-US/firefox/addon/export-cookies/)
+(for Chrome) or
+cookies.txt (https://addons.mozilla.org/en-US/firefox/addon/cookies-txt/)
(for Firefox).
.PP
Note that the cookies file must be in Mozilla/Netscape format and the
complete --command youtube-dl --long-option metadata-from-title --description 'Parse additional metadata like song title / artist from the video title. The format syntax is the same as --output. Regular expression with named capture groups may also be used. The parsed parameters replace existing values. Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like "Coldplay - Paradise". Example (regex): --metadata-from-title "(?P<artist>.+?) - (?P<title>.+)"'
complete --command youtube-dl --long-option xattrs --description 'Write metadata to the video file'"'"'s xattrs (using dublin core and xdg standards)'
complete --command youtube-dl --long-option fixup --description 'Automatically correct known faults of the file. One of never (do nothing), warn (only emit a warning), detect_or_warn (the default; fix file if we can, warn otherwise)'
-complete --command youtube-dl --long-option prefer-avconv --description 'Prefer avconv over ffmpeg for running the postprocessors (default)'
-complete --command youtube-dl --long-option prefer-ffmpeg --description 'Prefer ffmpeg over avconv for running the postprocessors'
+complete --command youtube-dl --long-option prefer-avconv --description 'Prefer avconv over ffmpeg for running the postprocessors'
+complete --command youtube-dl --long-option prefer-ffmpeg --description 'Prefer ffmpeg over avconv for running the postprocessors (default)'
complete --command youtube-dl --long-option ffmpeg-location --description 'Location of the ffmpeg/avconv binary; either the path to the binary or its containing directory.'
complete --command youtube-dl --long-option exec --description 'Execute a command on the file after downloading, similar to find'"'"'s -exec syntax. Example: --exec '"'"'adb push {} /sdcard/Music/ && rm {}'"'"''
complete --command youtube-dl --long-option convert-subs --description 'Convert the subtitles to other format (currently supported: srt|ass|vtt|lrc)'
http_chunk_size.
The following options are used by the post processors:
- prefer_ffmpeg: If True, use ffmpeg instead of avconv if both are available,
- otherwise prefer avconv.
+ prefer_ffmpeg: If False, use avconv instead of ffmpeg if both are available,
+ otherwise prefer ffmpeg.
postprocessor_args: A list of additional command-line arguments for the
postprocessor.
compat_numeric_types = (int, float, complex)
+try:
+ compat_integer_types = (int, long)
+except NameError: # Python 3
+ compat_integer_types = (int, )
+
+
if sys.version_info < (2, 7):
def compat_socket_create_connection(address, timeout, source_address=None):
host, port = address
'compat_http_client',
'compat_http_server',
'compat_input',
+ 'compat_integer_types',
'compat_itertools_count',
'compat_kwargs',
'compat_numeric_types',
from .fragment import FragmentFD
from ..compat import compat_urllib_error
-from ..utils import urljoin
+from ..utils import (
+ DownloadError,
+ urljoin,
+)
class DashSegmentsFD(FragmentFD):
count += 1
if count <= fragment_retries:
self.report_retry_fragment(err, frag_index, count, fragment_retries)
+ except DownloadError:
+ # Don't retry fragment if error occurred during HTTP downloading
+ # itself since it has its own retry settings
+ if not fatal:
+ self.report_skip_fragment(frag_index)
+ break
+ raise
+
if count > fragment_retries:
if not fatal:
self.report_skip_fragment(frag_index)
from ..utils import (
int_or_none,
strip_or_none,
+ url_or_none,
)
if not video_id:
entries = []
for episode in video_data.get('archiveEpisodes', []):
- episode_url = episode.get('url')
+ episode_url = url_or_none(episode.get('url'))
if not episode_url:
continue
entries.append(self.url_result(
determine_ext,
ExtractorError,
int_or_none,
+ url_or_none,
urlencode_postdata,
xpath_text,
)
file_elements = video_element.findall(compat_xpath('./file'))
one = len(file_elements) == 1
for file_num, file_element in enumerate(file_elements, start=1):
- file_url = file_element.text
+ file_url = url_or_none(file_element.text)
if not file_url:
continue
key = file_element.get('key', '')
from .common import InfoExtractor
from ..utils import (
- int_or_none,
- parse_iso8601,
- mimetype2ext,
determine_ext,
ExtractorError,
+ int_or_none,
+ mimetype2ext,
+ parse_iso8601,
+ url_or_none,
)
media_thumbnail = [media_thumbnail]
for thumbnail_data in media_thumbnail:
thumbnail = thumbnail_data.get('@attributes', {})
- thumbnail_url = thumbnail.get('url')
+ thumbnail_url = url_or_none(thumbnail.get('url'))
if not thumbnail_url:
continue
thumbnails.append({
media_subtitle = [media_subtitle]
for subtitle_data in media_subtitle:
subtitle = subtitle_data.get('@attributes', {})
- subtitle_href = subtitle.get('href')
+ subtitle_href = url_or_none(subtitle.get('href'))
if not subtitle_href:
continue
subtitles.setdefault(subtitle.get('lang') or 'en', []).append({
media_content = [media_content]
for media_data in media_content:
media = media_data.get('@attributes', {})
- media_url = media.get('url')
+ media_url = url_or_none(media.get('url'))
if not media_url:
continue
ext = mimetype2ext(media.get('type')) or determine_ext(media_url)
else:
formats.append({
'format_id': media_data.get('media-category', {}).get('@attributes', {}).get('label'),
- 'url': media['url'],
+ 'url': media_url,
'tbr': int_or_none(media.get('bitrate')),
'filesize': int_or_none(media.get('fileSize')),
'ext': ext,
determine_ext,
extract_attributes,
ExtractorError,
+ url_or_none,
urlencode_postdata,
urljoin,
)
}, fatal=False)
if not playlist:
continue
- stream_url = playlist.get('streamurl')
+ stream_url = url_or_none(playlist.get('streamurl'))
if stream_url:
rtmp = re.search(
r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+/))(?P<playpath>mp[34]:.+)',
'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582'
}
+ _API_KEY = '3hwbSuqqT690uxjNYBktSQpa5ZrpYYR0Iofx7NcJHyA'
+
_ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1'
_AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce'
+ _TESTS = [{
+ # from https://www.boston25news.com/news/watch-humpback-whale-breaches-right-next-to-fishing-boat-near-nh/817484874
+ 'url': 'anvato:8v9BEynrwx8EFLYpgfOWcG1qJqyXKlRM:4465496',
+ 'info_dict': {
+ 'id': '4465496',
+ 'ext': 'mp4',
+ 'title': 'VIDEO: Humpback whale breaches right next to NH boat',
+ 'description': 'VIDEO: Humpback whale breaches right next to NH boat. Footage courtesy: Zach Fahey.',
+ 'duration': 22,
+ 'timestamp': 1534855680,
+ 'upload_date': '20180821',
+ 'uploader': 'ANV',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # from https://sanfrancisco.cbslocal.com/2016/06/17/source-oakland-cop-on-leave-for-having-girlfriend-help-with-police-reports/
+ 'url': 'anvato:DVzl9QRzox3ZZsP9bNu5Li3X7obQOnqP:3417601',
+ 'only_matching': True,
+ }]
+
def __init__(self, *args, **kwargs):
super(AnvatoIE, self).__init__(*args, **kwargs)
self.__server_time = None
'api': {
'anvrid': anvrid,
'anvstk': md5_text('%s|%s|%d|%s' % (
- access_key, anvrid, server_time, self._ANVACK_TABLE[access_key])),
+ access_key, anvrid, server_time,
+ self._ANVACK_TABLE.get(access_key, self._API_KEY))),
'anvts': server_time,
},
}
mobj = re.match(self._VALID_URL, url)
access_key, video_id = mobj.group('access_key_or_mcp', 'id')
if access_key not in self._ANVACK_TABLE:
- access_key = self._MCP_TO_ACCESS_KEY_TABLE[access_key]
+ access_key = self._MCP_TO_ACCESS_KEY_TABLE.get(
+ access_key) or access_key
return self._get_anvato_videos(access_key, video_id)
from ..utils import (
ExtractorError,
int_or_none,
+ url_or_none,
)
formats.extend(self._extract_m3u8_formats(
m3u8_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
for rendition in video_data.get('renditions', []):
- video_url = rendition.get('url')
+ video_url = url_or_none(rendition.get('url'))
if not video_url:
continue
ext = rendition.get('format')
import re
from .common import InfoExtractor
-from ..compat import compat_str
from ..utils import (
determine_ext,
js_to_json,
+ url_or_none,
)
for source in sources:
if not isinstance(source, dict):
continue
- source_url = source.get('file')
- if not source_url or not isinstance(source_url, compat_str):
+ source_url = url_or_none(source.get('file'))
+ if not source_url:
continue
ext = determine_ext(source_url)
if ext == 'm3u8':
from ..utils import (
int_or_none,
mimetype2ext,
+ url_or_none,
)
formats = []
for item in file_list[0]:
- file_url = item.get('file')
+ file_url = url_or_none(item.get('file'))
if not file_url:
continue
ext = mimetype2ext(item.get('type'))
from .common import InfoExtractor
from .generic import GenericIE
-from ..compat import compat_str
from ..utils import (
determine_ext,
ExtractorError,
unified_strdate,
xpath_text,
update_url_query,
+ url_or_none,
)
from ..compat import compat_etree_fromstring
class ARDMediathekIE(InfoExtractor):
IE_NAME = 'ARD:mediathek'
- _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'
+ _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'
_TESTS = [{
# available till 26.07.2022
# m3u8 download
'skip_download': True,
}
+ }, {
+ 'url': 'https://one.ard.de/tv/Mord-mit-Aussicht/Mord-mit-Aussicht-6-39-T%C3%B6dliche-Nach/ONE/Video?bcastId=46384294&documentId=55586872',
+ 'only_matching': True,
}, {
# audio
'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086',
quality = stream.get('_quality')
server = stream.get('_server')
for stream_url in stream_urls:
- if not isinstance(stream_url, compat_str) or '//' not in stream_url:
+ if not url_or_none(stream_url):
continue
ext = determine_ext(stream_url)
if quality != 'auto' and ext in ('f4m', 'm3u8'):
'upload_date': upload_date,
'thumbnail': thumbnail,
}
+
+
+class ARDBetaMediathekIE(InfoExtractor):
+ _VALID_URL = r'https://beta\.ardmediathek\.de/[a-z]+/player/(?P<video_id>[a-zA-Z0-9]+)/(?P<display_id>[^/?#]+)'
+ _TESTS = [{
+ 'url': 'https://beta.ardmediathek.de/ard/player/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE/die-robuste-roswita',
+ 'md5': '2d02d996156ea3c397cfc5036b5d7f8f',
+ 'info_dict': {
+ 'display_id': 'die-robuste-roswita',
+ 'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE',
+ 'title': 'Tatort: Die robuste Roswita',
+ 'description': r're:^Der Mord.*trüber ist als die Ilm.',
+ 'duration': 5316,
+ 'thumbnail': 'https://img.ardmediathek.de/standard/00/55/43/59/34/-1774185891/16x9/960?mandant=ard',
+ 'upload_date': '20180826',
+ 'ext': 'mp4',
+ },
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('video_id')
+ display_id = mobj.group('display_id')
+
+ webpage = self._download_webpage(url, display_id)
+ data_json = self._search_regex(r'window\.__APOLLO_STATE__\s*=\s*(\{.*);\n', webpage, 'json')
+ data = self._parse_json(data_json, display_id)
+
+ res = {
+ 'id': video_id,
+ 'display_id': display_id,
+ }
+ formats = []
+ for widget in data.values():
+ if widget.get('_geoblocked'):
+ raise ExtractorError('This video is not available due to geoblocking', expected=True)
+
+ if '_duration' in widget:
+ res['duration'] = widget['_duration']
+ if 'clipTitle' in widget:
+ res['title'] = widget['clipTitle']
+ if '_previewImage' in widget:
+ res['thumbnail'] = widget['_previewImage']
+ if 'broadcastedOn' in widget:
+ res['upload_date'] = unified_strdate(widget['broadcastedOn'])
+ if 'synopsis' in widget:
+ res['description'] = widget['synopsis']
+ if '_subtitleUrl' in widget:
+ res['subtitles'] = {'de': [{
+ 'ext': 'ttml',
+ 'url': widget['_subtitleUrl'],
+ }]}
+ if '_quality' in widget:
+ format_url = widget['_stream']['json'][0]
+
+ if format_url.endswith('.f4m'):
+ formats.extend(self._extract_f4m_formats(
+ format_url + '?hdcore=3.11.0',
+ video_id, f4m_id='hds', fatal=False))
+ elif format_url.endswith('m3u8'):
+ formats.extend(self._extract_m3u8_formats(
+ format_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
+ else:
+ formats.append({
+ 'format_id': 'http-' + widget['_quality'],
+ 'url': format_url,
+ 'preference': 10, # Plain HTTP, that's nice
+ })
+
+ self._sort_formats(formats)
+ res['formats'] = formats
+
+ return res
from __future__ import unicode_literals
-import json
import random
import re
import time
int_or_none,
KNOWN_EXTENSIONS,
parse_filesize,
+ str_or_none,
+ try_get,
unescapeHTML,
update_url_query,
unified_strdate,
+ unified_timestamp,
+ url_or_none,
)
class BandcampIE(InfoExtractor):
- _VALID_URL = r'https?://.*?\.bandcamp\.com/track/(?P<title>[^/?#&]+)'
+ _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<title>[^/?#&]+)'
_TESTS = [{
'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
'md5': 'c557841d5e50261777a6585648adf439',
},
'_skip': 'There is a limit of 200 free downloads / month for the test song'
}, {
+ # free download
'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
- 'md5': '0369ace6b939f0927e62c67a1a8d9fa7',
+ 'md5': '853e35bf34aa1d6fe2615ae612564b36',
'info_dict': {
'id': '2650410135',
'ext': 'aiff',
'title': 'Ben Prunty - Lanius (Battle)',
+ 'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Ben Prunty',
+ 'timestamp': 1396508491,
+ 'upload_date': '20140403',
+ 'release_date': '20140403',
+ 'duration': 260.877,
+ 'track': 'Lanius (Battle)',
+ 'track_number': 1,
+ 'track_id': '2650410135',
+ 'artist': 'Ben Prunty',
+ 'album': 'FTL: Advanced Edition Soundtrack',
+ },
+ }, {
+ # no free download, mp3 128
+ 'url': 'https://relapsealumni.bandcamp.com/track/hail-to-fire',
+ 'md5': 'fec12ff55e804bb7f7ebeb77a800c8b7',
+ 'info_dict': {
+ 'id': '2584466013',
+ 'ext': 'mp3',
+ 'title': 'Mastodon - Hail to Fire',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'uploader': 'Mastodon',
+ 'timestamp': 1322005399,
+ 'upload_date': '20111122',
+ 'release_date': '20040207',
+ 'duration': 120.79,
+ 'track': 'Hail to Fire',
+ 'track_number': 5,
+ 'track_id': '2584466013',
+ 'artist': 'Mastodon',
+ 'album': 'Call of the Mastodon',
},
}]
title = mobj.group('title')
webpage = self._download_webpage(url, title)
thumbnail = self._html_search_meta('og:image', webpage, default=None)
- m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
- if not m_download:
- m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage)
- if m_trackinfo:
- json_code = m_trackinfo.group(1)
- data = json.loads(json_code)[0]
- track_id = compat_str(data['id'])
-
- if not data.get('file'):
- raise ExtractorError('Not streamable', video_id=track_id, expected=True)
-
- formats = []
- for format_id, format_url in data['file'].items():
+
+ track_id = None
+ track = None
+ track_number = None
+ duration = None
+
+ formats = []
+ track_info = self._parse_json(
+ self._search_regex(
+ r'trackinfo\s*:\s*\[\s*({.+?})\s*\]\s*,\s*?\n',
+ webpage, 'track info', default='{}'), title)
+ if track_info:
+ file_ = track_info.get('file')
+ if isinstance(file_, dict):
+ for format_id, format_url in file_.items():
+ if not url_or_none(format_url):
+ continue
ext, abr_str = format_id.split('-', 1)
formats.append({
'format_id': format_id,
'acodec': ext,
'abr': int_or_none(abr_str),
})
+ track = track_info.get('title')
+ track_id = str_or_none(track_info.get('track_id') or track_info.get('id'))
+ track_number = int_or_none(track_info.get('track_num'))
+ duration = float_or_none(track_info.get('duration'))
+
+ def extract(key):
+ return self._search_regex(
+ r'\b%s\s*["\']?\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % key,
+ webpage, key, default=None, group='value')
+
+ artist = extract('artist')
+ album = extract('album_title')
+ timestamp = unified_timestamp(
+ extract('publish_date') or extract('album_publish_date'))
+ release_date = unified_strdate(extract('album_release_date'))
+
+ download_link = self._search_regex(
+ r'freeDownloadPage\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
+ 'download link', default=None, group='url')
+ if download_link:
+ track_id = self._search_regex(
+ r'(?ms)var TralbumData = .*?[{,]\s*id: (?P<id>\d+),?$',
+ webpage, 'track id')
+
+ download_webpage = self._download_webpage(
+ download_link, track_id, 'Downloading free downloads page')
+
+ blob = self._parse_json(
+ self._search_regex(
+ r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage,
+ 'blob', group='blob'),
+ track_id, transform_source=unescapeHTML)
+
+ info = try_get(
+ blob, (lambda x: x['digital_items'][0],
+ lambda x: x['download_items'][0]), dict)
+ if info:
+ downloads = info.get('downloads')
+ if isinstance(downloads, dict):
+ if not track:
+ track = info.get('title')
+ if not artist:
+ artist = info.get('artist')
+ if not thumbnail:
+ thumbnail = info.get('thumb_url')
+
+ download_formats = {}
+ download_formats_list = blob.get('download_formats')
+ if isinstance(download_formats_list, list):
+ for f in blob['download_formats']:
+ name, ext = f.get('name'), f.get('file_extension')
+ if all(isinstance(x, compat_str) for x in (name, ext)):
+ download_formats[name] = ext.strip('.')
+
+ for format_id, f in downloads.items():
+ format_url = f.get('url')
+ if not format_url:
+ continue
+ # Stat URL generation algorithm is reverse engineered from
+ # download_*_bundle_*.js
+ stat_url = update_url_query(
+ format_url.replace('/download/', '/statdownload/'), {
+ '.rand': int(time.time() * 1000 * random.random()),
+ })
+ format_id = f.get('encoding_name') or format_id
+ stat = self._download_json(
+ stat_url, track_id, 'Downloading %s JSON' % format_id,
+ transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1],
+ fatal=False)
+ if not stat:
+ continue
+ retry_url = url_or_none(stat.get('retry_url'))
+ if not retry_url:
+ continue
+ formats.append({
+ 'url': self._proto_relative_url(retry_url, 'http:'),
+ 'ext': download_formats.get(format_id),
+ 'format_id': format_id,
+ 'format_note': f.get('description'),
+ 'filesize': parse_filesize(f.get('size_mb')),
+ 'vcodec': 'none',
+ })
- self._sort_formats(formats)
-
- return {
- 'id': track_id,
- 'title': data['title'],
- 'thumbnail': thumbnail,
- 'formats': formats,
- 'duration': float_or_none(data.get('duration')),
- }
- else:
- raise ExtractorError('No free songs found')
-
- download_link = m_download.group(1)
- video_id = self._search_regex(
- r'(?ms)var TralbumData = .*?[{,]\s*id: (?P<id>\d+),?$',
- webpage, 'video id')
-
- download_webpage = self._download_webpage(
- download_link, video_id, 'Downloading free downloads page')
-
- blob = self._parse_json(
- self._search_regex(
- r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage,
- 'blob', group='blob'),
- video_id, transform_source=unescapeHTML)
-
- info = blob['digital_items'][0]
-
- downloads = info['downloads']
- track = info['title']
+ self._sort_formats(formats)
- artist = info.get('artist')
title = '%s - %s' % (artist, track) if artist else track
- download_formats = {}
- for f in blob['download_formats']:
- name, ext = f.get('name'), f.get('file_extension')
- if all(isinstance(x, compat_str) for x in (name, ext)):
- download_formats[name] = ext.strip('.')
-
- formats = []
- for format_id, f in downloads.items():
- format_url = f.get('url')
- if not format_url:
- continue
- # Stat URL generation algorithm is reverse engineered from
- # download_*_bundle_*.js
- stat_url = update_url_query(
- format_url.replace('/download/', '/statdownload/'), {
- '.rand': int(time.time() * 1000 * random.random()),
- })
- format_id = f.get('encoding_name') or format_id
- stat = self._download_json(
- stat_url, video_id, 'Downloading %s JSON' % format_id,
- transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1],
- fatal=False)
- if not stat:
- continue
- retry_url = stat.get('retry_url')
- if not isinstance(retry_url, compat_str):
- continue
- formats.append({
- 'url': self._proto_relative_url(retry_url, 'http:'),
- 'ext': download_formats.get(format_id),
- 'format_id': format_id,
- 'format_note': f.get('description'),
- 'filesize': parse_filesize(f.get('size_mb')),
- 'vcodec': 'none',
- })
- self._sort_formats(formats)
+ if not duration:
+ duration = float_or_none(self._html_search_meta(
+ 'duration', webpage, default=None))
return {
- 'id': video_id,
+ 'id': track_id,
'title': title,
- 'thumbnail': info.get('thumb_url') or thumbnail,
- 'uploader': info.get('artist'),
- 'artist': artist,
+ 'thumbnail': thumbnail,
+ 'uploader': artist,
+ 'timestamp': timestamp,
+ 'release_date': release_date,
+ 'duration': duration,
'track': track,
+ 'track_number': track_number,
+ 'track_id': track_id,
+ 'artist': artist,
+ 'album': album,
'formats': formats,
}
formats = []
for format_id, format_url in show['audio_stream'].items():
- if not isinstance(format_url, compat_str):
+ if not url_or_none(format_url):
continue
for known_ext in KNOWN_EXTENSIONS:
if known_ext in format_id:
urljoin,
)
from ..compat import (
- compat_etree_fromstring,
compat_HTTPError,
compat_urlparse,
)
class BBCCoUkIE(InfoExtractor):
IE_NAME = 'bbc.co.uk'
IE_DESC = 'BBC iPlayer'
- _ID_REGEX = r'[pbw][\da-z]{7}'
+ _ID_REGEX = r'(?:[pbm][\da-z]{7}|w[\da-z]{7,14})'
_VALID_URL = r'''(?x)
https?://
(?:www\.)?bbc\.co\.uk/
}, {
'url': 'http://www.bbc.co.uk/programmes/w3csv1y9',
'only_matching': True,
+ }, {
+ 'url': 'https://www.bbc.co.uk/programmes/m00005xn',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.bbc.co.uk/programmes/w172w4dww1jqt5s',
+ 'only_matching': True,
}]
_USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8'
self._raise_extractor_error(last_exception)
def _download_media_selector_url(self, url, programme_id=None):
- try:
- media_selection = self._download_xml(
- url, programme_id, 'Downloading media selection XML')
- except ExtractorError as ee:
- if isinstance(ee.cause, compat_HTTPError) and ee.cause.code in (403, 404):
- media_selection = compat_etree_fromstring(ee.cause.read().decode('utf-8'))
- else:
- raise
+ media_selection = self._download_xml(
+ url, programme_id, 'Downloading media selection XML',
+ expected_status=(403, 404))
return self._process_media_selector(media_selection, programme_id)
def _process_media_selector(self, media_selection, programme_id):
'params': {
'skip_download': True,
}
+ }, {
+ # window.__PRELOADED_STATE__
+ 'url': 'https://www.bbc.co.uk/radio/play/b0b9z4yl',
+ 'info_dict': {
+ 'id': 'b0b9z4vz',
+ 'ext': 'mp4',
+ 'title': 'Prom 6: An American in Paris and Turangalila',
+ 'description': 'md5:51cf7d6f5c8553f197e58203bc78dff8',
+ 'uploader': 'Radio 3',
+ 'uploader_id': 'bbc_radio_three',
+ },
}]
@classmethod
'subtitles': subtitles,
}
+ preload_state = self._parse_json(self._search_regex(
+ r'window\.__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
+ 'preload state', default='{}'), playlist_id, fatal=False)
+ if preload_state:
+ current_programme = preload_state.get('programmes', {}).get('current') or {}
+ programme_id = current_programme.get('id')
+ if current_programme and programme_id and current_programme.get('type') == 'playable_item':
+ title = current_programme.get('titles', {}).get('tertiary') or playlist_title
+ formats, subtitles = self._download_media_selector(programme_id)
+ self._sort_formats(formats)
+ synopses = current_programme.get('synopses') or {}
+ network = current_programme.get('network') or {}
+ duration = int_or_none(
+ current_programme.get('duration', {}).get('value'))
+ thumbnail = None
+ image_url = current_programme.get('image_url')
+ if image_url:
+ thumbnail = image_url.replace('{recipe}', '1920x1920')
+ return {
+ 'id': programme_id,
+ 'title': title,
+ 'description': dict_get(synopses, ('long', 'medium', 'short')),
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'uploader': network.get('short_title'),
+ 'uploader_id': network.get('id'),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
+
bbc3_config = self._parse_json(
self._search_regex(
r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+import re
+
+from .common import InfoExtractor
+from ..utils import urlencode_postdata
+
+
+class BitChuteIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?:video|embed|torrent/[^/]+)/(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'https://www.bitchute.com/video/szoMrox2JEI/',
+ 'md5': '66c4a70e6bfc40dcb6be3eb1d74939eb',
+ 'info_dict': {
+ 'id': 'szoMrox2JEI',
+ 'ext': 'mp4',
+ 'title': 'Fuck bitches get money',
+ 'description': 'md5:3f21f6fb5b1d17c3dee9cf6b5fe60b3a',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'uploader': 'Victoria X Rave',
+ },
+ }, {
+ 'url': 'https://www.bitchute.com/embed/lbb5G1hjPhw/',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.bitchute.com/torrent/Zee5BE49045h/szoMrox2JEI.webtorrent',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(
+ 'https://www.bitchute.com/video/%s' % video_id, video_id, headers={
+ 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.57 Safari/537.36',
+ })
+
+ title = self._search_regex(
+ (r'<[^>]+\bid=["\']video-title[^>]+>([^<]+)', r'<title>([^<]+)'),
+ webpage, 'title', default=None) or self._html_search_meta(
+ 'description', webpage, 'title',
+ default=None) or self._og_search_description(webpage)
+
+ formats = [
+ {'url': mobj.group('url')}
+ for mobj in re.finditer(
+ r'addWebSeed\s*\(\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage)]
+ self._sort_formats(formats)
+
+ description = self._html_search_regex(
+ r'(?s)<div\b[^>]+\bclass=["\']full hidden[^>]+>(.+?)</div>',
+ webpage, 'description', fatal=False)
+ thumbnail = self._og_search_thumbnail(
+ webpage, default=None) or self._html_search_meta(
+ 'twitter:image:src', webpage, 'thumbnail')
+ uploader = self._html_search_regex(
+ r'(?s)<p\b[^>]+\bclass=["\']video-author[^>]+>(.+?)</p>', webpage,
+ 'uploader', fatal=False)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'uploader': uploader,
+ 'formats': formats,
+ }
+
+
+class BitChuteChannelIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?bitchute\.com/channel/(?P<id>[^/?#&]+)'
+ _TEST = {
+ 'url': 'https://www.bitchute.com/channel/victoriaxrave/',
+ 'playlist_mincount': 185,
+ 'info_dict': {
+ 'id': 'victoriaxrave',
+ },
+ }
+
+ _TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7'
+
+ def _entries(self, channel_id):
+ channel_url = 'https://www.bitchute.com/channel/%s/' % channel_id
+ offset = 0
+ for page_num in itertools.count(1):
+ data = self._download_json(
+ '%sextend/' % channel_url, channel_id,
+ 'Downloading channel page %d' % page_num,
+ data=urlencode_postdata({
+ 'csrfmiddlewaretoken': self._TOKEN,
+ 'name': '',
+ 'offset': offset,
+ }), headers={
+ 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
+ 'Referer': channel_url,
+ 'X-Requested-With': 'XMLHttpRequest',
+ 'Cookie': 'csrftoken=%s' % self._TOKEN,
+ })
+ if data.get('success') is False:
+ break
+ html = data.get('html')
+ if not html:
+ break
+ video_ids = re.findall(
+ r'class=["\']channel-videos-image-container[^>]+>\s*<a\b[^>]+\bhref=["\']/video/([^"\'/]+)',
+ html)
+ if not video_ids:
+ break
+ offset += len(video_ids)
+ for video_id in video_ids:
+ yield self.url_result(
+ 'https://www.bitchute.com/video/%s' % video_id,
+ ie=BitChuteIE.ie_key(), video_id=video_id)
+
+ def _real_extract(self, url):
+ channel_id = self._match_id(url)
+ return self.playlist_result(
+ self._entries(channel_id), playlist_id=channel_id)
from .common import InfoExtractor
from .youtube import YoutubeIE
-from ..compat import compat_str
-from ..utils import int_or_none
+from ..utils import (
+ int_or_none,
+ url_or_none,
+)
class BreakIE(InfoExtractor):
formats = []
for video in content:
- video_url = video.get('url')
- if not video_url or not isinstance(video_url, compat_str):
+ video_url = url_or_none(video.get('url'))
+ if not video_url:
continue
bitrate = int_or_none(self._search_regex(
r'(\d+)_kbps', video_url, 'tbr', default=None))
container = source.get('container')
ext = mimetype2ext(source.get('type'))
src = source.get('src')
- if ext == 'ism' or container == 'WVM':
+ # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object
+ if ext == 'ism' or container == 'WVM' or source.get('key_systems'):
continue
elif ext == 'm3u8' or container == 'M2TS':
if not src:
'format_id': build_format_id('rtmp'),
})
formats.append(f)
+ if not formats:
+ # for sonyliv.com DRM protected videos
+ s3_source_url = json_data.get('custom_fields', {}).get('s3sourceurl')
+ if s3_source_url:
+ formats.append({
+ 'url': s3_source_url,
+ 'format_id': 'source',
+ })
errors = json_data.get('errors')
if not formats and errors:
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import compat_str
from ..utils import (
ExtractorError,
int_or_none,
+ url_or_none,
)
for media in encodings:
if not isinstance(media, dict):
continue
- media_url = media.get('location')
- if not media_url or not isinstance(media_url, compat_str):
+ media_url = url_or_none(media.get('location'))
+ if not media_url:
continue
format_id_list = [format_id]
strip_or_none,
float_or_none,
int_or_none,
+ merge_dicts,
parse_iso8601,
)
webpage, urlh = self._download_webpage_handle(url, display_id)
- title = self._html_search_regex(
+ info = self._search_json_ld(webpage, display_id, default={})
+
+ # title is optional here since it may be extracted by extractor
+ # that is delegated from here
+ title = strip_or_none(self._html_search_regex(
r'(?ms)<h1 class="content__heading">(.+?)</h1>',
- webpage, 'title').strip()
+ webpage, 'title', default=None))
description = self._html_search_regex(
r'(?ms)<div class="content__description">(.+?)</div>',
# the first one
video_id = list(video.values())[0].get('videoid')
- return {
+ return merge_dicts(info, {
'_type': 'url_transparent',
'url': 'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id,
'ie_key': CanvasIE.ie_key(),
'season_number': season_number,
'episode_number': episode_number,
'release_date': release_date,
- }
+ })
import re
from .common import InfoExtractor
-from ..compat import compat_str
from ..utils import (
clean_html,
int_or_none,
parse_duration,
parse_iso8601,
parse_resolution,
+ url_or_none,
)
media_url = media['media']['url']
if isinstance(media_url, list):
for format_ in media_url:
- format_url = format_.get('file')
- if not format_url or not isinstance(format_url, compat_str):
+ format_url = url_or_none(format_.get('file'))
+ if not format_url:
continue
label = format_.get('label')
f = parse_resolution(label)
for user_agent in (None, USER_AGENTS['Safari']):
req = sanitized_Request(
- 'http://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist',
+ 'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist',
data=urlencode_postdata(data))
req.add_header('Content-type', 'application/x-www-form-urlencoded')
from __future__ import unicode_literals
from .common import InfoExtractor
+from ..compat import (
+ compat_parse_qs,
+ compat_urllib_parse_urlparse,
+)
from ..utils import (
float_or_none,
- parse_iso8601,
+ unified_timestamp,
)
class ClypIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?clyp\.it/(?P<id>[a-z0-9]+)'
- _TEST = {
+ _TESTS = [{
'url': 'https://clyp.it/ojz2wfah',
'md5': '1d4961036c41247ecfdcc439c0cddcbb',
'info_dict': {
'timestamp': 1443515251,
'upload_date': '20150929',
},
- }
+ }, {
+ 'url': 'https://clyp.it/b04p1odi?token=b0078e077e15835845c528a44417719d',
+ 'info_dict': {
+ 'id': 'b04p1odi',
+ 'ext': 'mp3',
+ 'title': 'GJ! (Reward Edit)',
+ 'description': 'Metal Resistance (THE ONE edition)',
+ 'duration': 177.789,
+ 'timestamp': 1528241278,
+ 'upload_date': '20180605',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }]
def _real_extract(self, url):
audio_id = self._match_id(url)
+ qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+ token = qs.get('token', [None])[0]
+
+ query = {}
+ if token:
+ query['token'] = token
+
metadata = self._download_json(
- 'https://api.clyp.it/%s' % audio_id, audio_id)
+ 'https://api.clyp.it/%s' % audio_id, audio_id, query=query)
formats = []
for secure in ('', 'Secure'):
title = metadata['Title']
description = metadata.get('Description')
duration = float_or_none(metadata.get('Duration'))
- timestamp = parse_iso8601(metadata.get('DateCreated'))
+ timestamp = unified_timestamp(metadata.get('DateCreated'))
return {
'id': audio_id,
compat_cookies,
compat_etree_fromstring,
compat_getpass,
+ compat_integer_types,
compat_http_client,
compat_os_name,
compat_str,
GeoUtils,
int_or_none,
js_to_json,
+ JSON_LD_RE,
mimetype2ext,
orderedSet,
parse_codecs,
def IE_NAME(self):
return compat_str(type(self).__name__[:-2])
- def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
- """ Returns the response handle """
+ @staticmethod
+ def __can_accept_status_code(err, expected_status):
+ assert isinstance(err, compat_urllib_error.HTTPError)
+ if expected_status is None:
+ return False
+ if isinstance(expected_status, compat_integer_types):
+ return err.code == expected_status
+ elif isinstance(expected_status, (list, tuple)):
+ return err.code in expected_status
+ elif callable(expected_status):
+ return expected_status(err.code) is True
+ else:
+ assert False
+
+ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None):
+ """
+ Return the response handle.
+
+ See _download_webpage docstring for arguments specification.
+ """
if note is None:
self.report_download_webpage(video_id)
elif note is not False:
try:
return self._downloader.urlopen(url_or_request)
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+ if isinstance(err, compat_urllib_error.HTTPError):
+ if self.__can_accept_status_code(err, expected_status):
+ return err.fp
+
if errnote is False:
return False
if errnote is None:
self._downloader.report_warning(errmsg)
return False
- def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
- """ Returns a tuple (page content as string, URL handle) """
+ def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
+ """
+ Return a tuple (page content as string, URL handle).
+
+ See _download_webpage docstring for arguments specification.
+ """
# Strip hashes from the URL (#1038)
if isinstance(url_or_request, (compat_str, str)):
url_or_request = url_or_request.partition('#')[0]
- urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
+ urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status)
if urlh is False:
assert not fatal
return False
return content
- def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
- """ Returns the data of the page as a string """
+ def _download_webpage(
+ self, url_or_request, video_id, note=None, errnote=None,
+ fatal=True, tries=1, timeout=5, encoding=None, data=None,
+ headers={}, query={}, expected_status=None):
+ """
+ Return the data of the page as a string.
+
+ Arguments:
+ url_or_request -- plain text URL as a string or
+ a compat_urllib_request.Request object
+ video_id -- Video/playlist/item identifier (string)
+
+ Keyword arguments:
+ note -- note printed before downloading (string)
+ errnote -- note printed in case of an error (string)
+ fatal -- flag denoting whether error should be considered fatal,
+ i.e. whether it should cause ExtractorError to be raised,
+ otherwise a warning will be reported and extraction continued
+ tries -- number of tries
+ timeout -- sleep interval between tries
+ encoding -- encoding for a page content decoding, guessed automatically
+ when not explicitly specified
+ data -- POST data (bytes)
+ headers -- HTTP headers (dict)
+ query -- URL query (dict)
+ expected_status -- allows accepting failed HTTP requests (non-2xx
+ status code) by explicitly specifying a set of accepted status
+ codes. Can be any of the following entities:
+ - an integer type specifying an exact failed status code to
+ accept
+ - a list or a tuple of integer types specifying a list of
+ failed status codes to accept
+ - a callable accepting an actual failed status code and
+ returning True if it should be accepted
+ Note that this argument does not affect success status codes (2xx)
+ which are always accepted.
+ """
+
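As a usage sketch of the new `expected_status` argument, each of the three accepted forms looks like this (the call sites below are hypothetical, not part of this patch):
```
# Accept exactly one failed status code
webpage = self._download_webpage(url, video_id, expected_status=404)

# Accept a list/tuple of failed status codes
data, urlh = self._download_json_handle(
    api_url, video_id, expected_status=(403, 404))

# Accept any 4xx status via a callable
page = self._download_webpage(
    url, video_id, expected_status=lambda status: 400 <= status < 500)
```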
success = False
try_count = 0
while success is False:
try:
- res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
+ res = self._download_webpage_handle(
+ url_or_request, video_id, note, errnote, fatal,
+ encoding=encoding, data=data, headers=headers, query=query,
+ expected_status=expected_status)
success = True
except compat_http_client.IncompleteRead as e:
try_count += 1
def _download_xml_handle(
self, url_or_request, video_id, note='Downloading XML',
errnote='Unable to download XML', transform_source=None,
- fatal=True, encoding=None, data=None, headers={}, query={}):
- """Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle)"""
+ fatal=True, encoding=None, data=None, headers={}, query={},
+ expected_status=None):
+ """
+ Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle).
+
+ See _download_webpage docstring for arguments specification.
+ """
res = self._download_webpage_handle(
url_or_request, video_id, note, errnote, fatal=fatal,
- encoding=encoding, data=data, headers=headers, query=query)
+ encoding=encoding, data=data, headers=headers, query=query,
+ expected_status=expected_status)
if res is False:
return res
xml_string, urlh = res
xml_string, video_id, transform_source=transform_source,
fatal=fatal), urlh
- def _download_xml(self, url_or_request, video_id,
- note='Downloading XML', errnote='Unable to download XML',
- transform_source=None, fatal=True, encoding=None,
- data=None, headers={}, query={}):
- """Return the xml as an xml.etree.ElementTree.Element"""
+ def _download_xml(
+ self, url_or_request, video_id,
+ note='Downloading XML', errnote='Unable to download XML',
+ transform_source=None, fatal=True, encoding=None,
+ data=None, headers={}, query={}, expected_status=None):
+ """
+ Return the xml as an xml.etree.ElementTree.Element.
+
+ See _download_webpage docstring for arguments specification.
+ """
res = self._download_xml_handle(
url_or_request, video_id, note=note, errnote=errnote,
transform_source=transform_source, fatal=fatal, encoding=encoding,
- data=data, headers=headers, query=query)
+ data=data, headers=headers, query=query,
+ expected_status=expected_status)
return res if res is False else res[0]
def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
def _download_json_handle(
self, url_or_request, video_id, note='Downloading JSON metadata',
errnote='Unable to download JSON metadata', transform_source=None,
- fatal=True, encoding=None, data=None, headers={}, query={}):
- """Return a tuple (JSON object, URL handle)"""
+ fatal=True, encoding=None, data=None, headers={}, query={},
+ expected_status=None):
+ """
+ Return a tuple (JSON object, URL handle).
+
+ See _download_webpage docstring for arguments specification.
+ """
res = self._download_webpage_handle(
url_or_request, video_id, note, errnote, fatal=fatal,
- encoding=encoding, data=data, headers=headers, query=query)
+ encoding=encoding, data=data, headers=headers, query=query,
+ expected_status=expected_status)
if res is False:
return res
json_string, urlh = res
def _download_json(
self, url_or_request, video_id, note='Downloading JSON metadata',
errnote='Unable to download JSON metadata', transform_source=None,
- fatal=True, encoding=None, data=None, headers={}, query={}):
+ fatal=True, encoding=None, data=None, headers={}, query={},
+ expected_status=None):
+ """
+ Return the JSON object as a dict.
+
+ See the _download_webpage docstring for the argument specification.
+ """
res = self._download_json_handle(
url_or_request, video_id, note=note, errnote=errnote,
transform_source=transform_source, fatal=fatal, encoding=encoding,
- data=data, headers=headers, query=query)
+ data=data, headers=headers, query=query,
+ expected_status=expected_status)
return res if res is False else res[0]
def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
json_ld = self._search_regex(
- r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
- html, 'JSON-LD', group='json_ld', **kwargs)
+ JSON_LD_RE, html, 'JSON-LD', group='json_ld', **kwargs)
default = kwargs.get('default', NO_DEFAULT)
if not json_ld:
return default if default is not NO_DEFAULT else {}
'height': height,
})
formats.extend(m3u8_formats)
- continue
-
- if src_ext == 'f4m':
+ elif src_ext == 'f4m':
f4m_url = src_url
if not f4m_params:
f4m_params = {
f4m_url += '&' if '?' in f4m_url else '?'
f4m_url += compat_urllib_parse_urlencode(f4m_params)
formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
- continue
-
- if src_url.startswith('http') and self._is_valid_url(src, video_id):
+ elif src_ext == 'mpd':
+ formats.extend(self._extract_mpd_formats(
+ src_url, video_id, mpd_id='dash', fatal=False))
+ elif re.search(r'\.ism/[Mm]anifest', src_url):
+ formats.extend(self._extract_ism_formats(
+ src_url, video_id, ism_id='mss', fatal=False))
+ elif src_url.startswith('http') and self._is_valid_url(src, video_id):
http_count += 1
formats.append({
'url': src_url,
'width': width,
'height': height,
})
- continue
return formats
representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
def prepare_template(template_name, identifiers):
- t = representation_ms_info[template_name]
+ tmpl = representation_ms_info[template_name]
+ # First off, % characters outside $...$ templates
+ # must be escaped by doubling for proper processing
+ # by the % string-formatting operator used below (see
+ # https://github.com/rg3/youtube-dl/issues/16867).
+ t = ''
+ in_template = False
+ for c in tmpl:
+ t += c
+ if c == '$':
+ in_template = not in_template
+ elif c == '%' and not in_template:
+ t += c
+ # Next, $...$ templates are translated to their
+ # %(...) counterparts to be used with the % operator
t = t.replace('$RepresentationID$', representation_id)
t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
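+ # Illustrative example: tmpl 'seg-$Number%05d$-50%.m4s' is escaped to
+ # 'seg-$Number%05d$-50%%.m4s' and then translated to
+ # 'seg-%(Number)05d-50%%.m4s', so that t % {'Number': 3} yields
+ # 'seg-00003-50%.m4s'.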
media_info['subtitles'].setdefault(lang, []).append({
'url': absolute_url(src),
})
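+ # Attach the page URL as Referer to every extracted format; some CDNs
+ # are assumed to reject media requests that lack it.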
+ for f in media_info['formats']:
+ f.setdefault('http_headers', {})['Referer'] = base_url
if media_info['formats'] or media_info['subtitles']:
entries.append(media_info)
return entries
import re
from .common import InfoExtractor
-from ..compat import (
- compat_str,
- compat_HTTPError,
-)
+from ..compat import compat_HTTPError
from ..utils import (
determine_ext,
float_or_none,
int_or_none,
parse_age_limit,
parse_duration,
+ url_or_none,
ExtractorError
)
for e in media['MediaURLs']:
if e.get('UseDRM') is True:
continue
- format_url = e.get('Path')
- if not format_url or not isinstance(format_url, compat_str):
+ format_url = url_or_none(e.get('Path'))
+ if not format_url:
continue
ext = determine_ext(format_url)
if ext == 'm3u8':
for cc_file in cc_files:
if not isinstance(cc_file, dict):
continue
- cc_url = cc_file.get('Path')
- if not cc_url or not isinstance(cc_url, compat_str):
+ cc_url = url_or_none(cc_file.get('Path'))
+ if not cc_url:
continue
lang = cc_file.get('Locale') or 'en'
subtitles.setdefault(lang, []).append({'url': cc_url})
from hashlib import sha1
from math import pow, sqrt, floor
from .common import InfoExtractor
+from .vrv import VRVIE
from ..compat import (
compat_b64decode,
compat_etree_fromstring,
from ..utils import (
ExtractorError,
bytes_to_intlist,
+ extract_attributes,
+ float_or_none,
intlist_to_bytes,
int_or_none,
lowercase_escape,
unified_strdate,
urlencode_postdata,
xpath_text,
- extract_attributes,
)
from ..aes import (
aes_cbc_decrypt,
parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
-class CrunchyrollIE(CrunchyrollBaseIE):
+class CrunchyrollIE(CrunchyrollBaseIE, VRVIE):
+ IE_NAME = 'crunchyroll'
_VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|[^/]*/[^/?&]*?)(?P<video_id>[0-9]+))(?:[/?&]|$)'
_TESTS = [{
'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513',
'ext': 'mp4',
'title': 'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is Born!',
'description': 'md5:2d17137920c64f2f49981a7797d275ef',
- 'thumbnail': 'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg',
+ 'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Yomiuri Telecasting Corporation (YTV)',
'upload_date': '20131013',
'url': 're:(?!.*&)',
'info_dict': {
'id': '535080',
'ext': 'mp4',
- 'title': '11eyes Episode 1 – Piros éjszaka - Red Night',
+ 'title': '11eyes Episode 1 – Red Night ~ Piros éjszaka',
'description': 'Kakeru and Yuka are thrown into an alternate nightmarish world they call "Red Night".',
'uploader': 'Marvelous AQL Inc.',
'upload_date': '20091021',
# Just test metadata extraction
'skip_download': True,
},
+ }, {
+ 'url': 'http://www.crunchyroll.com/media-723735',
+ 'only_matching': True,
}]
_FORMAT_IDS = {
if 'To view this, please log in to verify you are 18 or older.' in webpage:
self.raise_login_required()
+ media = self._parse_json(self._search_regex(
+ r'vilos\.config\.media\s*=\s*({.+?});',
+ webpage, 'vilos media', default='{}'), video_id)
+ media_metadata = media.get('metadata') or {}
+
video_title = self._html_search_regex(
r'(?s)<h1[^>]*>((?:(?!<h1).)*?<span[^>]+itemprop=["\']title["\'][^>]*>(?:(?!<h1).)+?)</h1>',
webpage, 'video_title')
video_title = re.sub(r' {2,}', ' ', video_title)
- video_description = self._parse_json(self._html_search_regex(
+ video_description = (self._parse_json(self._html_search_regex(
r'<script[^>]*>\s*.+?\[media_id=%s\].+?({.+?"description"\s*:.+?})\);' % video_id,
- webpage, 'description', default='{}'), video_id).get('description')
+ webpage, 'description', default='{}'), video_id) or media_metadata).get('description')
if video_description:
video_description = lowercase_escape(video_description.replace(r'\r\n', '\n'))
video_upload_date = self._html_search_regex(
[r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', r'<div>\s*Publisher:\s*<span>\s*(.+?)\s*</span>\s*</div>'],
webpage, 'video_uploader', fatal=False)
- available_fmts = []
- for a, fmt in re.findall(r'(<a[^>]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage):
- attrs = extract_attributes(a)
- href = attrs.get('href')
- if href and '/freetrial' in href:
- continue
- available_fmts.append(fmt)
- if not available_fmts:
- for p in (r'token=["\']showmedia\.([0-9]{3,4})p"', r'showmedia\.([0-9]{3,4})p'):
- available_fmts = re.findall(p, webpage)
- if available_fmts:
- break
- video_encode_ids = []
formats = []
- for fmt in available_fmts:
- stream_quality, stream_format = self._FORMAT_IDS[fmt]
- video_format = fmt + 'p'
- stream_infos = []
- streamdata = self._call_rpc_api(
- 'VideoPlayer_GetStandardConfig', video_id,
- 'Downloading media info for %s' % video_format, data={
- 'media_id': video_id,
- 'video_format': stream_format,
- 'video_quality': stream_quality,
- 'current_page': url,
- })
- if streamdata is not None:
- stream_info = streamdata.find('./{default}preload/stream_info')
+ for stream in media.get('streams', []):
+ formats.extend(self._extract_vrv_formats(
+ stream.get('url'), video_id, stream.get('format'),
+ stream.get('audio_lang'), stream.get('hardsub_lang')))
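+ # Fall back to the legacy showmedia RPC flow only when the vilos
+ # config yields no streams: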
+ if not formats:
+ available_fmts = []
+ for a, fmt in re.findall(r'(<a[^>]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage):
+ attrs = extract_attributes(a)
+ href = attrs.get('href')
+ if href and '/freetrial' in href:
+ continue
+ available_fmts.append(fmt)
+ if not available_fmts:
+ for p in (r'token=["\']showmedia\.([0-9]{3,4})p"', r'showmedia\.([0-9]{3,4})p'):
+ available_fmts = re.findall(p, webpage)
+ if available_fmts:
+ break
+ if not available_fmts:
+ available_fmts = self._FORMAT_IDS.keys()
+ video_encode_ids = []
+
+ for fmt in available_fmts:
+ stream_quality, stream_format = self._FORMAT_IDS[fmt]
+ video_format = fmt + 'p'
+ stream_infos = []
+ streamdata = self._call_rpc_api(
+ 'VideoPlayer_GetStandardConfig', video_id,
+ 'Downloading media info for %s' % video_format, data={
+ 'media_id': video_id,
+ 'video_format': stream_format,
+ 'video_quality': stream_quality,
+ 'current_page': url,
+ })
+ if streamdata is not None:
+ stream_info = streamdata.find('./{default}preload/stream_info')
+ if stream_info is not None:
+ stream_infos.append(stream_info)
+ stream_info = self._call_rpc_api(
+ 'VideoEncode_GetStreamInfo', video_id,
+ 'Downloading stream info for %s' % video_format, data={
+ 'media_id': video_id,
+ 'video_format': stream_format,
+ 'video_encode_quality': stream_quality,
+ })
if stream_info is not None:
stream_infos.append(stream_info)
- stream_info = self._call_rpc_api(
- 'VideoEncode_GetStreamInfo', video_id,
- 'Downloading stream info for %s' % video_format, data={
- 'media_id': video_id,
- 'video_format': stream_format,
- 'video_encode_quality': stream_quality,
- })
- if stream_info is not None:
- stream_infos.append(stream_info)
- for stream_info in stream_infos:
- video_encode_id = xpath_text(stream_info, './video_encode_id')
- if video_encode_id in video_encode_ids:
- continue
- video_encode_ids.append(video_encode_id)
-
- video_file = xpath_text(stream_info, './file')
- if not video_file:
- continue
- if video_file.startswith('http'):
- formats.extend(self._extract_m3u8_formats(
- video_file, video_id, 'mp4', entry_protocol='m3u8_native',
- m3u8_id='hls', fatal=False))
- continue
+ for stream_info in stream_infos:
+ video_encode_id = xpath_text(stream_info, './video_encode_id')
+ if video_encode_id in video_encode_ids:
+ continue
+ video_encode_ids.append(video_encode_id)
- video_url = xpath_text(stream_info, './host')
- if not video_url:
- continue
- metadata = stream_info.find('./metadata')
- format_info = {
- 'format': video_format,
- 'height': int_or_none(xpath_text(metadata, './height')),
- 'width': int_or_none(xpath_text(metadata, './width')),
- }
-
- if '.fplive.net/' in video_url:
- video_url = re.sub(r'^rtmpe?://', 'http://', video_url.strip())
- parsed_video_url = compat_urlparse.urlparse(video_url)
- direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace(
- netloc='v.lvlt.crcdn.net',
- path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_file.split(':')[-1])))
- if self._is_valid_url(direct_video_url, video_id, video_format):
- format_info.update({
- 'format_id': 'http-' + video_format,
- 'url': direct_video_url,
- })
- formats.append(format_info)
+ video_file = xpath_text(stream_info, './file')
+ if not video_file:
+ continue
+ if video_file.startswith('http'):
+ formats.extend(self._extract_m3u8_formats(
+ video_file, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False))
continue
- format_info.update({
- 'format_id': 'rtmp-' + video_format,
- 'url': video_url,
- 'play_path': video_file,
- 'ext': 'flv',
- })
- formats.append(format_info)
+ video_url = xpath_text(stream_info, './host')
+ if not video_url:
+ continue
+ metadata = stream_info.find('./metadata')
+ format_info = {
+ 'format': video_format,
+ 'height': int_or_none(xpath_text(metadata, './height')),
+ 'width': int_or_none(xpath_text(metadata, './width')),
+ }
+
+ if '.fplive.net/' in video_url:
+ video_url = re.sub(r'^rtmpe?://', 'http://', video_url.strip())
+ parsed_video_url = compat_urlparse.urlparse(video_url)
+ direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace(
+ netloc='v.lvlt.crcdn.net',
+ path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_file.split(':')[-1])))
+ if self._is_valid_url(direct_video_url, video_id, video_format):
+ format_info.update({
+ 'format_id': 'http-' + video_format,
+ 'url': direct_video_url,
+ })
+ formats.append(format_info)
+ continue
+
+ format_info.update({
+ 'format_id': 'rtmp-' + video_format,
+ 'url': video_url,
+ 'play_path': video_file,
+ 'ext': 'flv',
+ })
+ formats.append(format_info)
self._sort_formats(formats, ('height', 'width', 'tbr', 'fps'))
metadata = self._call_rpc_api(
'media_id': video_id,
})
- subtitles = self.extract_subtitles(video_id, webpage)
+ subtitles = {}
+ for subtitle in media.get('subtitles', []):
+ subtitle_url = subtitle.get('url')
+ if not subtitle_url:
+ continue
+ subtitles.setdefault(subtitle.get('language', 'enUS'), []).append({
+ 'url': subtitle_url,
+ 'ext': subtitle.get('format', 'ass'),
+ })
+ if not subtitles:
+ subtitles = self.extract_subtitles(video_id, webpage)
# webpage provides more accurate data than series_title from XML
series = self._html_search_regex(
webpage, 'series', fatal=False)
season = xpath_text(metadata, 'series_title')
- episode = xpath_text(metadata, 'episode_title')
- episode_number = int_or_none(xpath_text(metadata, 'episode_number'))
+ episode = xpath_text(metadata, 'episode_title') or media_metadata.get('title')
+ episode_number = int_or_none(xpath_text(metadata, 'episode_number') or media_metadata.get('episode_number'))
season_number = int_or_none(self._search_regex(
r'(?s)<h\d[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h\d>\s*<h4>\s*Season (\d+)',
'id': video_id,
'title': video_title,
'description': video_description,
- 'thumbnail': xpath_text(metadata, 'episode_image_url'),
+ 'duration': float_or_none(media_metadata.get('duration'), 1000),
+ 'thumbnail': xpath_text(metadata, 'episode_image_url') or media_metadata.get('thumbnail', {}).get('url'),
'uploader': video_uploader,
'upload_date': video_upload_date,
'series': series,
class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE):
IE_NAME = 'crunchyroll:playlist'
- _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login))(?P<id>[\w\-]+))/?(?:\?|$)'
+ _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login|media-\d+))(?P<id>[\w\-]+))/?(?:\?|$)'
_TESTS = [{
'url': 'http://www.crunchyroll.com/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi',
from .common import InfoExtractor
from ..utils import (
int_or_none,
+ parse_age_limit,
parse_iso8601,
+ smuggle_url,
+ str_or_none,
)
'duration': 1263,
'series': 'Whose Line Is It Anyway?',
'season_number': 11,
- 'season': '11',
'episode_number': 20,
'upload_date': '20151006',
'timestamp': 1444107300,
+ 'age_limit': 14,
+ 'uploader': 'CWTV',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
},
}, {
'url': 'http://cwtv.com/thecw/chroniclesofcisco/?play=8adebe35-f447-465f-ab52-e863506ff6d6',
def _real_extract(self, url):
video_id = self._match_id(url)
- video_data = None
- formats = []
- for partner in (154, 213):
- vdata = self._download_json(
- 'http://metaframe.digitalsmiths.tv/v2/CWtv/assets/%s/partner/%d?format=json' % (video_id, partner), video_id, fatal=False)
- if not vdata:
- continue
- video_data = vdata
- for quality, quality_data in vdata.get('videos', {}).items():
- quality_url = quality_data.get('uri')
- if not quality_url:
- continue
- if quality == 'variantplaylist':
- formats.extend(self._extract_m3u8_formats(
- quality_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
- else:
- tbr = int_or_none(quality_data.get('bitrate'))
- format_id = 'http' + ('-%d' % tbr if tbr else '')
- if self._is_valid_url(quality_url, video_id, format_id):
- formats.append({
- 'format_id': format_id,
- 'url': quality_url,
- 'tbr': tbr,
- })
- video_metadata = video_data['assetFields']
- ism_url = video_metadata.get('smoothStreamingUrl')
- if ism_url:
- formats.extend(self._extract_ism_formats(
- ism_url, video_id, ism_id='mss', fatal=False))
- self._sort_formats(formats)
-
- thumbnails = [{
- 'url': image['uri'],
- 'width': image.get('width'),
- 'height': image.get('height'),
- } for image_id, image in video_data['images'].items() if image.get('uri')] if video_data.get('images') else None
+ video_data = self._download_json(
+ 'http://images.cwtv.com/feed/mobileapp/video-meta/apiversion_8/guid_' + video_id,
+ video_id)['video']
+ title = video_data['title']
+ mpx_url = video_data.get('mpx_url') or 'http://link.theplatform.com/s/cwtv/media/guid/2703454149/%s?formats=M3U' % video_id
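+ # Extraction is delegated to ThePlatform (url_transparent below); when
+ # the feed omits mpx_url, the link is built from CWTV's account path.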
- subtitles = {
- 'en': [{
- 'url': video_metadata['UnicornCcUrl'],
- }],
- } if video_metadata.get('UnicornCcUrl') else None
+ season = str_or_none(video_data.get('season'))
+ episode = str_or_none(video_data.get('episode'))
+ if episode and season and episode.startswith(season):
+ episode = episode[len(season):]
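+ # e.g. season '11' and episode '1120' leave episode '20'
+ # (cf. the test above)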
return {
+ '_type': 'url_transparent',
'id': video_id,
- 'title': video_metadata['title'],
- 'description': video_metadata.get('description'),
- 'duration': int_or_none(video_metadata.get('duration')),
- 'series': video_metadata.get('seriesName'),
- 'season_number': int_or_none(video_metadata.get('seasonNumber')),
- 'season': video_metadata.get('seasonName'),
- 'episode_number': int_or_none(video_metadata.get('episodeNumber')),
- 'timestamp': parse_iso8601(video_data.get('startTime')),
- 'thumbnails': thumbnails,
- 'formats': formats,
- 'subtitles': subtitles,
+ 'title': title,
+ 'url': smuggle_url(mpx_url, {'force_smil_url': True}),
+ 'description': video_data.get('description_long'),
+ 'duration': int_or_none(video_data.get('duration_secs')),
+ 'series': video_data.get('series_name'),
+ 'season_number': int_or_none(season),
+ 'episode_number': int_or_none(episode),
+ 'timestamp': parse_iso8601(video_data.get('start_time')),
+ 'age_limit': parse_age_limit(video_data.get('rating')),
+ 'ie_key': 'ThePlatform',
}
from __future__ import unicode_literals
import base64
+import functools
import hashlib
import itertools
import json
error_to_compat_str,
ExtractorError,
int_or_none,
+ mimetype2ext,
+ OnDemandPagedList,
parse_iso8601,
sanitized_Request,
str_to_int,
unescapeHTML,
- mimetype2ext,
+ urlencode_postdata,
)
age_limit = self._rta_search(webpage)
- description = self._og_search_description(webpage) or self._html_search_meta(
+ description = self._og_search_description(
+ webpage, default=None) or self._html_search_meta(
'description', webpage, 'description')
view_count_str = self._search_regex(
class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
IE_NAME = 'dailymotion:playlist'
- _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>[^/?#&]+)'
- _MORE_PAGES_INDICATOR = r'(?s)<div class="pages[^"]*">.*?<a\s+class="[^"]*?icon-arrow_right[^"]*?"'
- _PAGE_TEMPLATE = 'https://www.dailymotion.com/playlist/%s/%s'
+ _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>x[0-9a-z]+)'
_TESTS = [{
'url': 'http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q',
'info_dict': {
'title': 'SPORT',
- 'id': 'xv4bw_nqtv_sport',
+ 'id': 'xv4bw',
},
'playlist_mincount': 20,
}]
-
- def _extract_entries(self, id):
- video_ids = set()
- processed_urls = set()
- for pagenum in itertools.count(1):
- page_url = self._PAGE_TEMPLATE % (id, pagenum)
- webpage, urlh = self._download_webpage_handle_no_ff(
- page_url, id, 'Downloading page %s' % pagenum)
- if urlh.geturl() in processed_urls:
- self.report_warning('Stopped at duplicated page %s, which is the same as %s' % (
- page_url, urlh.geturl()), id)
- break
-
- processed_urls.add(urlh.geturl())
-
- for video_id in re.findall(r'data-xid="(.+?)"', webpage):
- if video_id not in video_ids:
- yield self.url_result(
- 'http://www.dailymotion.com/video/%s' % video_id,
- DailymotionIE.ie_key(), video_id)
- video_ids.add(video_id)
-
- if re.search(self._MORE_PAGES_INDICATOR, webpage) is None:
- break
+ _PAGE_SIZE = 100
+
+ def _fetch_page(self, playlist_id, authorization, page):
+ page += 1
+ videos = self._download_json(
+ 'https://graphql.api.dailymotion.com',
+ playlist_id, 'Downloading page %d' % page,
+ data=json.dumps({
+ 'query': '''{
+ collection(xid: "%s") {
+ videos(first: %d, page: %d) {
+ pageInfo {
+ hasNextPage
+ nextPage
+ }
+ edges {
+ node {
+ xid
+ url
+ }
+ }
+ }
+ }
+}''' % (playlist_id, self._PAGE_SIZE, page)
+ }).encode(), headers={
+ 'Authorization': authorization,
+ 'Origin': 'https://www.dailymotion.com',
+ })['data']['collection']['videos']
+ for edge in videos['edges']:
+ node = edge['node']
+ yield self.url_result(
+ node['url'], DailymotionIE.ie_key(), node['xid'])
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- playlist_id = mobj.group('id')
+ playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
-
- return {
- '_type': 'playlist',
- 'id': playlist_id,
- 'title': self._og_search_title(webpage),
- 'entries': self._extract_entries(playlist_id),
- }
-
-
-class DailymotionUserIE(DailymotionPlaylistIE):
+ api = self._parse_json(self._search_regex(
+ r'__PLAYER_CONFIG__\s*=\s*({.+?});',
+ webpage, 'player config'), playlist_id)['context']['api']
+ auth = self._download_json(
+ api.get('auth_url', 'https://graphql.api.dailymotion.com/oauth/token'),
+ playlist_id, data=urlencode_postdata({
+ 'client_id': api.get('client_id', 'f1a362d288c1b98099c7'),
+ 'client_secret': api.get('client_secret', 'eea605b96e01c796ff369935357eca920c5da4c5'),
+ 'grant_type': 'client_credentials',
+ }))
+ authorization = '%s %s' % (auth.get('token_type', 'Bearer'), auth['access_token'])
+ entries = OnDemandPagedList(functools.partial(
+ self._fetch_page, playlist_id, authorization), self._PAGE_SIZE)
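+ # OnDemandPagedList calls the partial lazily with 0-based page indices
+ # as entries are consumed; _fetch_page converts them to the API's
+ # 1-based page numbers.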
+ return self.playlist_result(
+ entries, playlist_id,
+ self._og_search_title(webpage))
+
+
+class DailymotionUserIE(DailymotionBaseInfoExtractor):
IE_NAME = 'dailymotion:user'
_VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist)/)(?:(?:old/)?user/)?(?P<user>[^/]+)'
+ _MORE_PAGES_INDICATOR = r'(?s)<div class="pages[^"]*">.*?<a\s+class="[^"]*?icon-arrow_right[^"]*?"'
_PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s'
_TESTS = [{
'url': 'https://www.dailymotion.com/user/nqtv',
'skip': 'Takes too long',
}]
+ def _extract_entries(self, id):
+ video_ids = set()
+ processed_urls = set()
+ for pagenum in itertools.count(1):
+ page_url = self._PAGE_TEMPLATE % (id, pagenum)
+ webpage, urlh = self._download_webpage_handle_no_ff(
+ page_url, id, 'Downloading page %s' % pagenum)
+ if urlh.geturl() in processed_urls:
+ self.report_warning('Stopped at duplicated page %s, which is the same as %s' % (
+ page_url, urlh.geturl()), id)
+ break
+
+ processed_urls.add(urlh.geturl())
+
+ for video_id in re.findall(r'data-xid="(.+?)"', webpage):
+ if video_id not in video_ids:
+ yield self.url_result(
+ 'http://www.dailymotion.com/video/%s' % video_id,
+ DailymotionIE.ie_key(), video_id)
+ video_ids.add(video_id)
+
+ if re.search(self._MORE_PAGES_INDICATOR, webpage) is None:
+ break
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
user = mobj.group('user')
from ..compat import compat_str
from ..utils import (
float_or_none,
- unified_strdate,
+ int_or_none,
+ unified_timestamp,
+ url_or_none,
)
class DctpTvIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?dctp\.tv/(?:#/)?filme/(?P<id>[^/?#&]+)'
- _TEST = {
+ _TESTS = [{
+ # 4x3
'url': 'http://www.dctp.tv/filme/videoinstallation-fuer-eine-kaufhausfassade/',
'info_dict': {
'id': '95eaa4f33dad413aa17b4ee613cccc6c',
'ext': 'flv',
'title': 'Videoinstallation für eine Kaufhausfassade',
'description': 'Kurzfilm',
- 'upload_date': '20110407',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 71.24,
+ 'timestamp': 1302172322,
+ 'upload_date': '20110407',
},
'params': {
# rtmp download
'skip_download': True,
},
- }
+ }, {
+ # 16x9
+ 'url': 'http://www.dctp.tv/filme/sind-youtuber-die-besseren-lehrer/',
+ 'only_matching': True,
+ }]
+
+ _BASE_URL = 'http://dctp-ivms2-restapi.s3.amazonaws.com'
def _real_extract(self, url):
display_id = self._match_id(url)
- webpage = self._download_webpage(url, display_id)
+ version = self._download_json(
+ '%s/version.json' % self._BASE_URL, display_id,
+ 'Downloading version JSON')
+
+ restapi_base = '%s/%s/restapi' % (
+ self._BASE_URL, version['version_name'])
- video_id = self._html_search_meta(
- 'DC.identifier', webpage, 'video id',
- default=None) or self._search_regex(
- r'id=["\']uuid[^>]+>([^<]+)<', webpage, 'video id')
+ info = self._download_json(
+ '%s/slugs/%s.json' % (restapi_base, display_id), display_id,
+ 'Downloading video info JSON')
- title = self._og_search_title(webpage)
+ media = self._download_json(
+ '%s/media/%s.json' % (restapi_base, compat_str(info['object_id'])),
+ display_id, 'Downloading media JSON')
+
+ uuid = media['uuid']
+ title = media['title']
+ ratio = '16x9' if media.get('is_wide') else '4x3'
+ play_path = 'mp4:%s_dctp_0500_%s.m4v' % (uuid, ratio)
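+ # e.g. 'mp4:95eaa4f33dad413aa17b4ee613cccc6c_dctp_0500_4x3.m4v' for
+ # the 4x3 test video above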
servers = self._download_json(
'http://www.dctp.tv/streaming_servers/', display_id,
- note='Downloading server list', fatal=False)
+ note='Downloading server list JSON', fatal=False)
if servers:
endpoint = next(
server['endpoint']
for server in servers
- if isinstance(server.get('endpoint'), compat_str) and
+ if url_or_none(server.get('endpoint')) and
'cloudfront' in server['endpoint'])
else:
endpoint = 'rtmpe://s2pqqn4u96e4j8.cloudfront.net/cfx/st/'
formats = [{
'url': endpoint,
'app': app,
- 'play_path': 'mp4:%s_dctp_0500_4x3.m4v' % video_id,
+ 'play_path': play_path,
'page_url': url,
- 'player_url': 'http://svm-prod-dctptv-static.s3.amazonaws.com/dctptv-relaunch2012-109.swf',
+ 'player_url': 'http://svm-prod-dctptv-static.s3.amazonaws.com/dctptv-relaunch2012-110.swf',
'ext': 'flv',
}]
- description = self._html_search_meta('DC.description', webpage)
- upload_date = unified_strdate(
- self._html_search_meta('DC.date.created', webpage))
- thumbnail = self._og_search_thumbnail(webpage)
- duration = float_or_none(self._search_regex(
- r'id=["\']duration_in_ms[^+]>(\d+)', webpage, 'duration',
- default=None), scale=1000)
+ thumbnails = []
+ images = media.get('images')
+ if isinstance(images, list):
+ for image in images:
+ if not isinstance(image, dict):
+ continue
+ image_url = url_or_none(image.get('url'))
+ if not image_url:
+ continue
+ thumbnails.append({
+ 'url': image_url,
+ 'width': int_or_none(image.get('width')),
+ 'height': int_or_none(image.get('height')),
+ })
return {
- 'id': video_id,
+ 'id': uuid,
+ 'display_id': display_id,
'title': title,
+ 'alt_title': media.get('subtitle'),
+ 'description': media.get('description') or media.get('teaser'),
+ 'timestamp': unified_timestamp(media.get('created')),
+ 'duration': float_or_none(media.get('duration_in_ms'), scale=1000),
+ 'thumbnails': thumbnails,
'formats': formats,
- 'display_id': display_id,
- 'description': description,
- 'upload_date': upload_date,
- 'thumbnail': thumbnail,
- 'duration': duration,
}
import re
from .common import InfoExtractor
-from ..compat import compat_str
from ..utils import (
determine_ext,
extract_attributes,
parse_age_limit,
remove_end,
unescapeHTML,
+ url_or_none,
)
captions = stream.get('captions')
if isinstance(captions, list):
for caption in captions:
- subtitle_url = caption.get('fileUrl')
- if (not subtitle_url or not isinstance(subtitle_url, compat_str) or
- not subtitle_url.startswith('http')):
+ subtitle_url = url_or_none(caption.get('fileUrl'))
+ if not subtitle_url or not subtitle_url.startswith('http'):
continue
lang = caption.get('fileLang', 'en')
ext = determine_ext(subtitle_url)
unified_strdate,
unified_timestamp,
update_url_query,
+ urljoin,
USER_AGENTS,
)
if not info:
info_url = self._search_regex(
- r'url\s*[:=]\s*["\']((?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)',
- webpage, 'info url')
+ (r'playback_json_url\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1',
+ r'url\s*[:=]\s*["\'](?P<url>(?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)'),
+ webpage, 'info url', group='url')
+ info_url = urljoin(url, info_url)
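+ # (the extracted info URL may be relative or protocol-relative,
+ # hence the join against the page URL)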
video_id = info_url.rpartition('/')[-1]
try:
'dplayit_token').value,
'Referer': url,
})
+ if isinstance(info, compat_str):
+ info = self._parse_json(info, display_id)
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 403):
info = self._parse_json(e.cause.read().decode('utf-8'), display_id)
formats = self._extract_m3u8_formats(
hls_url, display_id, ext='mp4', entry_protocol='m3u8_native',
m3u8_id='hls')
+ self._sort_formats(formats)
series = self._html_search_regex(
r'(?s)<h1[^>]+class=["\'].*?\bshow_title\b.*?["\'][^>]*>(.+?)</h1>',
from .common import InfoExtractor
from ..compat import (
compat_HTTPError,
- compat_str,
compat_urlparse,
)
from ..utils import (
parse_age_limit,
parse_duration,
unified_timestamp,
+ url_or_none,
)
for sub in subs:
if not isinstance(sub, dict):
continue
- sub_url = sub.get('url')
- if not sub_url or not isinstance(sub_url, compat_str):
+ sub_url = url_or_none(sub.get('url'))
+ if not sub_url:
continue
subtitles.setdefault(
sub.get('code') or sub.get('language') or 'en', []).append({
for format_id, format_dict in download_assets.items():
if not isinstance(format_dict, dict):
continue
- format_url = format_dict.get('url')
- if not format_url or not isinstance(format_url, compat_str):
+ format_url = url_or_none(format_dict.get('url'))
+ if not format_url:
continue
formats.append({
'url': format_url,
try:
self.to_screen('%s: Checking %s video format URL' % (video_id, format_id))
self._downloader._opener.open(video_url, timeout=5).close()
- except timeout as e:
+ except timeout:
self.to_screen(
'%s: %s URL is invalid, skipping' % (video_id, format_id))
continue
import re
from .common import InfoExtractor
-from ..compat import (
- compat_HTTPError,
- compat_str,
-)
+from ..compat import compat_HTTPError
from ..utils import (
ExtractorError,
int_or_none,
unsmuggle_url,
+ url_or_none,
)
video_id, 'Downloading mp4 JSON', fatal=False)
if mp4_data:
for format_id, format_url in mp4_data.get('data', {}).items():
- if not isinstance(format_url, compat_str):
+ if not url_or_none(format_url):
continue
height = int_or_none(format_id)
if height is not None and m3u8_formats_dict.get(height):
int_or_none,
try_get,
unified_timestamp,
+ url_or_none,
)
entries = []
for lesson in lessons:
- lesson_url = lesson.get('http_url')
- if not lesson_url or not isinstance(lesson_url, compat_str):
+ lesson_url = url_or_none(lesson.get('http_url'))
+ if not lesson_url:
continue
lesson_id = lesson.get('id')
if lesson_id:
formats = []
for _, format_url in lesson['media_urls'].items():
- if not format_url or not isinstance(format_url, compat_str):
+ format_url = url_or_none(format_url)
+ if not format_url:
continue
ext = determine_ext(format_url)
if ext == 'm3u8':
int_or_none,
parse_duration,
str_to_int,
+ url_or_none,
)
for format_id, format_dict in formats_dict.items():
if not isinstance(format_dict, dict):
continue
- src = format_dict.get('src')
- if not isinstance(src, compat_str) or not src.startswith('http'):
+ src = url_or_none(format_dict.get('src'))
+ if not src or not src.startswith('http'):
continue
if kind == 'hls':
formats.extend(self._extract_m3u8_formats(
# coding: utf-8
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from ..utils import (
determine_ext,
class ExpressenIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?expressen\.se/tv/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+ _VALID_URL = r'''(?x)
+ https?://
+ (?:www\.)?expressen\.se/
+ (?:(?:tvspelare/video|videoplayer/embed)/)?
+ tv/(?:[^/]+/)*
+ (?P<id>[^/?#&]+)
+ '''
_TESTS = [{
'url': 'https://www.expressen.se/tv/ledare/ledarsnack/ledarsnack-om-arbetslosheten-bland-kvinnor-i-speciellt-utsatta-omraden/',
'md5': '2fbbe3ca14392a6b1b36941858d33a45',
}, {
'url': 'https://www.expressen.se/tv/kultur/kulturdebatt-med-expressens-karin-olsson/',
'only_matching': True,
+ }, {
+ 'url': 'https://www.expressen.se/tvspelare/video/tv/ditv/ekonomistudion/experterna-har-ar-fragorna-som-avgor-valet/?embed=true&external=true&autoplay=true&startVolume=0&partnerId=di',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.expressen.se/videoplayer/embed/tv/ditv/ekonomistudion/experterna-har-ar-fragorna-som-avgor-valet/?embed=true&external=true&autoplay=true&startVolume=0&partnerId=di',
+ 'only_matching': True,
}]
+ @staticmethod
+ def _extract_urls(webpage):
+ return [
+ mobj.group('url') for mobj in re.finditer(
+ r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?expressen\.se/(?:tvspelare/video|videoplayer/embed)/tv/.+?)\1',
+ webpage)]
+
def _real_extract(self, url):
display_id = self._match_id(url)
from .archiveorg import ArchiveOrgIE
from .arkena import ArkenaIE
from .ard import (
+ ARDBetaMediathekIE,
ARDIE,
ARDMediathekIE,
)
BiliBiliBangumiIE,
)
from .biobiochiletv import BioBioChileTVIE
+from .bitchute import (
+ BitChuteIE,
+ BitChuteChannelIE,
+)
from .biqle import BIQLEIE
from .bleacherreport import (
BleacherReportIE,
from .foxnews import (
FoxNewsIE,
FoxNewsArticleIE,
- FoxNewsInsiderIE,
)
from .foxsports import FoxSportsIE
from .franceculture import FranceCultureIE
from .freesound import FreesoundIE
from .freespeech import FreespeechIE
from .freshlive import FreshLiveIE
+from .frontendmasters import (
+ FrontendMastersIE,
+ FrontendMastersLessonIE,
+ FrontendMastersCourseIE
+)
from .funimation import FunimationIE
from .funk import (
FunkMixIE,
from .ketnet import KetnetIE
from .khanacademy import KhanAcademyIE
from .kickstarter import KickStarterIE
+from .kinopoisk import KinoPoiskIE
from .keek import KeekIE
from .konserthusetplay import KonserthusetPlayIE
from .kontrtube import KontrTubeIE
from .noovo import NoovoIE
from .normalboots import NormalbootsIE
from .nosvideo import NosVideoIE
-from .nova import NovaIE
+from .nova import (
+ NovaEmbedIE,
+ NovaIE,
+)
from .novamov import (
AuroraVidIE,
CloudTimeIE,
NRKSkoleIE,
NRKTVIE,
NRKTVDirekteIE,
+ NRKTVEpisodeIE,
NRKTVEpisodesIE,
+ NRKTVSeasonIE,
NRKTVSeriesIE,
)
from .ntvde import NTVDeIE
from .pornotube import PornotubeIE
from .pornovoisines import PornoVoisinesIE
from .pornoxo import PornoXOIE
+from .puhutv import (
+ PuhuTVIE,
+ PuhuTVSerieIE,
+)
from .presstv import PressTVIE
from .primesharetv import PrimeShareTVIE
from .promptfile import PromptFileIE
RaiPlayPlaylistIE,
RaiIE,
)
-from .raywenderlich import RayWenderlichIE
+from .raywenderlich import (
+ RayWenderlichIE,
+ RayWenderlichCourseIE,
+)
from .rbmaradio import RBMARadioIE
from .rds import RDSIE
from .redbulltv import RedBullTVIE
from .sunporno import SunPornoIE
from .svt import (
SVTIE,
+ SVTPageIE,
SVTPlayIE,
SVTSeriesIE,
)
from .teamcoco import TeamcocoIE
from .techtalks import TechTalksIE
from .ted import TEDIE
+from .tele5 import Tele5IE
from .tele13 import Tele13IE
from .telebruxelles import TeleBruxellesIE
from .telecinco import TelecincoIE
from .tvplay import (
TVPlayIE,
ViafreeIE,
+ TVPlayHomeIE,
)
from .tvplayer import TVPlayerIE
from .tweakers import TweakersIE
VikiIE,
VikiChannelIE,
)
+from .viqeo import ViqeoIE
from .viu import (
ViuIE,
ViuPlaylistIE,
YouNowMomentIE,
)
from .youporn import YouPornIE
+from .yourporn import YourPornIE
from .yourupload import YourUploadIE
from .youtube import (
YoutubeIE,
int_or_none,
js_to_json,
limit_length,
+ parse_count,
sanitized_Request,
try_get,
urlencode_postdata,
'info_dict': {
'id': '274175099429670',
'ext': 'mp4',
- 'title': 'Asif Nawab Butt posted a video to his Timeline.',
+ 'title': 're:^Asif Nawab Butt posted a video',
'uploader': 'Asif Nawab Butt',
'upload_date': '20140506',
'timestamp': 1399398998,
}, {
# have 1080P, but only up to 720p in swf params
'url': 'https://www.facebook.com/cnn/videos/10155529876156509/',
- 'md5': '0d9813160b146b3bc8744e006027fcc6',
+ 'md5': '9571fae53d4165bbbadb17a94651dcdc',
'info_dict': {
'id': '10155529876156509',
'ext': 'mp4',
'upload_date': '20161030',
'uploader': 'CNN',
'thumbnail': r're:^https?://.*',
+ 'view_count': int,
},
}, {
# bigPipe.onPageletArrive ... onPageletArrive pagelet_group_mall
'info_dict': {
'id': '1417995061575415',
'ext': 'mp4',
- 'title': 'md5:a7b86ca673f51800cd54687b7f4012fe',
+ 'title': 'md5:1db063d6a8c13faa8da727817339c857',
'timestamp': 1486648217,
'upload_date': '20170209',
'uploader': 'Yaroslav Korpan',
'info_dict': {
'id': '1396382447100162',
'ext': 'mp4',
- 'title': 'md5:e2d2700afdf84e121f5d0f999bad13a3',
+ 'title': 'md5:19a428bbde91364e3de815383b54a235',
'timestamp': 1486035494,
'upload_date': '20170202',
'uploader': 'Elisabeth Ahtn',
tahoe_data = self._download_webpage(
self._VIDEO_PAGE_TAHOE_TEMPLATE % video_id, video_id,
data=urlencode_postdata({
- '__user': 0,
'__a': 1,
'__pc': self._search_regex(
r'pkg_cohort["\']\s*:\s*["\'](.+?)["\']', webpage,
'__rev': self._search_regex(
r'client_revision["\']\s*:\s*(\d+),', webpage,
'client revision', default='3944515'),
+ 'fb_dtsg': self._search_regex(
+ r'"DTSGInitialData"\s*,\s*\[\]\s*,\s*{\s*"token"\s*:\s*"([^"]+)"',
+ webpage, 'dtsg token', default=''),
}),
headers={
'Content-Type': 'application/x-www-form-urlencoded',
'timestamp', default=None))
thumbnail = self._og_search_thumbnail(webpage)
+ view_count = parse_count(self._search_regex(
+ r'\bviewCount\s*:\s*["\']([\d,.]+)', webpage, 'view count',
+ default=None))
+
info_dict = {
'id': video_id,
'title': video_title,
'uploader': uploader,
'timestamp': timestamp,
'thumbnail': thumbnail,
+ 'view_count': view_count,
}
return webpage, info_dict
int_or_none,
qualities,
unified_strdate,
+ url_or_none,
)
formats = []
path = None
for f in item.get('mbr', []):
- src = f.get('src')
- if not src or not isinstance(src, compat_str):
+ src = url_or_none(f.get('src'))
+ if not src:
continue
tbr = int_or_none(self._search_regex(
r'_(\d{3,})\.mp4', src, 'tbr', default=None))
},
]
+ @staticmethod
+ def _extract_urls(webpage):
+ return [
+ mobj.group('url')
+ for mobj in re.finditer(
+ r'<(?:amp-)?iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//video\.foxnews\.com/v/video-embed\.html?.*?\bvideo_id=\d+.*?)\1',
+ webpage)]
+
def _real_extract(self, url):
host, video_id = re.match(self._VALID_URL, url).groups()
class FoxNewsArticleIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?foxnews\.com/(?!v)([^/]+/)+(?P<id>[a-z-]+)'
+ _VALID_URL = r'https?://(?:www\.)?(?:insider\.)?foxnews\.com/(?!v)([^/]+/)+(?P<id>[a-z-]+)'
IE_NAME = 'foxnews:article'
- _TEST = {
+ _TESTS = [{
+ # data-video-id
'url': 'http://www.foxnews.com/politics/2016/09/08/buzz-about-bud-clinton-camp-denies-claims-wore-earpiece-at-forum.html',
- 'md5': '62aa5a781b308fdee212ebb6f33ae7ef',
+ 'md5': '83d44e1aff1433e7a29a7b537d1700b5',
'info_dict': {
'id': '5116295019001',
'ext': 'mp4',
'title': 'Trump and Clinton asked to defend positions on Iraq War',
'description': 'Veterans react on \'The Kelly File\'',
- 'timestamp': 1473299755,
+ 'timestamp': 1473301045,
'upload_date': '20160908',
},
- }
-
- def _real_extract(self, url):
- display_id = self._match_id(url)
- webpage = self._download_webpage(url, display_id)
-
- video_id = self._html_search_regex(
- r'data-video-id=([\'"])(?P<id>[^\'"]+)\1',
- webpage, 'video ID', group='id')
- return self.url_result(
- 'http://video.foxnews.com/v/' + video_id,
- FoxNewsIE.ie_key())
-
-
-class FoxNewsInsiderIE(InfoExtractor):
- _VALID_URL = r'https?://insider\.foxnews\.com/([^/]+/)+(?P<id>[a-z-]+)'
- IE_NAME = 'foxnews:insider'
-
- _TEST = {
- 'url': 'http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words',
- 'md5': 'a10c755e582d28120c62749b4feb4c0c',
+ }, {
+ # iframe embed
+ 'url': 'http://www.foxnews.com/us/2018/03/09/parkland-survivor-kyle-kashuv-on-meeting-trump-his-app-to-prevent-another-school-shooting.amp.html?__twitter_impression=true',
'info_dict': {
- 'id': '5099377331001',
- 'display_id': 'univ-wisconsin-student-group-pushing-silence-certain-words',
- 'ext': 'mp4',
- 'title': 'Student Group: Saying \'Politically Correct,\' \'Trash\' and \'Lame\' Is Offensive',
- 'description': 'Is campus censorship getting out of control?',
- 'timestamp': 1472168725,
- 'upload_date': '20160825',
+ 'id': '5748266721001',
+ 'ext': 'flv',
+ 'title': 'Kyle Kashuv has a positive message for the Trump White House',
+ 'description': 'Marjory Stoneman Douglas student disagrees with classmates.',
'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 229,
+ 'timestamp': 1520594670,
+ 'upload_date': '20180309',
},
'params': {
- # m3u8 download
'skip_download': True,
},
- 'add_ie': [FoxNewsIE.ie_key()],
- }
+ }, {
+ 'url': 'http://insider.foxnews.com/2016/08/25/univ-wisconsin-student-group-pushing-silence-certain-words',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
display_id = self._match_id(url)
-
webpage = self._download_webpage(url, display_id)
- embed_url = self._html_search_meta('embedUrl', webpage, 'embed URL')
-
- title = self._og_search_title(webpage)
- description = self._og_search_description(webpage)
+ video_id = self._html_search_regex(
+ r'data-video-id=([\'"])(?P<id>[^\'"]+)\1',
+ webpage, 'video ID', group='id', default=None)
+ if video_id:
+ return self.url_result(
+ 'http://video.foxnews.com/v/' + video_id, FoxNewsIE.ie_key())
- return {
- '_type': 'url_transparent',
- 'ie_key': FoxNewsIE.ie_key(),
- 'url': embed_url,
- 'display_id': display_id,
- 'title': title,
- 'description': description,
- }
+ return self.url_result(
+ FoxNewsIE._extract_urls(webpage)[0], FoxNewsIE.ie_key())
int_or_none,
parse_duration,
try_get,
+ url_or_none,
)
from .dailymotion import DailymotionIE
def sign(manifest_url, manifest_id):
for host in ('hdfauthftv-a.akamaihd.net', 'hdfauth.francetv.fr'):
- signed_url = self._download_webpage(
+ signed_url = url_or_none(self._download_webpage(
'https://%s/esi/TA' % host, video_id,
'Downloading signed %s manifest URL' % manifest_id,
fatal=False, query={
'url': manifest_url,
- })
- if (signed_url and isinstance(signed_url, compat_str) and
- re.search(r'^(?:https?:)?//', signed_url)):
+ }))
+ if signed_url:
return signed_url
return manifest_url
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_str,
+ compat_urlparse,
+)
+from ..utils import (
+ ExtractorError,
+ parse_duration,
+ url_or_none,
+ urlencode_postdata,
+)
+
+
+class FrontendMastersBaseIE(InfoExtractor):
+ _API_BASE = 'https://api.frontendmasters.com/v1/kabuki'
+ _LOGIN_URL = 'https://frontendmasters.com/login/'
+
+ _NETRC_MACHINE = 'frontendmasters'
+
+ _QUALITIES = {
+ 'low': {'width': 480, 'height': 360},
+ 'mid': {'width': 1280, 'height': 720},
+ 'high': {'width': 1920, 'height': 1080}
+ }
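+ # Each quality doubles as the requested height (the 'r' parameter) and
+ # as the advertised format resolution in FrontendMastersIE below.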
+
+ def _real_initialize(self):
+ self._login()
+
+ def _login(self):
+ (username, password) = self._get_login_info()
+ if username is None:
+ return
+
+ login_page = self._download_webpage(
+ self._LOGIN_URL, None, 'Downloading login page')
+
+ login_form = self._hidden_inputs(login_page)
+
+ login_form.update({
+ 'username': username,
+ 'password': password
+ })
+
+ post_url = self._search_regex(
+ r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
+ 'post_url', default=self._LOGIN_URL, group='url')
+
+ if not post_url.startswith('http'):
+ post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url)
+
+ response = self._download_webpage(
+ post_url, None, 'Logging in', data=urlencode_postdata(login_form),
+ headers={'Content-Type': 'application/x-www-form-urlencoded'})
+
+ # Successful login
+ if any(p in response for p in (
+ 'wp-login.php?action=logout', '>Logout')):
+ return
+
+ error = self._html_search_regex(
+ r'class=(["\'])(?:(?!\1).)*\bMessageAlert\b(?:(?!\1).)*\1[^>]*>(?P<error>[^<]+)<',
+ response, 'error message', default=None, group='error')
+ if error:
+ raise ExtractorError('Unable to login: %s' % error, expected=True)
+ raise ExtractorError('Unable to log in')
+
+
+class FrontendMastersPageBaseIE(FrontendMastersBaseIE):
+ def _download_course(self, course_name, url):
+ return self._download_json(
+ '%s/courses/%s' % (self._API_BASE, course_name), course_name,
+ 'Downloading course JSON', headers={'Referer': url})
+
+ @staticmethod
+ def _extract_chapters(course):
+ chapters = []
+ lesson_elements = course.get('lessonElements')
+ if isinstance(lesson_elements, list):
+ chapters = [e for e in lesson_elements if isinstance(e, compat_str)]
+ return chapters
+
+ @staticmethod
+ def _extract_lesson(chapters, lesson_id, lesson):
+ title = lesson.get('title') or lesson_id
+ display_id = lesson.get('slug')
+ description = lesson.get('description')
+ thumbnail = lesson.get('thumbnail')
+
+ chapter_number = None
+ index = lesson.get('index')
+ element_index = lesson.get('elementIndex')
+ if (isinstance(index, int) and isinstance(element_index, int) and
+ index < element_index):
+ chapter_number = element_index - index
+ chapter = (chapters[chapter_number - 1]
+ if chapter_number and chapter_number - 1 < len(chapters) else None)
+
+ duration = None
+ timestamp = lesson.get('timestamp')
+ if isinstance(timestamp, compat_str):
+ mobj = re.search(
+ r'(?P<start>\d{1,2}:\d{1,2}:\d{1,2})\s*-(?P<end>\s*\d{1,2}:\d{1,2}:\d{1,2})',
+ timestamp)
+ if mobj:
+ duration = parse_duration(mobj.group('end')) - parse_duration(
+ mobj.group('start'))
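+ # e.g. timestamp '00:00:30 - 00:02:45' yields a duration of 135 seconds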
+
+ return {
+ '_type': 'url_transparent',
+ 'url': 'frontendmasters:%s' % lesson_id,
+ 'ie_key': FrontendMastersIE.ie_key(),
+ 'id': lesson_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'chapter': chapter,
+ 'chapter_number': chapter_number,
+ }
+
+
+class FrontendMastersIE(FrontendMastersBaseIE):
+ _VALID_URL = r'(?:frontendmasters:|https?://api\.frontendmasters\.com/v\d+/kabuki/video/)(?P<id>[^/]+)'
+ _TESTS = [{
+ 'url': 'https://api.frontendmasters.com/v1/kabuki/video/a2qogef6ba',
+ 'md5': '7f161159710d6b7016a4f4af6fcb05e2',
+ 'info_dict': {
+ 'id': 'a2qogef6ba',
+ 'ext': 'mp4',
+ 'title': 'a2qogef6ba',
+ },
+ 'skip': 'Requires FrontendMasters account credentials',
+ }, {
+ 'url': 'frontendmasters:a2qogef6ba',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ lesson_id = self._match_id(url)
+
+ source_url = '%s/video/%s/source' % (self._API_BASE, lesson_id)
+
+ formats = []
+ for ext in ('webm', 'mp4'):
+ for quality in ('low', 'mid', 'high'):
+ resolution = self._QUALITIES[quality].copy()
+ format_id = '%s-%s' % (ext, quality)
+ format_source = self._download_json(
+ source_url, lesson_id,
+ 'Downloading %s source JSON' % format_id, query={
+ 'f': ext,
+ 'r': resolution['height'],
+ }, headers={
+ 'Referer': url,
+ }, fatal=False)
+ format_url = url_or_none(format_source.get('url')) if format_source else None
+
+ if not format_url:
+ continue
+
+ f = resolution.copy()
+ f.update({
+ 'url': format_url,
+ 'ext': ext,
+ 'format_id': format_id,
+ })
+ formats.append(f)
+ self._sort_formats(formats)
+
+ subtitles = {
+ 'en': [{
+ 'url': '%s/transcripts/%s.vtt' % (self._API_BASE, lesson_id),
+ }]
+ }
+
+ return {
+ 'id': lesson_id,
+ 'title': lesson_id,
+ 'formats': formats,
+ 'subtitles': subtitles
+ }
+
+
+class FrontendMastersLessonIE(FrontendMastersPageBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P<course_name>[^/]+)/(?P<lesson_name>[^/]+)'
+ _TEST = {
+ 'url': 'https://frontendmasters.com/courses/web-development/tools',
+ 'info_dict': {
+ 'id': 'a2qogef6ba',
+ 'display_id': 'tools',
+ 'ext': 'mp4',
+ 'title': 'Tools',
+ 'description': 'md5:82c1ea6472e88ed5acd1829fe992e4f7',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'chapter': 'Introduction',
+ 'chapter_number': 1,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'skip': 'Requires FrontendMasters account credentials',
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ course_name, lesson_name = mobj.group('course_name', 'lesson_name')
+
+ course = self._download_course(course_name, url)
+
+ lesson_id, lesson = next(
+ (video_id, data)
+ for video_id, data in course['lessonData'].items()
+ if data.get('slug') == lesson_name)
+
+ chapters = self._extract_chapters(course)
+ return self._extract_lesson(chapters, lesson_id, lesson)
+
+
+class FrontendMastersCourseIE(FrontendMastersPageBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?frontendmasters\.com/courses/(?P<id>[^/]+)'
+ _TEST = {
+ 'url': 'https://frontendmasters.com/courses/web-development/',
+ 'info_dict': {
+ 'id': 'web-development',
+ 'title': 'Introduction to Web Development',
+ 'description': 'md5:9317e6e842098bf725d62360e52d49a6',
+ },
+ 'playlist_count': 81,
+ 'skip': 'Requires FrontendMasters account credentials',
+ }
+
+ @classmethod
+ def suitable(cls, url):
+ return False if FrontendMastersLessonIE.suitable(url) else super(
+ FrontendMastersBaseIE, cls).suitable(url)
+
+ def _real_extract(self, url):
+ course_name = self._match_id(url)
+
+ course = self._download_course(course_name, url)
+
+ chapters = self._extract_chapters(course)
+
+ lessons = sorted(
+ course['lessonData'].values(), key=lambda data: data['index'])
+
+ entries = []
+ for lesson in lessons:
+ lesson_name = lesson.get('slug')
+ if not lesson_name:
+ continue
+ lesson_id = lesson.get('hash') or lesson.get('statsId')
+ entries.append(self._extract_lesson(chapters, lesson_id, lesson))
+
+ title = course.get('title')
+ description = course.get('description')
+
+ return self.playlist_result(entries, course_name, title, description)
# coding: utf-8
from __future__ import unicode_literals
+import itertools
import re
from .common import InfoExtractor
from .nexx import NexxIE
+from ..compat import compat_str
from ..utils import (
int_or_none,
try_get,
class FunkBaseIE(InfoExtractor):
+ _HEADERS = {
+ 'Accept': '*/*',
+ 'Accept-Language': 'en-US,en;q=0.9,ru;q=0.8',
+ 'authorization': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJjbGllbnROYW1lIjoid2ViYXBwLXYzMSIsInNjb3BlIjoic3RhdGljLWNvbnRlbnQtYXBpLGN1cmF0aW9uLWFwaSxuZXh4LWNvbnRlbnQtYXBpLXYzMSx3ZWJhcHAtYXBpIn0.mbuG9wS9Yf5q6PqgR4fiaRFIagiHk9JhwoKES7ksVX4',
+ }
+ _AUTH = 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJjbGllbnROYW1lIjoid2ViYXBwLXYzMSIsInNjb3BlIjoic3RhdGljLWNvbnRlbnQtYXBpLGN1cmF0aW9uLWFwaSxuZXh4LWNvbnRlbnQtYXBpLXYzMSx3ZWJhcHAtYXBpIn0.mbuG9wS9Yf5q6PqgR4fiaRFIagiHk9JhwoKES7ksVX4'
+
+ @staticmethod
+ def _make_headers(referer):
+ headers = FunkBaseIE._HEADERS.copy()
+ headers['Referer'] = referer
+ return headers
+
def _make_url_result(self, video):
return {
'_type': 'url_transparent',
lists = self._download_json(
'https://www.funk.net/api/v3.1/curation/curatedLists/',
- mix_id, headers={
- 'authorization': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJjbGllbnROYW1lIjoiY3VyYXRpb24tdG9vbC12Mi4wIiwic2NvcGUiOiJzdGF0aWMtY29udGVudC1hcGksY3VyYXRpb24tc2VydmljZSxzZWFyY2gtYXBpIn0.SGCC1IXHLtZYoo8PvRKlU2gXH1su8YSu47sB3S4iXBI',
- 'Referer': url,
- }, query={
+ mix_id, headers=self._make_headers(url), query={
'size': 100,
- })['result']['lists']
+ })['_embedded']['curatedListList']
metas = next(
l for l in lists
if mix_id in (l.get('entityId'), l.get('alias')))['videoMetas']
video = next(
meta['videoDataDelegate']
- for meta in metas if meta.get('alias') == alias)
+ for meta in metas
+ if try_get(
+ meta, lambda x: x['videoDataDelegate']['alias'],
+ compat_str) == alias)
return self._make_url_result(video)
channel_id = mobj.group('id')
alias = mobj.group('alias')
- headers = {
- 'authorization': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJjbGllbnROYW1lIjoiY3VyYXRpb24tdG9vbCIsInNjb3BlIjoic3RhdGljLWNvbnRlbnQtYXBpLGN1cmF0aW9uLWFwaSxzZWFyY2gtYXBpIn0.q4Y2xZG8PFHai24-4Pjx2gym9RmJejtmK6lMXP5wAgc',
- 'Referer': url,
- }
+ headers = self._make_headers(url)
video = None
- by_id_list = self._download_json(
- 'https://www.funk.net/api/v3.0/content/videos/byIdList', channel_id,
- headers=headers, query={
- 'ids': alias,
- }, fatal=False)
- if by_id_list:
- video = try_get(by_id_list, lambda x: x['result'][0], dict)
+ # Id-based channels are currently broken on their side: the webplayer
+ # tries to process them via the byChannelAlias endpoint and fails
+ # predictably.
+ for page_num in itertools.count():
+ by_channel_alias = self._download_json(
+ 'https://www.funk.net/api/v3.1/webapp/videos/byChannelAlias/%s'
+ % channel_id, channel_id,
+ 'Downloading byChannelAlias JSON page %d' % (page_num + 1),
+ headers=headers, query={
+ 'filterFsk': 'false',
+ 'sort': 'creationDate,desc',
+ 'size': 100,
+ 'page': page_num,
+ }, fatal=False)
+ if not by_channel_alias:
+ break
+ video_list = try_get(
+ by_channel_alias, lambda x: x['_embedded']['videoList'], list)
+ if not video_list:
+ break
+ try:
+ video = next(r for r in video_list if r.get('alias') == alias)
+ break
+ except StopIteration:
+ pass
+ if not try_get(
+ by_channel_alias, lambda x: x['_links']['next']):
+ break
+
+ if not video:
+ by_id_list = self._download_json(
+ 'https://www.funk.net/api/v3.0/content/videos/byIdList',
+ channel_id, 'Downloading byIdList JSON', headers=headers,
+ query={
+ 'ids': alias,
+ }, fatal=False)
+ if by_id_list:
+ video = try_get(by_id_list, lambda x: x['result'][0], dict)
if not video:
results = self._download_json(
- 'https://www.funk.net/api/v3.0/content/videos/filter', channel_id,
- headers=headers, query={
+ 'https://www.funk.net/api/v3.0/content/videos/filter',
+ channel_id, 'Downloading filter JSON', headers=headers, query={
'channelId': channel_id,
'size': 100,
})['result']
from .peertube import PeerTubeIE
from .indavideo import IndavideoEmbedIE
from .apa import APAIE
+from .foxnews import FoxNewsIE
+from .viqeo import ViqeoIE
+from .expressen import ExpressenIE
class GenericIE(InfoExtractor):
'skip_download': True,
},
},
- # SVT embed
- {
- 'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun',
- 'info_dict': {
- 'id': '2900353',
- 'ext': 'flv',
- 'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)',
- 'duration': 27,
- 'age_limit': 0,
- },
- },
# Crooks and Liars embed
{
'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists',
},
'skip': 'TODO: fix nested playlists processing in tests',
},
+ {
+ # Viqeo embeds
+ 'url': 'https://viqeo.tv/',
+ 'info_dict': {
+ 'id': 'viqeo',
+ 'title': 'All-new video platform',
+ },
+ 'playlist_count': 6,
+ },
+ {
+ # videojs embed
+ 'url': 'https://video.sibnet.ru/shell.php?videoid=3422904',
+ 'info_dict': {
+ 'id': 'shell',
+ 'ext': 'mp4',
+ 'title': 'Доставщик пиццы спросил разрешения сыграть на фортепиано',
+ 'description': 'md5:89209cdc587dab1e4a090453dbaa2cb1',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'expected_warnings': ['Failed to download MPD manifest'],
+ },
# {
# # TODO: find another test
# # http://schema.org/VideoObject
return self.playlist_from_matches(
cloudflarestream_urls, video_id, video_title, ie=CloudflareStreamIE.ie_key())
- peertube_urls = PeerTubeIE._extract_urls(webpage)
+ peertube_urls = PeerTubeIE._extract_urls(webpage, url)
if peertube_urls:
return self.playlist_from_matches(
peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key())
return self.playlist_from_matches(
apa_urls, video_id, video_title, ie=APAIE.ie_key())
- sharevideos_urls = [mobj.group('url') for mobj in re.finditer(
+ foxnews_urls = FoxNewsIE._extract_urls(webpage)
+ if foxnews_urls:
+ return self.playlist_from_matches(
+ foxnews_urls, video_id, video_title, ie=FoxNewsIE.ie_key())
+
+ sharevideos_urls = [sharevideos_mobj.group('url') for sharevideos_mobj in re.finditer(
r'<iframe[^>]+?\bsrc\s*=\s*(["\'])(?P<url>(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1',
webpage)]
if sharevideos_urls:
return self.playlist_from_matches(
sharevideos_urls, video_id, video_title)
+ viqeo_urls = ViqeoIE._extract_urls(webpage)
+ if viqeo_urls:
+ return self.playlist_from_matches(
+ viqeo_urls, video_id, video_title, ie=ViqeoIE.ie_key())
+
+ expressen_urls = ExpressenIE._extract_urls(webpage)
+ if expressen_urls:
+ return self.playlist_from_matches(
+ expressen_urls, video_id, video_title, ie=ExpressenIE.ie_key())
+
# Look for HTML5 media
entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
if entries:
jwplayer_data = self._find_jwplayer_data(
webpage, video_id, transform_source=js_to_json)
if jwplayer_data:
- info = self._parse_jwplayer_data(
- jwplayer_data, video_id, require_title=False, base_url=url)
- return merge_dicts(info, info_dict)
+ try:
+ info = self._parse_jwplayer_data(
+ jwplayer_data, video_id, require_title=False, base_url=url)
+ return merge_dicts(info, info_dict)
+ except ExtractorError:
+ # See https://github.com/rg3/youtube-dl/pull/16735
+ pass
# Video.js embed
mobj = re.search(
'requestor_id': 'DisneyXD',
}
}
- _VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:(?:[^/]+/)*(?P<id>vdka\w+)|(?:[^/]+/)*(?P<display_id>[^/?#]+))' % '|'.join(_SITE_INFO.keys())
+ _VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:(?:[^/]+/)*(?P<id>vdka\w+)|(?:[^/]+/)*(?P<display_id>[^/?#]+))'\
+ % '|'.join(list(_SITE_INFO.keys()) + ['disneynow'])
_TESTS = [{
'url': 'http://abc.go.com/shows/designated-survivor/video/most-recent/VDKA3807643',
'info_dict': {
}, {
'url': 'http://abc.go.com/shows/world-news-tonight/episode-guide/2017-02/17-021717-intense-stand-off-between-man-with-rifle-and-police-in-oakland',
'only_matching': True,
+ }, {
+ # brand 004
+ 'url': 'http://disneynow.go.com/shows/big-hero-6-the-series/season-01/episode-10-mr-sparkles-loses-his-sparkle/vdka4637915',
+ 'only_matching': True,
+ }, {
+ # brand 008
+ 'url': 'http://disneynow.go.com/shows/minnies-bow-toons/video/happy-campers/vdka4872013',
+ 'only_matching': True,
}]
def _extract_videos(self, brand, video_id='-1', show_id='-1'):
def _real_extract(self, url):
sub_domain, video_id, display_id = re.match(self._VALID_URL, url).groups()
- site_info = self._SITE_INFO[sub_domain]
- brand = site_info['brand']
- if not video_id:
- webpage = self._download_webpage(url, display_id)
+ site_info = self._SITE_INFO.get(sub_domain, {})
+ brand = site_info.get('brand')
+ if not video_id or not site_info:
+ webpage = self._download_webpage(url, display_id or video_id)
video_id = self._search_regex(
# There may be inner quotes, e.g. data-video-id="'VDKA3609139'"
# from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood
- r'data-video-id=["\']*(VDKA\w+)', webpage, 'video id', default=None)
+ r'data-video-id=["\']*(VDKA\w+)', webpage, 'video id',
+ default=None)
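+ # Unknown subdomains (e.g. disneynow.go.com) carry the brand id in the page markup; 004 matches the disneynow URLs in the tests above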
+ if not site_info:
+ brand = self._search_regex(
+ (r'data-brand=\s*["\']\s*(\d+)',
+ r'data-page-brand=\s*["\']\s*(\d+)'), webpage, 'brand',
+ default='004')
+ site_info = next(
+ si for _, si in self._SITE_INFO.items()
+ if si.get('brand') == brand)
if not video_id:
# show extraction works for Disney, DisneyJunior and DisneyXD
# ABC and Freeform has different layout
import re
from .common import InfoExtractor
+from ..compat import compat_HTTPError
from ..utils import (
determine_ext,
ExtractorError,
class Go90IE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?go90\.com/videos/(?P<id>[0-9a-zA-Z]+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?go90\.com/(?:videos|embed)/(?P<id>[0-9a-zA-Z]+)'
+ _TESTS = [{
'url': 'https://www.go90.com/videos/84BUqjLpf9D',
'md5': 'efa7670dbbbf21a7b07b360652b24a32',
'info_dict': {
'upload_date': '20170411',
'age_limit': 14,
}
- }
+ }, {
+ 'url': 'https://www.go90.com/embed/261MflWkD3N',
+ 'only_matching': True,
+ }]
+ _GEO_BYPASS = False
def _real_extract(self, url):
video_id = self._match_id(url)
- video_data = self._download_json(
- 'https://www.go90.com/api/view/items/' + video_id,
- video_id, headers={
+
+ try:
+ headers = self.geo_verification_headers()
+ headers.update({
'Content-Type': 'application/json; charset=utf-8',
- }, data=b'{"client":"web","device_type":"pc"}')
+ })
+ video_data = self._download_json(
+ 'https://www.go90.com/api/view/items/' + video_id, video_id,
+ headers=headers, data=b'{"client":"web","device_type":"pc"}')
+ except ExtractorError as e:
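+ # The API answers geo restricted requests with HTTP 400 and a JSON error body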
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
+ message = self._parse_json(e.cause.read().decode(), None)['error']['message']
+ if 'region unavailable' in message:
+ self.raise_geo_restricted(countries=['US'])
+ raise ExtractorError(message, expected=True)
+ raise
+
if video_data.get('requires_drm'):
raise ExtractorError('This video is DRM protected.', expected=True)
main_video_asset = video_data['main_video_asset']
from ..utils import (
ExtractorError,
int_or_none,
+ url_or_none,
urlencode_postdata,
)
bitrates = rendition.get('bitrates')
if not isinstance(bitrates, dict):
continue
- m3u8_url = bitrates.get('hls')
- if not isinstance(m3u8_url, compat_str):
+ m3u8_url = url_or_none(bitrates.get('hls'))
+ if not m3u8_url:
continue
formats.extend(self._extract_m3u8_formats(
m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native',
if not isinstance(cc_file, list) or len(cc_file) < 3:
continue
cc_lang = cc_file[0]
- cc_url = cc_file[2]
- if not isinstance(cc_lang, compat_str) or not isinstance(
- cc_url, compat_str):
+ cc_url = url_or_none(cc_file[2])
+ if not isinstance(cc_lang, compat_str) or not cc_url:
continue
subtitles.setdefault(cc_lang, []).append({
'url': cc_url,
import re
from .common import InfoExtractor
-from ..compat import compat_str
from ..utils import (
determine_ext,
mimetype2ext,
parse_duration,
qualities,
+ url_or_none,
)
for encoding in video_metadata.get('encodings', []):
if not encoding or not isinstance(encoding, dict):
continue
- video_url = encoding.get('videoUrl')
- if not video_url or not isinstance(video_url, compat_str):
+ video_url = url_or_none(encoding.get('videoUrl'))
+ if not video_url:
continue
- ext = determine_ext(video_url, mimetype2ext(encoding.get('mimeType')))
+ ext = mimetype2ext(encoding.get(
+ 'mimeType')) or determine_ext(video_url)
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
video_url, video_id, 'mp4', entry_protocol='m3u8_native',
class ImgurIE(InfoExtractor):
- _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:gallery|(?:topic|r)/[^/]+)/)?(?P<id>[a-zA-Z0-9]{6,})(?:[/?#&]+|\.[a-z]+)?$'
+ _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:gallery|(?:topic|r)/[^/]+)/)?(?P<id>[a-zA-Z0-9]{6,})(?:[/?#&]+|\.[a-z0-9]+)?$'
_TESTS = [{
'url': 'https://i.imgur.com/A61SaA1.gifv',
}, {
'url': 'http://imgur.com/r/aww/VQcQPhM',
'only_matching': True,
+ }, {
+ 'url': 'https://i.imgur.com/crGpqCV.mp4',
+ 'only_matching': True,
}]
def _real_extract(self, url):
lowercase_escape,
std_headers,
try_get,
+ url_or_none,
)
node = try_get(edge, lambda x: x['node'], dict)
if not node:
continue
- node_video_url = try_get(node, lambda x: x['video_url'], compat_str)
+ node_video_url = url_or_none(node.get('video_url'))
if not node_video_url:
continue
entries.append({
class InternazionaleIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?internazionale\.it/video/(?:[^/]+/)*(?P<id>[^/?#&]+)'
- _TEST = {
+ _TESTS = [{
'url': 'https://www.internazionale.it/video/2015/02/19/richard-linklater-racconta-una-scena-di-boyhood',
'md5': '3e39d32b66882c1218e305acbf8348ca',
'info_dict': {
'params': {
'format': 'bestvideo',
},
- }
+ }, {
+ 'url': 'https://www.internazionale.it/video/2018/08/29/telefono-stare-con-noi-stessi',
+ 'md5': '9db8663704cab73eb972d1cee0082c79',
+ 'info_dict': {
+ 'id': '761344',
+ 'display_id': 'telefono-stare-con-noi-stessi',
+ 'ext': 'mp4',
+ 'title': 'Usiamo il telefono per evitare di stare con noi stessi',
+ 'description': 'md5:75ccfb0d6bcefc6e7428c68b4aa1fe44',
+ 'timestamp': 1535528954,
+ 'upload_date': '20180829',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ },
+ }]
def _real_extract(self, url):
display_id = self._match_id(url)
DATA_RE % 'job-id', webpage, 'video id', group='value')
video_path = self._search_regex(
DATA_RE % 'video-path', webpage, 'video path', group='value')
+ video_available_abroad = self._search_regex(
+ DATA_RE % 'video-available_abroad', webpage,
+ 'video available abroad', default='1', group='value')
+ video_available_abroad = video_available_abroad == '1'
- video_base = 'https://video.internazionale.it/%s/%s.' % (video_path, video_id)
+ video_base = 'https://video%s.internazionale.it/%s/%s.' % \
+ ('' if video_available_abroad else '-ita', video_path, video_id)
formats = self._extract_m3u8_formats(
video_base + 'm3u8', display_id, 'mp4',
class IPrimaIE(InfoExtractor):
- _VALID_URL = r'https?://play\.iprima\.cz/(?:.+/)?(?P<id>[^?#]+)'
+ _VALID_URL = r'https?://(?:play|prima)\.iprima\.cz/(?:.+/)?(?P<id>[^?#]+)'
_GEO_BYPASS = False
_TESTS = [{
# geo restricted
'url': 'http://play.iprima.cz/closer-nove-pripady/closer-nove-pripady-iv-1',
'only_matching': True,
+ }, {
+ # iframe api.play-backend.iprima.cz
+ 'url': 'https://prima.iprima.cz/my-little-pony/mapa-znameni-2-2',
+ 'only_matching': True,
+ }, {
+ # iframe prima.iprima.cz
+ 'url': 'https://prima.iprima.cz/porady/jak-se-stavi-sen/rodina-rathousova-praha',
+ 'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
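+ # Confirm the adult content check up front so the actual video page is served instead of the age gate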
+ self._set_cookie('play.iprima.cz', 'ott_adult_confirmed', '1')
+
webpage = self._download_webpage(url, video_id)
- video_id = self._search_regex(r'data-product="([^"]+)">', webpage, 'real id')
+ video_id = self._search_regex(
+ (r'<iframe[^>]+\bsrc=["\'](?:https?:)?//(?:api\.play-backend\.iprima\.cz/prehravac/embedded|prima\.iprima\.cz/[^/]+/[^/]+)\?.*?\bid=(p\d+)',
+ r'data-product="([^"]+)">'),
+ webpage, 'real id')
playerpage = self._download_webpage(
'http://play.iprima.cz/prehravac/init',
compat_etree_register_namespace,
)
from ..utils import (
+ determine_ext,
+ ExtractorError,
extract_attributes,
- xpath_with_ns,
- xpath_element,
- xpath_text,
int_or_none,
+ merge_dicts,
parse_duration,
smuggle_url,
- ExtractorError,
- determine_ext,
+ url_or_none,
+ xpath_with_ns,
+ xpath_element,
+ xpath_text,
)
resp_env = self._download_xml(
params['data-playlist-url'], video_id,
- headers=headers, data=etree.tostring(req_env))
- playlist = xpath_element(resp_env, './/Playlist')
- if playlist is None:
- fault_code = xpath_text(resp_env, './/faultcode')
- fault_string = xpath_text(resp_env, './/faultstring')
- if fault_code == 'InvalidGeoRegion':
- self.raise_geo_restricted(
- msg=fault_string, countries=self._GEO_COUNTRIES)
- elif fault_code not in (
- 'InvalidEntity', 'InvalidVodcrid', 'ContentUnavailable'):
- raise ExtractorError(
- '%s said: %s' % (self.IE_NAME, fault_string), expected=True)
- info.update({
- 'title': self._og_search_title(webpage),
- 'episode_title': params.get('data-video-episode'),
- 'series': params.get('data-video-title'),
- })
- else:
- title = xpath_text(playlist, 'EpisodeTitle', default=None)
- info.update({
- 'title': title,
- 'episode_title': title,
- 'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')),
- 'series': xpath_text(playlist, 'ProgrammeTitle'),
- 'duration': parse_duration(xpath_text(playlist, 'Duration')),
- })
- video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True)
- media_files = xpath_element(video_element, 'MediaFiles', fatal=True)
- rtmp_url = media_files.attrib['base']
+ headers=headers, data=etree.tostring(req_env), fatal=False)
+ if resp_env:
+ playlist = xpath_element(resp_env, './/Playlist')
+ if playlist is None:
+ fault_code = xpath_text(resp_env, './/faultcode')
+ fault_string = xpath_text(resp_env, './/faultstring')
+ if fault_code == 'InvalidGeoRegion':
+ self.raise_geo_restricted(
+ msg=fault_string, countries=self._GEO_COUNTRIES)
+ elif fault_code not in (
+ 'InvalidEntity', 'InvalidVodcrid', 'ContentUnavailable'):
+ raise ExtractorError(
+ '%s said: %s' % (self.IE_NAME, fault_string), expected=True)
+ info.update({
+ 'title': self._og_search_title(webpage),
+ 'episode_title': params.get('data-video-episode'),
+ 'series': params.get('data-video-title'),
+ })
+ else:
+ title = xpath_text(playlist, 'EpisodeTitle', default=None)
+ info.update({
+ 'title': title,
+ 'episode_title': title,
+ 'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')),
+ 'series': xpath_text(playlist, 'ProgrammeTitle'),
+ 'duration': parse_duration(xpath_text(playlist, 'Duration')),
+ })
+ video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True)
+ media_files = xpath_element(video_element, 'MediaFiles', fatal=True)
+ rtmp_url = media_files.attrib['base']
- for media_file in media_files.findall('MediaFile'):
- play_path = xpath_text(media_file, 'URL')
- if not play_path:
- continue
- tbr = int_or_none(media_file.get('bitrate'), 1000)
- f = {
- 'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''),
- 'play_path': play_path,
- # Providing this swfVfy allows to avoid truncated downloads
- 'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf',
- 'page_url': url,
- 'tbr': tbr,
- 'ext': 'flv',
- }
- app = self._search_regex(
- 'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None)
- if app:
- f.update({
- 'url': rtmp_url.split('?', 1)[0],
- 'app': app,
- })
- else:
- f['url'] = rtmp_url
- formats.append(f)
+ for media_file in media_files.findall('MediaFile'):
+ play_path = xpath_text(media_file, 'URL')
+ if not play_path:
+ continue
+ tbr = int_or_none(media_file.get('bitrate'), 1000)
+ f = {
+ 'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''),
+ 'play_path': play_path,
+ # Providing this swfVfy allows to avoid truncated downloads
+ 'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf',
+ 'page_url': url,
+ 'tbr': tbr,
+ 'ext': 'flv',
+ }
+ app = self._search_regex(
+ 'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None)
+ if app:
+ f.update({
+ 'url': rtmp_url.split('?', 1)[0],
+ 'app': app,
+ })
+ else:
+ f['url'] = rtmp_url
+ formats.append(f)
- for caption_url in video_element.findall('ClosedCaptioningURIs/URL'):
- if caption_url.text:
- extract_subtitle(caption_url.text)
+ for caption_url in video_element.findall('ClosedCaptioningURIs/URL'):
+ if caption_url.text:
+ extract_subtitle(caption_url.text)
ios_playlist_url = params.get('data-video-playlist') or params.get('data-video-id')
hmac = params.get('data-video-hmac')
for sub in subs:
if not isinstance(sub, dict):
continue
- href = sub.get('Href')
- if isinstance(href, compat_str):
+ href = url_or_none(sub.get('Href'))
+ if href:
extract_subtitle(href)
if not info.get('duration'):
info['duration'] = parse_duration(video_data.get('Duration'))
'formats': formats,
'subtitles': subtitles,
})
- return info
+
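+ # Merge in JSON-LD and other on-page metadata as a fallback for fields the playlist API does not provide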
+ webpage_info = self._search_json_ld(webpage, video_id, default={})
+ if not webpage_info.get('title'):
+ webpage_info['title'] = self._html_search_regex(
+ r'(?s)<h\d+[^>]+\bclass=["\'][^>]*episode-title["\'][^>]*>([^<]+)<',
+ webpage, 'title', default=None) or self._og_search_title(
+ webpage, default=None) or self._html_search_meta(
+ 'twitter:title', webpage, 'title',
+ default=None) or webpage_info['episode']
+
+ return merge_dicts(info, webpage_info)
class ITVBTCCIE(InfoExtractor):
int_or_none,
mimetype2ext,
remove_end,
+ url_or_none,
)
formats = []
for a_format in video_data:
+ format_uri = url_or_none(a_format.get('uri'))
+ if not format_uri:
+ continue
format_id = a_format.get('resolution')
height = int_or_none(self._search_regex(
r'(\d+)p', format_id, 'height', default=None))
formats.append({
- 'url': a_format['uri'],
+ 'url': self._proto_relative_url(format_uri, 'https:'),
'format_id': format_id,
'ext': mimetype2ext(a_format.get('mime')) or 'mp4',
'height': height,
joj:|
https?://media\.joj\.sk/embed/
)
- (?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})
+ (?P<id>[^/?#^]+)
'''
_TESTS = [{
'url': 'https://media.joj.sk/embed/a388ec4c-6019-4a4a-9312-b1bee194e932',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 3118,
}
+ }, {
+ 'url': 'https://media.joj.sk/embed/9i1cxv',
+ 'only_matching': True,
}, {
'url': 'joj:a388ec4c-6019-4a4a-9312-b1bee194e932',
'only_matching': True,
+ }, {
+ 'url': 'joj:9i1cxv',
+ 'only_matching': True,
}]

@staticmethod
def _extract_urls(webpage):
- return re.findall(
- r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//media\.joj\.sk/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})',
- webpage)
+ return [
+ mobj.group('url')
+ for mobj in re.finditer(
+ r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//media\.joj\.sk/embed/(?:(?!\1).)+)\1',
+ webpage)]

def _real_extract(self, url):
video_id = self._match_id(url)
from .common import InfoExtractor
from ..aes import aes_decrypt_text
-from ..compat import (
- compat_str,
- compat_urllib_parse_unquote,
-)
+from ..compat import compat_urllib_parse_unquote
from ..utils import (
determine_ext,
ExtractorError,
int_or_none,
str_to_int,
strip_or_none,
+ url_or_none,
)
encrypted = False
def extract_format(format_url, height=None):
- if not isinstance(format_url, compat_str) or not format_url.startswith(('http', '//')):
+ format_url = url_or_none(format_url)
+ if not format_url or not format_url.startswith(('http', '//')):
return
if format_url in format_urls:
return
--- /dev/null
+++ b/youtube_dl/extractor/kinopoisk.py
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ dict_get,
+ int_or_none,
+)
+
+
+class KinoPoiskIE(InfoExtractor):
+ _GEO_COUNTRIES = ['RU']
+ _VALID_URL = r'https?://(?:www\.)?kinopoisk\.ru/film/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://www.kinopoisk.ru/film/81041/watch/',
+ 'md5': '4f71c80baea10dfa54a837a46111d326',
+ 'info_dict': {
+ 'id': '81041',
+ 'ext': 'mp4',
+ 'title': 'Алеша попович и тугарин змей',
+ 'description': 'md5:43787e673d68b805d0aa1df5a5aea701',
+ 'thumbnail': r're:^https?://.*',
+ 'duration': 4533,
+ 'age_limit': 12,
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ },
+ }, {
+ 'url': 'https://www.kinopoisk.ru/film/81041',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(
+ 'https://ott-widget.kinopoisk.ru/v1/kp/', video_id,
+ query={'kpId': video_id})
+
+ data = self._parse_json(
+ self._search_regex(
+ r'(?s)<script[^>]+\btype=["\']application/json[^>]+>(.+?)<',
+ webpage, 'data'),
+ video_id)['models']
+
+ film = data['filmStatus']
+ title = film.get('title') or film['originalTitle']
+
+ formats = self._extract_m3u8_formats(
+ data['playlistEntity']['uri'], video_id, 'mp4',
+ entry_protocol='m3u8_native', m3u8_id='hls')
+ self._sort_formats(formats)
+
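+ # the API apparently returns misspelled keys, so try both variants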
+ description = dict_get(
+ film, ('descriptscription', 'description',
+ 'shortDescriptscription', 'shortDescription'))
+ thumbnail = film.get('coverUrl') or film.get('posterUrl')
+ duration = int_or_none(film.get('duration'))
+ age_limit = int_or_none(film.get('restrictionAge'))
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'age_limit': age_limit,
+ 'formats': formats,
+ }
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import compat_str
from ..utils import (
determine_ext,
float_or_none,
int_or_none,
+ url_or_none,
)
captions = source.get('captionsAvailableLanguages')
if isinstance(captions, dict):
for lang, subtitle_url in captions.items():
- if lang != 'none' and isinstance(subtitle_url, compat_str):
+ subtitle_url = url_or_none(subtitle_url)
+ if lang != 'none' and subtitle_url:
subtitles.setdefault(lang, []).append({'url': subtitle_url})
return {
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- wat_id = self._search_regex(r'data-watid=[\'"](\d+)', webpage, 'wat id')
+ wat_id = self._search_regex(
+ (r'data-watid=[\'"](\d+)', r'idwat["\']?\s*:\s*["\']?(\d+)'),
+ webpage, 'wat id')
return self.url_result('wat:' + wat_id, 'Wat', wat_id)
from .common import InfoExtractor
from ..compat import (
- compat_HTTPError,
compat_str,
compat_urlparse,
)
form_data = self._hidden_inputs(form_html)
form_data.update(extra_form_data)
- try:
- response = self._download_json(
- action_url, None, note,
- data=urlencode_postdata(form_data),
- headers={
- 'Referer': referrer_url,
- 'X-Requested-With': 'XMLHttpRequest',
- })
- except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500:
- response = self._parse_json(e.cause.read().decode('utf-8'), None)
- self._check_error(response, ('email', 'password'))
- raise
-
- self._check_error(response, 'ErrorMessage')
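+ # Error responses (418, 500) still carry a JSON body with the error details, so treat those statuses as expected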
+ response = self._download_json(
+ action_url, None, note,
+ data=urlencode_postdata(form_data),
+ headers={
+ 'Referer': referrer_url,
+ 'X-Requested-With': 'XMLHttpRequest',
+ }, expected_status=(418, 500, ))
+
+ self._check_error(response, ('email', 'password', 'ErrorMessage'))
return response, action_url
def _real_extract(self, url):
playlist_id = self._match_id(url)
- webpage = self._download_webpage(url, playlist_id)
+ webpage = self._download_webpage(
+ # Downloading for some hosts (e.g. dajto, doma) fails with 500
+ # although everything seems to be OK, so considering 500
+ # status code to be expected.
+ url, playlist_id, expected_status=500)
entries = [
self.url_result('http://videoarchiv.markiza.sk/video/%s' % video_id)
import re
-from .common import InfoExtractor
-from ..compat import compat_str
+from .theplatform import ThePlatformBaseIE
from ..utils import (
- determine_ext,
- parse_duration,
- try_get,
- unified_strdate,
+ ExtractorError,
+ int_or_none,
+ update_url_query,
)
-class MediasetIE(InfoExtractor):
+class MediasetIE(ThePlatformBaseIE):
+ _TP_TLD = 'eu'
_VALID_URL = r'''(?x)
(?:
mediaset:|
https?://
- (?:www\.)?video\.mediaset\.it/
+ (?:(?:www|static3)\.)?mediasetplay\.mediaset\.it/
(?:
(?:video|on-demand)/(?:[^/]+/)+[^/]+_|
- player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid=
+ player/index\.html\?.*?\bprogramGuid=
)
- )(?P<id>[0-9]+)
+ )(?P<id>[0-9A-Z]{16})
'''
_TESTS = [{
# full episode
- 'url': 'http://www.video.mediaset.it/video/hello_goodbye/full/quarta-puntata_661824.html',
+ 'url': 'https://www.mediasetplay.mediaset.it/video/hellogoodbye/quarta-puntata_FAFU000000661824',
'md5': '9b75534d42c44ecef7bf1ffeacb7f85d',
'info_dict': {
- 'id': '661824',
+ 'id': 'FAFU000000661824',
'ext': 'mp4',
'title': 'Quarta puntata',
- 'description': 'md5:7183696d6df570e3412a5ef74b27c5e2',
+ 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
'thumbnail': r're:^https?://.*\.jpg$',
- 'duration': 1414,
- 'creator': 'mediaset',
+ 'duration': 1414.26,
'upload_date': '20161107',
'series': 'Hello Goodbye',
- 'categories': ['reality'],
+ 'timestamp': 1478532900,
+ 'uploader': 'Rete 4',
+ 'uploader_id': 'R4',
},
- 'expected_warnings': ['is not a supported codec'],
+ }, {
+ 'url': 'https://www.mediasetplay.mediaset.it/video/matrix/puntata-del-25-maggio_F309013801000501',
+ 'md5': '288532f0ad18307705b01e581304cd7b',
+ 'info_dict': {
+ 'id': 'F309013801000501',
+ 'ext': 'mp4',
+ 'title': 'Puntata del 25 maggio',
+ 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 6565.007,
+ 'upload_date': '20180526',
+ 'series': 'Matrix',
+ 'timestamp': 1527326245,
+ 'uploader': 'Canale 5',
+ 'uploader_id': 'C5',
+ },
+ 'expected_warnings': ['HTTP Error 403: Forbidden'],
}, {
# clip
- 'url': 'http://www.video.mediaset.it/video/gogglebox/clip/un-grande-classico-della-commedia-sexy_661680.html',
+ 'url': 'https://www.mediasetplay.mediaset.it/video/gogglebox/un-grande-classico-della-commedia-sexy_FAFU000000661680',
'only_matching': True,
}, {
# iframe simple
- 'url': 'http://www.video.mediaset.it/player/playerIFrame.shtml?id=665924&autoplay=true',
+ 'url': 'https://static3.mediasetplay.mediaset.it/player/index.html?appKey=5ad3966b1de1c4000d5cec48&programGuid=FAFU000000665924&id=665924',
'only_matching': True,
}, {
# iframe twitter (from http://www.wittytv.it/se-prima-mi-fidavo-zero/)
- 'url': 'https://www.video.mediaset.it/player/playerIFrameTwitter.shtml?id=665104&playrelated=false&autoplay=false&related=true&hidesocial=true',
+ 'url': 'https://static3.mediasetplay.mediaset.it/player/index.html?appKey=5ad3966b1de1c4000d5cec48&programGuid=FAFU000000665104&id=665104',
'only_matching': True,
}, {
- 'url': 'mediaset:661824',
+ 'url': 'mediaset:FAFU000000665924',
'only_matching': True,
}]
webpage)]
def _real_extract(self, url):
- video_id = self._match_id(url)
-
- video_list = self._download_json(
- 'http://cdnsel01.mediaset.net/GetCdn.aspx',
- video_id, 'Downloading video CDN JSON', query={
- 'streamid': video_id,
- 'format': 'json',
- })['videoList']
+ guid = self._match_id(url)
+ tp_path = 'PR1GhC/media/guid/2702976343/' + guid
+ info = self._extract_theplatform_metadata(tp_path, guid)
formats = []
- for format_url in video_list:
- if '.ism' in format_url:
- formats.extend(self._extract_ism_formats(
- format_url, video_id, ism_id='mss', fatal=False))
- else:
- formats.append({
- 'url': format_url,
- 'format_id': determine_ext(format_url),
- })
+ subtitles = {}
+ first_e = None
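+ # Try every assetTypes/formats combination; remember the first failure so it can be re-raised if none of them yields formats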
+ for asset_type in ('SD', 'HD'):
+ for f in ('MPEG4', 'MPEG-DASH', 'M3U', 'ISM'):
+ try:
+ tp_formats, tp_subtitles = self._extract_theplatform_smil(
+ update_url_query('http://link.theplatform.%s/s/%s' % (self._TP_TLD, tp_path), {
+ 'mbr': 'true',
+ 'formats': f,
+ 'assetTypes': asset_type,
+ }), guid, 'Downloading %s %s SMIL data' % (f, asset_type))
+ except ExtractorError as e:
+ if not first_e:
+ first_e = e
+ break
+ for tp_f in tp_formats:
+ tp_f['quality'] = 1 if asset_type == 'HD' else 0
+ formats.extend(tp_formats)
+ subtitles = self._merge_subtitles(subtitles, tp_subtitles)
+ if first_e and not formats:
+ raise first_e
self._sort_formats(formats)
- mediainfo = self._download_json(
- 'http://plr.video.mediaset.it/html/metainfo.sjson',
- video_id, 'Downloading video info JSON', query={
- 'id': video_id,
- })['video']
-
- title = mediainfo['title']
-
- creator = try_get(
- mediainfo, lambda x: x['brand-info']['publisher'], compat_str)
- category = try_get(
- mediainfo, lambda x: x['brand-info']['category'], compat_str)
- categories = [category] if category else None
+ fields = []
+ for templ, repls in (('tvSeason%sNumber', ('', 'Episode')), ('mediasetprogram$%s', ('brandTitle', 'numberOfViews', 'publishInfo'))):
+ fields.extend(templ % repl for repl in repls)
+ feed_data = self._download_json(
+ 'https://feed.entertainment.tv.theplatform.eu/f/PR1GhC/mediaset-prod-all-programs/guid/-/' + guid,
+ guid, fatal=False, query={'fields': ','.join(fields)})
+ if feed_data:
+ publish_info = feed_data.get('mediasetprogram$publishInfo') or {}
+ info.update({
+ 'episode_number': int_or_none(feed_data.get('tvSeasonEpisodeNumber')),
+ 'season_number': int_or_none(feed_data.get('tvSeasonNumber')),
+ 'series': feed_data.get('mediasetprogram$brandTitle'),
+ 'uploader': publish_info.get('description'),
+ 'uploader_id': publish_info.get('channel'),
+ 'view_count': int_or_none(feed_data.get('mediasetprogram$numberOfViews')),
+ })
- return {
- 'id': video_id,
- 'title': title,
- 'description': mediainfo.get('short-description'),
- 'thumbnail': mediainfo.get('thumbnail'),
- 'duration': parse_duration(mediainfo.get('duration')),
- 'creator': creator,
- 'upload_date': unified_strdate(mediainfo.get('production-date')),
- 'webpage_url': mediainfo.get('url'),
- 'series': mediainfo.get('brand-value'),
- 'categories': categories,
+ info.update({
+ 'id': guid,
'formats': formats,
- }
+ 'subtitles': subtitles,
+ })
+ return info
mimetype2ext,
unescapeHTML,
unsmuggle_url,
+ url_or_none,
urljoin,
)
stream_formats = []
for unum, VideoUrl in enumerate(video_urls):
- video_url = VideoUrl.get('Location')
- if not video_url or not isinstance(video_url, compat_str):
+ video_url = url_or_none(VideoUrl.get('Location'))
+ if not video_url:
continue
# XXX: if Stream.get('CanChangeScheme', False), switch scheme to HTTP/HTTPS
# coding: utf-8
from __future__ import unicode_literals
-import json
-import uuid
-
from .common import InfoExtractor
-from .ooyala import OoyalaIE
-from ..compat import (
- compat_str,
- compat_urlparse,
-)
from ..utils import (
int_or_none,
- extract_attributes,
- determine_ext,
smuggle_url,
parse_duration,
)
-class MiTeleBaseIE(InfoExtractor):
- def _get_player_info(self, url, webpage):
- player_data = extract_attributes(self._search_regex(
- r'(?s)(<ms-video-player.+?</ms-video-player>)',
- webpage, 'ms video player'))
- video_id = player_data['data-media-id']
- if player_data.get('data-cms-id') == 'ooyala':
- return self.url_result(
- 'ooyala:%s' % video_id, ie=OoyalaIE.ie_key(), video_id=video_id)
- config_url = compat_urlparse.urljoin(url, player_data['data-config'])
- config = self._download_json(
- config_url, video_id, 'Downloading config JSON')
- mmc_url = config['services']['mmc']
-
- duration = None
- formats = []
- for m_url in (mmc_url, mmc_url.replace('/flash.json', '/html5.json')):
- mmc = self._download_json(
- m_url, video_id, 'Downloading mmc JSON')
- if not duration:
- duration = int_or_none(mmc.get('duration'))
- for location in mmc['locations']:
- gat = self._proto_relative_url(location.get('gat'), 'http:')
- gcp = location.get('gcp')
- ogn = location.get('ogn')
- if None in (gat, gcp, ogn):
- continue
- token_data = {
- 'gcp': gcp,
- 'ogn': ogn,
- 'sta': 0,
- }
- media = self._download_json(
- gat, video_id, data=json.dumps(token_data).encode('utf-8'),
- headers={
- 'Content-Type': 'application/json;charset=utf-8',
- 'Referer': url,
- })
- stream = media.get('stream') or media.get('file')
- if not stream:
- continue
- ext = determine_ext(stream)
- if ext == 'f4m':
- formats.extend(self._extract_f4m_formats(
- stream + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18',
- video_id, f4m_id='hds', fatal=False))
- elif ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- stream, video_id, 'mp4', 'm3u8_native',
- m3u8_id='hls', fatal=False))
- self._sort_formats(formats)
-
- return {
- 'id': video_id,
- 'formats': formats,
- 'thumbnail': player_data.get('data-poster') or config.get('poster', {}).get('imageUrl'),
- 'duration': duration,
- }
-
-
class MiTeleIE(InfoExtractor):
IE_DESC = 'mitele.es'
_VALID_URL = r'https?://(?:www\.)?mitele\.es/(?:[^/]+/)+(?P<id>[^/]+)/player'
_TESTS = [{
'url': 'http://www.mitele.es/programas-tv/diario-de/57b0dfb9c715da65618b4afa/player',
'info_dict': {
- 'id': '57b0dfb9c715da65618b4afa',
+ 'id': 'FhYW1iNTE6J6H7NkQRIEzfne6t2quqPg',
'ext': 'mp4',
'title': 'Tor, la web invisible',
'description': 'md5:3b6fce7eaa41b2d97358726378d9369f',
# no explicit title
'url': 'http://www.mitele.es/programas-tv/cuarto-milenio/57b0de3dc915da14058b4876/player',
'info_dict': {
- 'id': '57b0de3dc915da14058b4876',
+ 'id': 'oyNG1iNTE6TAPP-JmCjbwfwJqqMMX3Vq',
'ext': 'mp4',
'title': 'Cuarto Milenio Temporada 6 Programa 226',
'description': 'md5:5ff132013f0cd968ffbf1f5f3538a65f',
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
-
- gigya_url = self._search_regex(
- r'<gigya-api>[^>]*</gigya-api>[^>]*<script\s+src="([^"]*)">[^>]*</script>',
- webpage, 'gigya', default=None)
- gigya_sc = self._download_webpage(
- compat_urlparse.urljoin('http://www.mitele.es/', gigya_url),
- video_id, 'Downloading gigya script')
-
- # Get a appKey/uuid for getting the session key
- appKey = self._search_regex(
- r'constant\s*\(\s*["\']_appGridApplicationKey["\']\s*,\s*["\']([0-9a-f]+)',
- gigya_sc, 'appKey')
-
- session_json = self._download_json(
- 'https://appgrid-api.cloud.accedo.tv/session',
- video_id, 'Downloading session keys', query={
- 'appKey': appKey,
- 'uuid': compat_str(uuid.uuid4()),
- })
paths = self._download_json(
- 'https://appgrid-api.cloud.accedo.tv/metadata/general_configuration,%20web_configuration',
- video_id, 'Downloading paths JSON',
- query={'sessionKey': compat_str(session_json['sessionKey'])})
+ 'https://www.mitele.es/amd/agp/web/metadata/general_configuration',
+ video_id, 'Downloading paths JSON')
ooyala_s = paths['general_configuration']['api_configuration']['ooyala_search']
+ base_url = ooyala_s.get('base_url', 'cdn-search-mediaset.carbyne.ps.ooyala.com')
+ full_path = ooyala_s.get('full_path', '/search/v1/full/providers/')
source = self._download_json(
- 'http://%s%s%s/docs/%s' % (
- ooyala_s['base_url'], ooyala_s['full_path'],
- ooyala_s['provider_id'], video_id),
+ '%s://%s%s%s/docs/%s' % (
+ ooyala_s.get('protocol', 'https'), base_url, full_path,
+ ooyala_s.get('provider_id', '104951'), video_id),
video_id, 'Downloading data JSON', query={
'include_titles': 'Series,Season',
- 'product_name': 'test',
+ 'product_name': ooyala_s.get('product_name', 'test'),
'format': 'full',
})['hits']['hits'][0]['_source']
title = self._html_search_regex(
r'id="view-upload-title">\s+([^<]+)<', webpage, 'title')
- video_url = self._html_search_regex(
- r'setup\(\{\s+"file".+: "([^"]+)",', webpage, 'video URL')
+ video_url = (self._html_search_regex(
+ (r'setup\(\{\s*["\']file["\']\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1',
+ r'fileurl\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1'),
+ webpage, 'video URL', default=None, group='url') or
+ 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id)
age_limit = self._rta_search(webpage)
view_count = str_to_int(self._html_search_regex(
r'<strong>Views</strong>\s+([^<]+)<',
class MotherlessGroupIE(InfoExtractor):
- _VALID_URL = 'https?://(?:www\.)?motherless\.com/gv?/(?P<id>[a-z0-9_]+)'
+ _VALID_URL = r'https?://(?:www\.)?motherless\.com/gv?/(?P<id>[a-z0-9_]+)'
_TESTS = [{
'url': 'http://motherless.com/g/movie_scenes',
'info_dict': {
if not entries:
entries = [
self.url_result(
- compat_urlparse.urljoin(base, '/' + video_id),
- ie=MotherlessIE.ie_key(), video_id=video_id)
- for video_id in orderedSet(re.findall(
+ compat_urlparse.urljoin(base, '/' + entry_id),
+ ie=MotherlessIE.ie_key(), video_id=entry_id)
+ for entry_id in orderedSet(re.findall(
r'data-codename=["\']([A-Z0-9]+)', webpage))]
return entries
from .common import InfoExtractor
from .theplatform import ThePlatformIE
from .adobepass import AdobePassIE
+from ..compat import compat_urllib_parse_unquote
from ..utils import (
find_xpath_attr,
smuggle_url,
'url': 'https://www.nbc.com/classic-tv/charles-in-charge/video/charles-in-charge-pilot/n3310',
'only_matching': True,
},
+ {
+ # Percent escaped url
+ 'url': 'https://www.nbc.com/up-all-night/video/day-after-valentine%27s-day/n2189',
+ 'only_matching': True,
+ }
]
def _real_extract(self, url):
permalink, video_id = re.match(self._VALID_URL, url).groups()
- permalink = 'http' + permalink
+ permalink = 'http' + compat_urllib_parse_unquote(permalink)
response = self._download_json(
'https://api.nbc.com/v3/videos', video_id, query={
'filter[permalink]': permalink,
},
'timing_constraint': 'unlimited'
}
- }))
+ }).encode())
resolution = video_quality.get('resolution', {})
from .common import InfoExtractor
from ..utils import (
clean_html,
+ int_or_none,
+ js_to_json,
+ qualities,
unified_strdate,
+ url_or_none,
)
+class NovaEmbedIE(InfoExtractor):
+ _VALID_URL = r'https?://media\.cms\.nova\.cz/embed/(?P<id>[^/?#&]+)'
+ _TEST = {
+ 'url': 'https://media.cms.nova.cz/embed/8o0n0r?autoplay=1',
+ 'md5': 'b3834f6de5401baabf31ed57456463f7',
+ 'info_dict': {
+ 'id': '8o0n0r',
+ 'ext': 'mp4',
+ 'title': '2180. díl',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'duration': 2578,
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ bitrates = self._parse_json(
+ self._search_regex(
+ r'(?s)bitrates\s*=\s*({.+?})\s*;', webpage, 'formats'),
+ video_id, transform_source=js_to_json)
+
+ QUALITIES = ('lq', 'mq', 'hq', 'hd')
+ quality_key = qualities(QUALITIES)
+
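+ # The quality of a format is only discoverable from the '<quality>.mp4' token embedded in its URL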
+ formats = []
+ for format_id, format_list in bitrates.items():
+ if not isinstance(format_list, list):
+ continue
+ for format_url in format_list:
+ format_url = url_or_none(format_url)
+ if not format_url:
+ continue
+ f = {
+ 'url': format_url,
+ }
+ f_id = format_id
+ for quality in QUALITIES:
+ if '%s.mp4' % quality in format_url:
+ f_id += '-%s' % quality
+ f.update({
+ 'quality': quality_key(quality),
+ 'format_note': quality.upper(),
+ })
+ break
+ f['format_id'] = f_id
+ formats.append(f)
+ self._sort_formats(formats)
+
+ title = self._og_search_title(
+ webpage, default=None) or self._search_regex(
+ (r'<value>(?P<title>[^<]+)',
+ r'videoTitle\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage,
+ 'title', group='value')
+ thumbnail = self._og_search_thumbnail(
+ webpage, default=None) or self._search_regex(
+ r'poster\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage,
+ 'thumbnail', fatal=False, group='value')
+ duration = int_or_none(self._search_regex(
+ r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False))
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'formats': formats,
+ }
+
+
class NovaIE(InfoExtractor):
IE_DESC = 'TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz'
_VALID_URL = r'https?://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+?)(?:\.html|/|$)'
_TESTS = [{
- 'url': 'http://tvnoviny.nova.cz/clanek/novinky/co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou.html?utm_source=tvnoviny&utm_medium=cpfooter&utm_campaign=novaplus',
- 'info_dict': {
- 'id': '1608920',
- 'display_id': 'co-na-sebe-sportaci-praskli-vime-jestli-pujde-hrdlicka-na-materskou',
- 'ext': 'flv',
- 'title': 'Duel: Michal Hrdlička a Petr Suchoň',
- 'description': 'md5:d0cc509858eee1b1374111c588c6f5d5',
- 'thumbnail': r're:^https?://.*\.(?:jpg)',
- },
- 'params': {
- # rtmp download
- 'skip_download': True,
- }
- }, {
'url': 'http://tn.nova.cz/clanek/tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci.html#player_13260',
'md5': '1dd7b9d5ea27bc361f110cd855a19bd3',
'info_dict': {
'description': 'md5:f0a42dd239c26f61c28f19e62d20ef53',
'thumbnail': r're:^https?://.*\.(?:jpg)',
}
- }, {
- 'url': 'http://novaplus.nova.cz/porad/policie-modrava/video/5591-policie-modrava-15-dil-blondynka-na-hrbitove',
- 'info_dict': {
- 'id': '1756825',
- 'display_id': '5591-policie-modrava-15-dil-blondynka-na-hrbitove',
- 'ext': 'flv',
- 'title': 'Policie Modrava - 15. díl - Blondýnka na hřbitově',
- 'description': 'md5:dc24e50be5908df83348e50d1431295e', # Make sure this description is clean of html tags
- 'thumbnail': r're:^https?://.*\.(?:jpg)',
- },
- 'params': {
- # rtmp download
- 'skip_download': True,
- }
- }, {
- 'url': 'http://novaplus.nova.cz/porad/televizni-noviny/video/5585-televizni-noviny-30-5-2015/',
- 'info_dict': {
- 'id': '1756858',
- 'ext': 'flv',
- 'title': 'Televizní noviny - 30. 5. 2015',
- 'thumbnail': r're:^https?://.*\.(?:jpg)',
- 'upload_date': '20150530',
- },
- 'params': {
- # rtmp download
- 'skip_download': True,
- }
}, {
'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html',
'info_dict': {
# rtmp download
'skip_download': True,
}
+ }, {
+ # media.cms.nova.cz embed
+ 'url': 'https://novaplus.nova.cz/porad/ulice/epizoda/18760-2180-dil',
+ 'info_dict': {
+ 'id': '8o0n0r',
+ 'ext': 'mp4',
+ 'title': '2180. díl',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'duration': 2578,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'add_ie': [NovaEmbedIE.ie_key()],
}, {
'url': 'http://sport.tn.nova.cz/clanek/sport/hokej/nhl/zivot-jde-dal-hodnotil-po-vyrazeni-z-playoff-jiri-sekac.html',
'only_matching': True,
webpage = self._download_webpage(url, display_id)
+ # novaplus
+ embed_id = self._search_regex(
+ r'<iframe[^>]+\bsrc=["\'](?:https?:)?//media\.cms\.nova\.cz/embed/([^/?#&]+)',
+ webpage, 'embed url', default=None)
+ if embed_id:
+ return self.url_result(
+ 'https://media.cms.nova.cz/embed/%s' % embed_id,
+ ie=NovaEmbedIE.ie_key(), video_id=embed_id)
+
video_id = self._search_regex(
[r"(?:media|video_id)\s*:\s*'(\d+)'",
r'media=(\d+)',
webpage, 'video id')
config_url = self._search_regex(
- r'src="(http://tn\.nova\.cz/bin/player/videojs/config\.php\?[^"]+)"',
+ r'src="(https?://(?:tn|api)\.nova\.cz/bin/player/videojs/config\.php\?[^"]+)"',
webpage, 'config url', default=None)
+ config_params = {}
+
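+ # Newer pages bootstrap the player from JS; recover configUrl and configParams from the Player(...) invocation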
+ if not config_url:
+ player = self._parse_json(
+ self._search_regex(
+ r'(?s)Player\s*\(.+?\s*,\s*({.+?\bmedia\b["\']?\s*:\s*["\']?\d+.+?})\s*\)', webpage,
+ 'player', default='{}'),
+ video_id, transform_source=js_to_json, fatal=False)
+ if player:
+ config_url = url_or_none(player.get('configUrl'))
+ params = player.get('configParams')
+ if isinstance(params, dict):
+ config_params = params
if not config_url:
DEFAULT_SITE_ID = '23000'
}
site_id = self._search_regex(
- r'site=(\d+)', webpage, 'site id', default=None) or SITES.get(site, DEFAULT_SITE_ID)
+ r'site=(\d+)', webpage, 'site id', default=None) or SITES.get(
+ site, DEFAULT_SITE_ID)
- config_url = ('http://tn.nova.cz/bin/player/videojs/config.php?site=%s&media=%s&jsVar=vjsconfig'
- % (site_id, video_id))
+ config_url = 'https://api.nova.cz/bin/player/videojs/config.php'
+ config_params = {
+ 'site': site_id,
+ 'media': video_id,
+ 'quality': 3,
+ 'version': 1,
+ }
config = self._download_json(
config_url, display_id,
- 'Downloading config JSON',
+ 'Downloading config JSON', query=config_params,
transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1])
mediafile = config['mediafile']
video_url = stream_info.get('url')
if not video_url or video_url in urls:
continue
- urls.add(item_url)
+ urls.add(video_url)
if determine_ext(video_url) == 'm3u8':
formats.extend(self._extract_m3u8_formats(
video_url, video_id, ext='mp4',
import re
from .common import InfoExtractor
-from ..compat import compat_urllib_parse_unquote
+from ..compat import (
+ compat_str,
+ compat_urllib_parse_unquote,
+)
from ..utils import (
ExtractorError,
int_or_none,
+ JSON_LD_RE,
+ NO_DEFAULT,
parse_age_limit,
parse_duration,
+ try_get,
)
}]
+class NRKTVEpisodeIE(InfoExtractor):
+ _VALID_URL = r'https?://tv\.nrk\.no/serie/(?P<id>[^/]+/sesong/\d+/episode/\d+)'
+ _TEST = {
+ 'url': 'https://tv.nrk.no/serie/backstage/sesong/1/episode/8',
+ 'info_dict': {
+ 'id': 'MSUI14000816AA',
+ 'ext': 'mp4',
+ 'title': 'Backstage 8:30',
+ 'description': 'md5:de6ca5d5a2d56849e4021f2bf2850df4',
+ 'duration': 1320,
+ 'series': 'Backstage',
+ 'season_number': 1,
+ 'episode_number': 8,
+ 'episode': '8:30',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, display_id)
+
+ nrk_id = self._parse_json(
+ self._search_regex(JSON_LD_RE, webpage, 'JSON-LD', group='json_ld'),
+ display_id)['@id']
+
+ assert re.match(NRKTVIE._EPISODE_RE, nrk_id)
+ return self.url_result(
+ 'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id)
+
+
+class NRKTVSerieBaseIE(InfoExtractor):
+ def _extract_series(self, webpage, display_id, fatal=True):
+ config = self._parse_json(
+ self._search_regex(
+ r'({.+?})\s*,\s*"[^"]+"\s*\)\s*</script>', webpage, 'config',
+ default='{}' if not fatal else NO_DEFAULT),
+ display_id, fatal=False)
+ if not config:
+ return
+ return try_get(config, lambda x: x['series'], dict)
+
+ def _extract_episodes(self, season):
+ entries = []
+ if not isinstance(season, dict):
+ return entries
+ episodes = season.get('episodes')
+ if not isinstance(episodes, list):
+ return entries
+ for episode in episodes:
+ nrk_id = episode.get('prfId')
+ if not nrk_id or not isinstance(nrk_id, compat_str):
+ continue
+ entries.append(self.url_result(
+ 'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id))
+ return entries
+
+
+class NRKTVSeasonIE(NRKTVSerieBaseIE):
+ _VALID_URL = r'https?://tv\.nrk\.no/serie/[^/]+/sesong/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'https://tv.nrk.no/serie/backstage/sesong/1',
+ 'info_dict': {
+ 'id': '1',
+ 'title': 'Sesong 1',
+ },
+ 'playlist_mincount': 30,
+ }
+
+ @classmethod
+ def suitable(cls, url):
+ return (False if NRKTVIE.suitable(url) or NRKTVEpisodeIE.suitable(url)
+ else super(NRKTVSeasonIE, cls).suitable(url))
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, display_id)
+
+ series = self._extract_series(webpage, display_id)
+
+ season = next(
+ s for s in series['seasons']
+ if int(display_id) == s.get('seasonNumber'))
+
+ title = try_get(season, lambda x: x['titles']['title'], compat_str)
+ return self.playlist_result(
+ self._extract_episodes(season), display_id, title)
+
+
+class NRKTVSeriesIE(NRKTVSerieBaseIE):
+ _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P<id>[^/]+)'
+ _ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P<id>\d+)'
+ _TESTS = [{
+ # new layout
+ 'url': 'https://tv.nrk.no/serie/backstage',
+ 'info_dict': {
+ 'id': 'backstage',
+ 'title': 'Backstage',
+ 'description': 'md5:c3ec3a35736fca0f9e1207b5511143d3',
+ },
+ 'playlist_mincount': 60,
+ }, {
+ # old layout
+ 'url': 'https://tv.nrk.no/serie/groenn-glede',
+ 'info_dict': {
+ 'id': 'groenn-glede',
+ 'title': 'Grønn glede',
+ 'description': 'md5:7576e92ae7f65da6993cf90ee29e4608',
+ },
+ 'playlist_mincount': 9,
+ }, {
+ 'url': 'http://tv.nrksuper.no/serie/labyrint',
+ 'info_dict': {
+ 'id': 'labyrint',
+ 'title': 'Labyrint',
+ 'description': 'md5:58afd450974c89e27d5a19212eee7115',
+ },
+ 'playlist_mincount': 3,
+ }, {
+ 'url': 'https://tv.nrk.no/serie/broedrene-dal-og-spektralsteinene',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://tv.nrk.no/serie/saving-the-human-race',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://tv.nrk.no/serie/postmann-pat',
+ 'only_matching': True,
+ }]
+
+ @classmethod
+ def suitable(cls, url):
+ return (
+ False if any(ie.suitable(url)
+ for ie in (NRKTVIE, NRKTVEpisodeIE, NRKTVSeasonIE))
+ else super(NRKTVSeriesIE, cls).suitable(url))
+
+ def _real_extract(self, url):
+ series_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, series_id)
+
+ # New layout (e.g. https://tv.nrk.no/serie/backstage)
+ series = self._extract_series(webpage, series_id, fatal=False)
+ if series:
+ title = try_get(series, lambda x: x['titles']['title'], compat_str)
+ description = try_get(
+ series, lambda x: x['titles']['subtitle'], compat_str)
+ entries = []
+ for season in series['seasons']:
+ entries.extend(self._extract_episodes(season))
+ return self.playlist_result(entries, series_id, title, description)
+
+ # Old layout (e.g. https://tv.nrk.no/serie/groenn-glede)
+ entries = [
+ self.url_result(
+ 'https://tv.nrk.no/program/Episodes/{series}/{season}'.format(
+ series=series_id, season=season_id))
+ for season_id in re.findall(self._ITEM_RE, webpage)
+ ]
+
+ title = self._html_search_meta(
+ 'seriestitle', webpage,
+ 'title', default=None) or self._og_search_title(
+ webpage, fatal=False)
+
+ description = self._html_search_meta(
+ 'series_description', webpage,
+ 'description', default=None) or self._og_search_description(webpage)
+
+ return self.playlist_result(entries, series_id, title, description)
+
+
class NRKTVDirekteIE(NRKTVIE):
IE_DESC = 'NRK TV Direkte and NRK Radio Direkte'
_VALID_URL = r'https?://(?:tv|radio)\.nrk\.no/direkte/(?P<id>[^/?#&]+)'
r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False)
-class NRKTVSeriesIE(InfoExtractor):
- _VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P<id>[^/]+)'
- _ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P<id>\d+)'
- _TESTS = [{
- 'url': 'https://tv.nrk.no/serie/groenn-glede',
- 'info_dict': {
- 'id': 'groenn-glede',
- 'title': 'Grønn glede',
- 'description': 'md5:7576e92ae7f65da6993cf90ee29e4608',
- },
- 'playlist_mincount': 9,
- }, {
- 'url': 'http://tv.nrksuper.no/serie/labyrint',
- 'info_dict': {
- 'id': 'labyrint',
- 'title': 'Labyrint',
- 'description': 'md5:58afd450974c89e27d5a19212eee7115',
- },
- 'playlist_mincount': 3,
- }, {
- 'url': 'https://tv.nrk.no/serie/broedrene-dal-og-spektralsteinene',
- 'only_matching': True,
- }, {
- 'url': 'https://tv.nrk.no/serie/saving-the-human-race',
- 'only_matching': True,
- }, {
- 'url': 'https://tv.nrk.no/serie/postmann-pat',
- 'only_matching': True,
- }]
-
- @classmethod
- def suitable(cls, url):
- return False if NRKTVIE.suitable(url) else super(NRKTVSeriesIE, cls).suitable(url)
-
- def _real_extract(self, url):
- series_id = self._match_id(url)
-
- webpage = self._download_webpage(url, series_id)
-
- entries = [
- self.url_result(
- 'https://tv.nrk.no/program/Episodes/{series}/{season}'.format(
- series=series_id, season=season_id))
- for season_id in re.findall(self._ITEM_RE, webpage)
- ]
-
- title = self._html_search_meta(
- 'seriestitle', webpage,
- 'title', default=None) or self._og_search_title(
- webpage, fatal=False)
-
- description = self._html_search_meta(
- 'series_description', webpage,
- 'description', default=None) or self._og_search_description(webpage)
-
- return self.playlist_result(entries, series_id, title, description)
-
-
class NRKSkoleIE(InfoExtractor):
IE_DESC = 'NRK Skole'
_VALID_URL = r'https?://(?:www\.)?nrk\.no/skole/?\?.*\bmediaId=(?P<id>\d+)'
strip_jsonp,
strip_or_none,
unified_strdate,
+ url_or_none,
US_RATINGS,
)
if redirect_url and redirect_url not in redirect_urls:
redirects.append(redirect)
redirect_urls.add(redirect_url)
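+ # Player info may also list plain 'encodings' URLs; treat them as additional redirect candidates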
+ encodings = info.get('encodings')
+ if isinstance(encodings, list):
+ for encoding in encodings:
+ encoding_url = url_or_none(encoding)
+ if encoding_url and encoding_url not in redirect_urls:
+ redirects.append({'url': encoding_url})
+ redirect_urls.add(encoding_url)
chapters = []
# Player pages may also serve different qualities
parse_resolution,
try_get,
unified_timestamp,
+ url_or_none,
urljoin,
)
videos\.tcit\.fr|
peertube\.cpy\.re
)'''
+ _UUID_RE = r'[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}'
_VALID_URL = r'''(?x)
- https?://
- %s
- /(?:videos/(?:watch|embed)|api/v\d/videos)/
- (?P<id>[^/?\#&]+)
- ''' % _INSTANCES_RE
+ (?:
+ peertube:(?P<host>[^:]+):|
+ https?://(?P<host_2>%s)/(?:videos/(?:watch|embed)|api/v\d/videos)/
+ )
+ (?P<id>%s)
+ ''' % (_INSTANCES_RE, _UUID_RE)
_TESTS = [{
'url': 'https://peertube.moe/videos/watch/2790feb0-8120-4e63-9af3-c943c69f5e6c',
'md5': '80f24ff364cc9d333529506a263e7feb',
}, {
'url': 'https://tube.openalgeria.org/api/v1/videos/c1875674-97d0-4c94-a058-3f7e64c962e8',
'only_matching': True,
+ }, {
+ 'url': 'peertube:video.blender.org:b37a5b9f-e6b5-415c-b700-04a5cd6ec205',
+ 'only_matching': True,
}]
@staticmethod
- def _extract_urls(webpage):
- return [
- mobj.group('url')
- for mobj in re.finditer(
- r'''(?x)<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//%s/videos/embed/[^/?\#&]+)\1'''
- % PeerTubeIE._INSTANCES_RE, webpage)]
+ def _extract_peertube_url(webpage, source_url):
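+ # Recognize arbitrary PeerTube instances missing from _INSTANCES_RE by fingerprinting well-known PeerTube markers in the webpage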
+ mobj = re.match(
+ r'https?://(?P<host>[^/]+)/videos/watch/(?P<id>%s)'
+ % PeerTubeIE._UUID_RE, source_url)
+ if mobj and any(p in webpage for p in (
+ '<title>PeerTube<',
+ 'There will be other non JS-based clients to access PeerTube',
+ '>We are sorry but it seems that PeerTube is not compatible with your web browser.<')):
+ return 'peertube:%s:%s' % mobj.group('host', 'id')
+
+ @staticmethod
+ def _extract_urls(webpage, source_url):
+ entries = re.findall(
+ r'''(?x)<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//%s/videos/embed/%s)'''
+ % (PeerTubeIE._INSTANCES_RE, PeerTubeIE._UUID_RE), webpage)
+ if not entries:
+ peertube_url = PeerTubeIE._extract_peertube_url(webpage, source_url)
+ if peertube_url:
+ entries = [peertube_url]
+ return entries
def _real_extract(self, url):
- video_id = self._match_id(url)
+ mobj = re.match(self._VALID_URL, url)
+ host = mobj.group('host') or mobj.group('host_2')
+ video_id = mobj.group('id')
video = self._download_json(
- urljoin(url, '/api/v1/videos/%s' % video_id), video_id)
+ 'https://%s/api/v1/videos/%s' % (host, video_id), video_id)
title = video['name']
for file_ in video['files']:
if not isinstance(file_, dict):
continue
- file_url = file_.get('fileUrl')
- if not file_url or not isinstance(file_url, compat_str):
+ file_url = url_or_none(file_.get('fileUrl'))
+ if not file_url:
continue
file_size = int_or_none(file_.get('size'))
format_id = try_get(
class PluralsightBaseIE(InfoExtractor):
_API_BASE = 'https://app.pluralsight.com'
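+ # The player/functions/rpc endpoint has been superseded by a GraphQL API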
+ _GRAPHQL_EP = '%s/player/api/graphql' % _API_BASE
+ _GRAPHQL_HEADERS = {
+ 'Content-Type': 'application/json;charset=UTF-8',
+ }
+ _GRAPHQL_COURSE_TMPL = '''
+query BootstrapPlayer {
+ rpc {
+ bootstrapPlayer {
+ profile {
+ firstName
+ lastName
+ email
+ username
+ userHandle
+ authed
+ isAuthed
+ plan
+ }
+ course(courseId: "%s") {
+ name
+ title
+ courseHasCaptions
+ translationLanguages {
+ code
+ name
+ }
+ supportsWideScreenVideoFormats
+ timestamp
+ modules {
+ name
+ title
+ duration
+ formattedDuration
+ author
+ authorized
+ clips {
+ authorized
+ clipId
+ duration
+ formattedDuration
+ id
+ index
+ moduleIndex
+ moduleTitle
+ name
+ title
+ watched
+ }
+ }
+ }
+ }
+ }
+}'''
+
def _download_course(self, course_id, url, display_id):
try:
return self._download_course_rpc(course_id, url, display_id)
def _download_course_rpc(self, course_id, url, display_id):
response = self._download_json(
- '%s/player/functions/rpc' % self._API_BASE, display_id,
- 'Downloading course JSON',
- data=json.dumps({
- 'fn': 'bootstrapPlayer',
- 'payload': {
- 'courseId': course_id,
- },
- }).encode('utf-8'),
- headers={
- 'Content-Type': 'application/json;charset=utf-8',
- 'Referer': url,
- })
-
- course = try_get(response, lambda x: x['payload']['course'], dict)
+ self._GRAPHQL_EP, display_id, data=json.dumps({
+ 'query': self._GRAPHQL_COURSE_TMPL % course_id,
+ 'variables': {}
+ }).encode('utf-8'), headers=self._GRAPHQL_HEADERS)
+
+ course = try_get(
+ response, lambda x: x['data']['rpc']['bootstrapPlayer']['course'],
+ dict)
if course:
return course
'only_matching': True,
}]
+ GRAPHQL_VIEWCLIP_TMPL = '''
+query viewClip {
+ viewClip(input: {
+ author: "%(author)s",
+ clipIndex: %(clipIndex)d,
+ courseName: "%(courseName)s",
+ includeCaptions: %(includeCaptions)s,
+ locale: "%(locale)s",
+ mediaType: "%(mediaType)s",
+ moduleName: "%(moduleName)s",
+ quality: "%(quality)s"
+ }) {
+ urls {
+ url
+ cdn
+ rank
+ source
+ },
+ status
+ }
+}'''
+
def _real_initialize(self):
self._login()
f = QUALITIES[quality].copy()
clip_post = {
'author': author,
- 'includeCaptions': False,
+ 'includeCaptions': 'false',
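+ # a lowercase string rather than a Python bool: the value is interpolated verbatim into the GraphQL query template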
'clipIndex': int(clip_idx),
'courseName': course_name,
'locale': 'en',
'quality': '%dx%d' % (f['width'], f['height']),
}
format_id = '%s-%s' % (ext, quality)
- viewclip = self._download_json(
- '%s/video/clips/viewclip' % self._API_BASE, display_id,
- 'Downloading %s viewclip JSON' % format_id, fatal=False,
- data=json.dumps(clip_post).encode('utf-8'),
- headers={'Content-Type': 'application/json;charset=utf-8'})
+
+ try:
+ viewclip = self._download_json(
+ self._GRAPHQL_EP, display_id,
+ 'Downloading %s viewclip graphql' % format_id,
+ data=json.dumps({
+ 'query': self.GRAPHQL_VIEWCLIP_TMPL % clip_post,
+ 'variables': {}
+ }).encode('utf-8'),
+ headers=self._GRAPHQL_HEADERS)['data']['viewClip']
+ except ExtractorError:
+ # Still works but most likely will go soon
+ viewclip = self._download_json(
+ '%s/video/clips/viewclip' % self._API_BASE, display_id,
+ 'Downloading %s viewclip JSON' % format_id, fatal=False,
+ data=json.dumps(clip_post).encode('utf-8'),
+ headers={'Content-Type': 'application/json;charset=utf-8'})
# Pluralsight tracks multiple sequential calls to ViewClip API and start
# to return 429 HTTP errors after some time (see
config = self._parse_json(
self._search_regex(
- r'=\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*=',
+ (r'=\s*({.+?})\s*;\s*v1ar\b',
+ r'=\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*='),
webpage, 'config', default='{}'),
display_id, transform_source=js_to_json, fatal=False)
'height': int(height),
'filesize_approx': parse_filesize(filesize),
} for format_url, height, filesize in re.findall(
- r'<a[^>]+href="(/download/[^"]+)">MPEG4 (\d+)p<span[^>]*>(\d+\s+[a-zA-Z]+)<',
+ r'<a[^>]+href="(/download/[^"]+)">[^<]*?(\d+)p<span[^>]*>(\d+\s*[a-zA-Z]+)<',
webpage)]
thumbnail = None
duration = None
import functools
import itertools
import operator
-# import os
import re
from .common import InfoExtractor
from ..compat import (
compat_HTTPError,
- # compat_urllib_parse_unquote,
- # compat_urllib_parse_unquote_plus,
- # compat_urllib_parse_urlparse,
+ compat_str,
)
from ..utils import (
ExtractorError,
int_or_none,
js_to_json,
orderedSet,
- # sanitized_Request,
remove_quotes,
str_to_int,
+ url_or_none,
)
-# from ..aes import (
-# aes_decrypt_text
-# )
class PornHubIE(InfoExtractor):
'id': '1331683002',
'ext': 'mp4',
'title': '重庆婷婷女王足交',
- 'uploader': 'cj397186295',
+ 'uploader': 'Unknown',
'duration': 1753,
'view_count': int,
'like_count': int,
'params': {
'skip_download': True,
},
+ }, {
+ # subtitles
+ 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7',
+ 'info_dict': {
+ 'id': 'ph5af5fef7c2aa7',
+ 'ext': 'mp4',
+ 'title': 'BFFS - Cute Teen Girls Share Cock On the Floor',
+ 'uploader': 'BFFs',
+ 'duration': 622,
+ 'view_count': int,
+ 'like_count': int,
+ 'dislike_count': int,
+ 'comment_count': int,
+ 'age_limit': 18,
+ 'tags': list,
+ 'categories': list,
+ 'subtitles': {
+ 'en': [{
+ "ext": 'srt'
+ }]
+ },
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}, {
'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
'only_matching': True,
self._set_cookie('pornhub.com', 'platform', platform)
return self._download_webpage(
'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id,
- video_id)
+ video_id, 'Downloading %s webpage' % platform)
webpage = dl_webpage('pc')
'PornHub said: %s' % error_msg,
expected=True, video_id=video_id)
- tv_webpage = dl_webpage('tv')
-
- assignments = self._search_regex(
- r'(var.+?mediastring.+?)</script>', tv_webpage,
- 'encoded url').split(';')
-
- js_vars = {}
-
- def parse_js_value(inp):
- inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)
- if '+' in inp:
- inps = inp.split('+')
- return functools.reduce(
- operator.concat, map(parse_js_value, inps))
- inp = inp.strip()
- if inp in js_vars:
- return js_vars[inp]
- return remove_quotes(inp)
-
- for assn in assignments:
- assn = assn.strip()
- if not assn:
- continue
- assn = re.sub(r'var\s+', '', assn)
- vname, value = assn.split('=', 1)
- js_vars[vname] = parse_js_value(value)
-
- video_url = js_vars['mediastring']
-
- title = self._search_regex(
- r'<h1>([^>]+)</h1>', tv_webpage, 'title', default=None)
-
# video_title from flashvars contains whitespace instead of non-ASCII (see
# http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying
# on that anymore.
- title = title or self._html_search_meta(
+ title = self._html_search_meta(
'twitter:title', webpage, default=None) or self._search_regex(
(r'<h1[^>]+class=["\']title["\'][^>]*>(?P<title>[^<]+)',
r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1',
r'shareTitle\s*=\s*(["\'])(?P<title>.+?)\1'),
webpage, 'title', group='title')
+ video_urls = []
+ video_urls_set = set()
+ subtitles = {}
+
flashvars = self._parse_json(
self._search_regex(
r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'),
video_id)
if flashvars:
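+ # flashvars may reference a closed captions file; expose it as English SRT subtitles.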
+ subtitle_url = url_or_none(flashvars.get('closedCaptionsFile'))
+ if subtitle_url:
+ subtitles.setdefault('en', []).append({
+ 'url': subtitle_url,
+ 'ext': 'srt',
+ })
thumbnail = flashvars.get('image_url')
duration = int_or_none(flashvars.get('video_duration'))
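+ # Newer players list per-quality stream URLs in mediaDefinitions; collect them, skipping duplicates.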
+ media_definitions = flashvars.get('mediaDefinitions')
+ if isinstance(media_definitions, list):
+ for definition in media_definitions:
+ if not isinstance(definition, dict):
+ continue
+ video_url = definition.get('videoUrl')
+ if not video_url or not isinstance(video_url, compat_str):
+ continue
+ if video_url in video_urls_set:
+ continue
+ video_urls_set.add(video_url)
+ video_urls.append(
+ (video_url, int_or_none(definition.get('quality'))))
else:
- title, thumbnail, duration = [None] * 3
+ thumbnail, duration = [None] * 2
+
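+ # Resort to decoding the TV page's obfuscated mediastring only when no URLs were found above.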
+ if not video_urls:
+ tv_webpage = dl_webpage('tv')
+
+ assignments = self._search_regex(
+ r'(var.+?mediastring.+?)</script>', tv_webpage,
+ 'encoded url').split(';')
+
+ js_vars = {}
+
+ def parse_js_value(inp):
+ inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)
+ if '+' in inp:
+ inps = inp.split('+')
+ return functools.reduce(
+ operator.concat, map(parse_js_value, inps))
+ inp = inp.strip()
+ if inp in js_vars:
+ return js_vars[inp]
+ return remove_quotes(inp)
+
+ for assn in assignments:
+ assn = assn.strip()
+ if not assn:
+ continue
+ assn = re.sub(r'var\s+', '', assn)
+ vname, value = assn.split('=', 1)
+ js_vars[vname] = parse_js_value(value)
+
+ video_url = js_vars['mediastring']
+ if video_url not in video_urls_set:
+ video_urls.append((video_url, None))
+ video_urls_set.add(video_url)
+
+ for mobj in re.finditer(
+ r'<a[^>]+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P<url>(?:(?!\1).)+)\1',
+ webpage):
+ video_url = mobj.group('url')
+ if video_url not in video_urls_set:
+ video_urls.append((video_url, None))
+ video_urls_set.add(video_url)
+
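+ # Height and bitrate are often encoded in the URL itself (e.g. ...720P_4000K...); parse them when present.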
+ formats = []
+ for video_url, height in video_urls:
+ tbr = None
+ mobj = re.search(r'(?P<height>\d+)[pP]?_(?P<tbr>\d+)[kK]', video_url)
+ if mobj:
+ if not height:
+ height = int(mobj.group('height'))
+ tbr = int(mobj.group('tbr'))
+ formats.append({
+ 'url': video_url,
+ 'format_id': '%dp' % height if height else None,
+ 'height': height,
+ 'tbr': tbr,
+ })
+ self._sort_formats(formats)
video_uploader = self._html_search_regex(
- r'(?s)From: .+?<(?:a\b[^>]+\bhref=["\']/(?:user|channel)s/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<',
+ r'(?s)From: .+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<',
webpage, 'uploader', fatal=False)
view_count = self._extract_count(
return {
'id': video_id,
- 'url': video_url,
'uploader': video_uploader,
'title': title,
'thumbnail': thumbnail,
'like_count': like_count,
'dislike_count': dislike_count,
'comment_count': comment_count,
- # 'formats': formats,
+ 'formats': formats,
'age_limit': 18,
'tags': tags,
'categories': categories,
+ 'subtitles': subtitles,
}
class PornHubUserVideosIE(PornHubPlaylistBaseIE):
- _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.com/(?:user|channel)s/(?P<id>[^/]+)/videos'
+ _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.com/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos'
_TESTS = [{
'url': 'http://www.pornhub.com/users/zoe_ph/videos/public',
'info_dict': {
}, {
'url': 'http://www.pornhub.com/users/zoe_ph/videos/public',
'only_matching': True,
+ }, {
+ 'url': 'https://www.pornhub.com/model/jayndrea/videos/upload',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload',
+ 'only_matching': True,
}]
def _real_extract(self, url):
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_HTTPError,
+ compat_str,
+)
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ float_or_none,
+ parse_resolution,
+ str_or_none,
+ try_get,
+ unified_timestamp,
+ url_or_none,
+ urljoin,
+)
+
+
+class PuhuTVIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?puhutv\.com/(?P<id>[^/?#&]+)-izle'
+ IE_NAME = 'puhutv'
+ _TESTS = [{
+ # film
+ 'url': 'https://puhutv.com/sut-kardesler-izle',
+ 'md5': 'fbd8f2d8e7681f8bcd51b592475a6ae7',
+ 'info_dict': {
+ 'id': '5085',
+ 'display_id': 'sut-kardesler',
+ 'ext': 'mp4',
+ 'title': 'Süt Kardeşler',
+ 'description': 'md5:405fd024df916ca16731114eb18e511a',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 4832.44,
+ 'creator': 'Arzu Film',
+ 'timestamp': 1469778212,
+ 'upload_date': '20160729',
+ 'release_year': 1976,
+ 'view_count': int,
+ 'tags': ['Aile', 'Komedi', 'Klasikler'],
+ },
+ }, {
+ # episode, geo restricted, bypassable with --geo-verification-proxy
+ 'url': 'https://puhutv.com/jet-sosyete-1-bolum-izle',
+ 'only_matching': True,
+ }, {
+ # 4k, with subtitles
+ 'url': 'https://puhutv.com/dip-1-bolum-izle',
+ 'only_matching': True,
+ }]
+ _SUBTITLE_LANGS = {
+ 'English': 'en',
+ 'Deutsch': 'de',
+ 'عربى': 'ar'
+ }
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+
+ info = self._download_json(
+ urljoin(url, '/api/slug/%s-izle' % display_id),
+ display_id)['data']
+
+ video_id = compat_str(info['id'])
+ title = info.get('name') or info['title']['name']
+ if info.get('display_name'):
+ title = '%s %s' % (title, info.get('display_name'))
+
+ try:
+ videos = self._download_json(
+ 'https://puhutv.com/api/assets/%s/videos' % video_id,
+ display_id, 'Downloading video JSON',
+ headers=self.geo_verification_headers())
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+ self.raise_geo_restricted()
+ raise
+
+ formats = []
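+ # A playlist-type HLS entry is a master playlist that expands into several
+ # formats; other entries each map to a single progressive or native-HLS format.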
+ for video in videos['data']['videos']:
+ media_url = url_or_none(video.get('url'))
+ if not media_url:
+ continue
+ playlist = video.get('is_playlist')
+ if video.get('stream_type') == 'hls' and playlist is True:
+ formats.extend(self._extract_m3u8_formats(
+ media_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False))
+ continue
+ quality = int_or_none(video.get('quality'))
+ f = {
+ 'url': media_url,
+ 'ext': 'mp4',
+ 'height': quality
+ }
+ video_format = video.get('video_format')
+ if video_format == 'hls' and playlist is False:
+ format_id = 'hls'
+ f['protocol'] = 'm3u8_native'
+ elif video_format == 'mp4':
+ format_id = 'http'
+ else:
+ continue
+ if quality:
+ format_id += '-%sp' % quality
+ f['format_id'] = format_id
+ formats.append(f)
+ self._sort_formats(formats)
+
+ description = try_get(
+ info, lambda x: x['title']['description'],
+ compat_str) or info.get('description')
+ timestamp = unified_timestamp(info.get('created_at'))
+ creator = try_get(
+ info, lambda x: x['title']['producer']['name'], compat_str)
+
+ duration = float_or_none(
+ try_get(info, lambda x: x['content']['duration_in_ms'], int),
+ scale=1000)
+ view_count = try_get(info, lambda x: x['content']['watch_count'], int)
+
+ images = try_get(
+ info, lambda x: x['content']['images']['wide'], dict) or {}
+ thumbnails = []
+ for image_id, image_url in images.items():
+ if not isinstance(image_url, compat_str):
+ continue
+ if not image_url.startswith(('http', '//')):
+ image_url = 'https://%s' % image_url
+ t = parse_resolution(image_id)
+ t.update({
+ 'id': image_id,
+ 'url': image_url
+ })
+ thumbnails.append(t)
+
+ release_year = try_get(info, lambda x: x['title']['released_at'], int)
+
+ season_number = int_or_none(info.get('season_number'))
+ season_id = str_or_none(info.get('season_id'))
+ episode_number = int_or_none(info.get('episode_number'))
+
+ tags = []
+ for genre in try_get(info, lambda x: x['title']['genres'], list) or []:
+ if not isinstance(genre, dict):
+ continue
+ genre_name = genre.get('name')
+ if genre_name and isinstance(genre_name, compat_str):
+ tags.append(genre_name)
+
+ subtitles = {}
+ for subtitle in try_get(
+ info, lambda x: x['content']['subtitles'], list) or []:
+ if not isinstance(subtitle, dict):
+ continue
+ lang = subtitle.get('language')
+ sub_url = url_or_none(subtitle.get('url'))
+ if not lang or not isinstance(lang, compat_str) or not sub_url:
+ continue
+ subtitles[self._SUBTITLE_LANGS.get(lang, lang)] = [{
+ 'url': sub_url
+ }]
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'season_id': season_id,
+ 'season_number': season_number,
+ 'episode_number': episode_number,
+ 'release_year': release_year,
+ 'timestamp': timestamp,
+ 'creator': creator,
+ 'view_count': view_count,
+ 'duration': duration,
+ 'tags': tags,
+ 'subtitles': subtitles,
+ 'thumbnails': thumbnails,
+ 'formats': formats
+ }
+
+
+class PuhuTVSerieIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?puhutv\.com/(?P<id>[^/?#&]+)-detay'
+ IE_NAME = 'puhutv:serie'
+ _TESTS = [{
+ 'url': 'https://puhutv.com/deniz-yildizi-detay',
+ 'info_dict': {
+ 'title': 'Deniz Yıldızı',
+ 'id': 'deniz-yildizi',
+ },
+ 'playlist_mincount': 205,
+ }, {
+ # a film detail page that uses the same URL pattern as a serie page
+ 'url': 'https://puhutv.com/kaybedenler-kulubu-detay',
+ 'only_matching': True,
+ }]
+
+ def _extract_entries(self, seasons):
+ for season in seasons:
+ season_id = season.get('id')
+ if not season_id:
+ continue
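+ # Fetch the season's episodes 40 at a time until the API reports hasMore is false.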
+ page = 1
+ has_more = True
+ while has_more is True:
+ season = self._download_json(
+ 'https://galadriel.puhutv.com/seasons/%s' % season_id,
+ season_id, 'Downloading page %s' % page, query={
+ 'page': page,
+ 'per': 40,
+ })
+ episodes = season.get('episodes')
+ if isinstance(episodes, list):
+ for ep in episodes:
+ slug_path = str_or_none(ep.get('slugPath'))
+ if not slug_path:
+ continue
+ video_id = str_or_none(int_or_none(ep.get('id')))
+ yield self.url_result(
+ 'https://puhutv.com/%s' % slug_path,
+ ie=PuhuTVIE.ie_key(), video_id=video_id,
+ video_title=ep.get('name') or ep.get('eventLabel'))
+ page += 1
+ has_more = season.get('hasMore')
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+
+ info = self._download_json(
+ urljoin(url, '/api/slug/%s-detay' % playlist_id),
+ playlist_id)['data']
+
+ seasons = info.get('seasons')
+ if seasons:
+ return self.playlist_result(
+ self._extract_entries(seasons), playlist_id, info.get('name'))
+
+ # Films use the same URL pattern as series
+ video_id = info.get('slug') or info['assets'][0]['slug']
+ return self.url_result(
+ 'https://puhutv.com/%s-izle' % video_id,
+ PuhuTVIE.ie_key(), video_id)
from .common import InfoExtractor
from ..utils import (
- unified_strdate,
+ parse_resolution,
str_to_int,
+ unified_strdate,
+ urlencode_postdata,
+ urljoin,
)
def _real_extract(self, url):
video_id = self._match_id(url)
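+ # Ask the site which media host serves this video; fall back to the default host if none is returned.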
+ download_host = self._download_json(
+ 'https://www.radiojavan.com/videos/video_host', video_id,
+ data=urlencode_postdata({'id': video_id}),
+ headers={
+ 'Content-Type': 'application/x-www-form-urlencoded',
+ 'Referer': url,
+ }).get('host', 'https://host1.rjmusicmedia.com')
+
webpage = self._download_webpage(url, video_id)
- formats = [{
- 'url': 'https://media.rdjavan.com/media/music_video/%s' % video_path,
- 'format_id': '%sp' % height,
- 'height': int(height),
- } for height, video_path in re.findall(r"RJ\.video(\d+)p\s*=\s*'/?([^']+)'", webpage)]
+ formats = []
+ for format_id, _, video_path in re.findall(
+ r'RJ\.video(?P<format_id>\d+[pPkK])\s*=\s*(["\'])(?P<url>(?:(?!\2).)+)\2',
+ webpage):
+ f = parse_resolution(format_id)
+ f.update({
+ 'url': urljoin(download_host, video_path),
+ 'format_id': format_id,
+ })
+ formats.append(f)
self._sort_formats(formats)
title = self._og_search_title(webpage)
_GEO_BYPASS = False
def _extract_relinker_info(self, relinker_url, video_id):
+ if not re.match(r'https?://', relinker_url):
+ return {'formats': [{'url': relinker_url}]}
+
formats = []
geoprotection = None
is_live = None
'params': {
'skip_download': True,
},
+ }, {
+ # Direct MMS URL
+ 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-b63a4089-ac28-48cf-bca5-9f5b5bc46df5.html',
+ 'only_matching': True,
}]
def _extract_from_content_id(self, content_id, url):
from .common import InfoExtractor
from .vimeo import VimeoIE
+from ..compat import compat_str
from ..utils import (
- extract_attributes,
ExtractorError,
- smuggle_url,
- unsmuggle_url,
+ int_or_none,
+ merge_dicts,
+ try_get,
+ unescapeHTML,
+ unified_timestamp,
urljoin,
)
class RayWenderlichIE(InfoExtractor):
- _VALID_URL = r'https?://videos\.raywenderlich\.com/courses/(?P<course_id>[^/]+)/lessons/(?P<id>\d+)'
+ _VALID_URL = r'''(?x)
+ https?://
+ (?:
+ videos\.raywenderlich\.com/courses|
+ (?:www\.)?raywenderlich\.com
+ )/
+ (?P<course_id>[^/]+)/lessons/(?P<id>\d+)
+ '''
_TESTS = [{
- 'url': 'https://videos.raywenderlich.com/courses/105-testing-in-ios/lessons/1',
+ 'url': 'https://www.raywenderlich.com/3530-testing-in-ios/lessons/1',
'info_dict': {
'id': '248377018',
'ext': 'mp4',
- 'title': 'Testing In iOS Episode 1: Introduction',
+ 'title': 'Introduction',
+ 'description': 'md5:804d031b3efa9fcb49777d512d74f722',
+ 'timestamp': 1513906277,
+ 'upload_date': '20171222',
'duration': 133,
'uploader': 'Ray Wenderlich',
'uploader_id': 'user3304672',
'expected_warnings': ['HTTP Error 403: Forbidden'],
}, {
'url': 'https://videos.raywenderlich.com/courses/105-testing-in-ios/lessons/1',
+ 'only_matching': True,
+ }]
+
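+ # Find the lesson in the page's data-collection JSON: each group's contents
+ # carry an ordinal (the lesson number) and the matching video identifier.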
+ @staticmethod
+ def _extract_video_id(data, lesson_id):
+ if not data:
+ return
+ groups = try_get(data, lambda x: x['groups'], list) or []
+ if not groups:
+ return
+ for group in groups:
+ if not isinstance(group, dict):
+ continue
+ contents = try_get(group, lambda x: x['contents'], list) or []
+ for content in contents:
+ if not isinstance(content, dict):
+ continue
+ ordinal = int_or_none(content.get('ordinal'))
+ if ordinal != lesson_id:
+ continue
+ video_id = content.get('identifier')
+ if video_id:
+ return compat_str(video_id)
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ course_id, lesson_id = mobj.group('course_id', 'id')
+ display_id = '%s/%s' % (course_id, lesson_id)
+
+ webpage = self._download_webpage(url, display_id)
+
+ thumbnail = self._og_search_thumbnail(
+ webpage, default=None) or self._html_search_meta(
+ 'twitter:image', webpage, 'thumbnail')
+
+ if '>Subscribe to unlock' in webpage:
+ raise ExtractorError(
+ 'This content is only available for subscribers',
+ expected=True)
+
+ info = {
+ 'thumbnail': thumbnail,
+ }
+
+ vimeo_id = self._search_regex(
+ r'data-vimeo-id=["\'](\d+)', webpage, 'vimeo id', default=None)
+
+ if not vimeo_id:
+ data = self._parse_json(
+ self._search_regex(
+ r'data-collection=(["\'])(?P<data>{.+?})\1', webpage,
+ 'data collection', default='{}', group='data'),
+ display_id, transform_source=unescapeHTML, fatal=False)
+ video_id = self._extract_video_id(
+ data, lesson_id) or self._search_regex(
+ r'/videos/(\d+)/', thumbnail, 'video id')
+ headers = {
+ 'Referer': url,
+ 'X-Requested-With': 'XMLHttpRequest',
+ }
+ csrf_token = self._html_search_meta(
+ 'csrf-token', webpage, 'csrf token', default=None)
+ if csrf_token:
+ headers['X-CSRF-Token'] = csrf_token
+ video = self._download_json(
+ 'https://videos.raywenderlich.com/api/v1/videos/%s.json'
+ % video_id, display_id, headers=headers)['video']
+ vimeo_id = video['clips'][0]['provider_id']
+ info.update({
+ '_type': 'url_transparent',
+ 'title': video.get('name'),
+ 'description': video.get('description') or video.get(
+ 'meta_description'),
+ 'duration': int_or_none(video.get('duration')),
+ 'timestamp': unified_timestamp(video.get('created_at')),
+ })
+
+ return merge_dicts(info, self.url_result(
+ VimeoIE._smuggle_referrer(
+ 'https://player.vimeo.com/video/%s' % vimeo_id, url),
+ ie=VimeoIE.ie_key(), video_id=vimeo_id))
+
+
+class RayWenderlichCourseIE(InfoExtractor):
+ _VALID_URL = r'''(?x)
+ https?://
+ (?:
+ videos\.raywenderlich\.com/courses|
+ (?:www\.)?raywenderlich\.com
+ )/
+ (?P<id>[^/]+)
+ '''
+
+ _TEST = {
+ 'url': 'https://www.raywenderlich.com/3530-testing-in-ios',
'info_dict': {
'title': 'Testing in iOS',
- 'id': '105-testing-in-ios',
+ 'id': '3530-testing-in-ios',
},
'params': {
'noplaylist': False,
},
'playlist_count': 29,
- }]
+ }
+
+ @classmethod
+ def suitable(cls, url):
+ return False if RayWenderlichIE.suitable(url) else super(
+ RayWenderlichCourseIE, cls).suitable(url)
def _real_extract(self, url):
- url, smuggled_data = unsmuggle_url(url, {})
+ course_id = self._match_id(url)
- mobj = re.match(self._VALID_URL, url)
- course_id, lesson_id = mobj.group('course_id', 'id')
- video_id = '%s/%s' % (course_id, lesson_id)
-
- webpage = self._download_webpage(url, video_id)
-
- no_playlist = self._downloader.params.get('noplaylist')
- if no_playlist or smuggled_data.get('force_video', False):
- if no_playlist:
- self.to_screen(
- 'Downloading just video %s because of --no-playlist'
- % video_id)
- if '>Subscribe to unlock' in webpage:
- raise ExtractorError(
- 'This content is only available for subscribers',
- expected=True)
- vimeo_id = self._search_regex(
- r'data-vimeo-id=["\'](\d+)', webpage, 'video id')
- return self.url_result(
- VimeoIE._smuggle_referrer(
- 'https://player.vimeo.com/video/%s' % vimeo_id, url),
- ie=VimeoIE.ie_key(), video_id=vimeo_id)
-
- self.to_screen(
- 'Downloading playlist %s - add --no-playlist to just download video'
- % course_id)
-
- lesson_ids = set((lesson_id, ))
- for lesson in re.findall(
- r'(<a[^>]+\bclass=["\']lesson-link[^>]+>)', webpage):
- attrs = extract_attributes(lesson)
- if not attrs:
- continue
- lesson_url = attrs.get('href')
- if not lesson_url:
- continue
- lesson_id = self._search_regex(
- r'/lessons/(\d+)', lesson_url, 'lesson id', default=None)
- if not lesson_id:
- continue
- lesson_ids.add(lesson_id)
+ webpage = self._download_webpage(url, course_id)
entries = []
- for lesson_id in sorted(lesson_ids):
+ lesson_urls = set()
+ for lesson_url in re.findall(
+ r'<a[^>]+\bhref=["\'](/%s/lessons/\d+)' % course_id, webpage):
+ if lesson_url in lesson_urls:
+ continue
+ lesson_urls.add(lesson_url)
entries.append(self.url_result(
- smuggle_url(urljoin(url, lesson_id), {'force_video': True}),
- ie=RayWenderlichIE.ie_key()))
+ urljoin(url, lesson_url), ie=RayWenderlichIE.ie_key()))
- title = self._search_regex(
- r'class=["\']course-title[^>]+>([^<]+)', webpage, 'course title',
- default=None)
+ title = self._og_search_title(
+ webpage, default=None) or self._html_search_meta(
+ 'twitter:title', webpage, 'title', default=None)
return self.playlist_result(entries, course_id, title)
class RedBullTVIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?redbull\.tv/video/(?P<id>AP-\w+)'
+ _VALID_URL = r'https?://(?:www\.)?redbull(?:\.tv|\.com/(?:[^/]+/)?tv)/video/(?P<id>AP-\w+)'
_TESTS = [{
# film
'url': 'https://www.redbull.tv/video/AP-1Q6XCDTAN1W11',
'params': {
'skip_download': True,
},
+ }, {
+ 'url': 'https://www.redbull.com/int-en/tv/video/AP-1UWHCAR9S1W11/rob-meets-sam-gaze?playlist=playlists::3f81040a-2f31-4832-8e2e-545b1d39d173',
+ 'only_matching': True,
}]
def _real_extract(self, url):
import re
from .common import InfoExtractor
-from ..compat import compat_str
from ..utils import (
ExtractorError,
int_or_none,
str_to_int,
unified_strdate,
+ url_or_none,
)
video_id, fatal=False)
if medias and isinstance(medias, list):
for media in medias:
- format_url = media.get('videoUrl')
- if not format_url or not isinstance(format_url, compat_str):
+ format_url = url_or_none(media.get('videoUrl'))
+ if not format_url:
continue
format_id = media.get('quality')
formats.append({
from ..utils import (
determine_ext,
int_or_none,
+ url_or_none,
)
title = config['title']
formats = []
for video in config['src']:
- src = video.get('src')
- if not src or not isinstance(src, compat_str):
+ src = url_or_none(video.get('src'))
+ if not src:
continue
ext = determine_ext(src)
if ext == 'm3u8':
http_url = data.get('url')
if formats and http_url and re.search(height_re, http_url):
http_url = fix_url(http_url)
- for m3u8_f in formats.copy():
+ for m3u8_f in formats[:]:
height = m3u8_f.get('height')
if not height:
continue
int_or_none,
try_get,
unified_timestamp,
+ url_or_none,
)
break
for result in results:
- video_url = result.get('video_url')
- if not video_url or not isinstance(video_url, compat_str):
+ video_url = url_or_none(result.get('video_url'))
+ if not video_url:
continue
entry = self._extract_video(result, require_title=False)
entry.update({
description = info.get('description') or self._og_search_description(webpage)
return self.playlist_result([
- self.url_result(url, ie=SeznamZpravyIE.ie_key())
- for url in SeznamZpravyIE._extract_urls(webpage)],
+ self.url_result(entry_url, ie=SeznamZpravyIE.ie_key())
+ for entry_url in SeznamZpravyIE._extract_urls(webpage)],
article_id, title, description)
class SixPlayIE(InfoExtractor):
IE_NAME = '6play'
- _VALID_URL = r'(?:6play:|https?://(?:www\.)?(?P<domain>6play\.fr|rtlplay.be)/.+?-c_)(?P<id>[0-9]+)'
+ _VALID_URL = r'(?:6play:|https?://(?:www\.)?(?P<domain>6play\.fr|rtlplay\.be|play\.rtl\.hr)/.+?-c_)(?P<id>[0-9]+)'
_TESTS = [{
'url': 'https://www.6play.fr/minute-par-minute-p_9533/le-but-qui-a-marque-lhistoire-du-football-francais-c_12041051',
'md5': '31fcd112637baa0c2ab92c4fcd8baf27',
}, {
'url': 'https://www.rtlplay.be/rtl-info-13h-p_8551/les-titres-du-rtlinfo-13h-c_12045869',
'only_matching': True,
+ }, {
+ 'url': 'https://play.rtl.hr/pj-masks-p_9455/epizoda-34-sezona-1-catboyevo-cudo-na-dva-kotaca-c_11984989',
+ 'only_matching': True,
}]
def _real_extract(self, url):
service, consumer_name = {
'6play.fr': ('6play', 'm6web'),
'rtlplay.be': ('rtlbe_rtl_play', 'rtlbe'),
+ 'play.rtl.hr': ('rtlhr_rtl_play', 'rtlhr'),
}.get(domain, ('6play', 'm6web'))
data = self._download_json(
if container == 'm3u8' or ext == 'm3u8':
if protocol == 'usp':
if compat_parse_qs(compat_urllib_parse_urlparse(asset_url).query).get('token', [None])[0]:
- urlh = self._request_webpage(asset_url, video_id, fatal=False)
+ urlh = self._request_webpage(
+ asset_url, video_id, fatal=False,
+ headers=self.geo_verification_headers())
if not urlh:
continue
asset_url = urlh.geturl()
class SlidesLiveIE(InfoExtractor):
_VALID_URL = r'https?://slideslive\.com/(?P<id>[0-9]+)'
_TESTS = [{
+ # video_service_name = YOUTUBE
'url': 'https://slideslive.com/38902413/gcc-ia16-backend',
'md5': 'b29fcd6c6952d0c79c5079b0e7a07e6f',
'info_dict': {
'uploader_id': 'UC62SdArr41t_-_fX40QCLRw',
'upload_date': '20170925',
}
+ }, {
+ # video_service_name = youtube
+ 'url': 'https://slideslive.com/38903721/magic-a-scientific-resurrection-of-an-esoteric-legend',
+ 'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
video_data = self._download_json(
url, video_id, headers={'Accept': 'application/json'})
- service_name = video_data['video_service_name']
- if service_name == 'YOUTUBE':
+ service_name = video_data['video_service_name'].lower()
+ if service_name == 'youtube':
yt_video_id = video_data['video_service_id']
return self.url_result(yt_video_id, 'Youtube', video_id=yt_video_id)
else:
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
class SlutloadIE(InfoExtractor):
- _VALID_URL = r'^https?://(?:\w+\.)?slutload\.com/video/[^/]+/(?P<id>[^/]+)/?$'
+ _VALID_URL = r'https?://(?:\w+\.)?slutload\.com/(?:video/[^/]+|embed_player|watch)/(?P<id>[^/]+)'
_TESTS = [{
'url': 'http://www.slutload.com/video/virginie-baisee-en-cam/TD73btpBqSxc/',
'md5': '868309628ba00fd488cf516a113fd717',
'title': 'virginie baisee en cam',
'age_limit': 18,
'thumbnail': r're:https?://.*?\.jpg'
- }
+ },
}, {
# mobile site
'url': 'http://mobile.slutload.com/video/masturbation-solo/fviFLmc6kzJ/',
'only_matching': True,
+ }, {
+ 'url': 'http://www.slutload.com/embed_player/TD73btpBqSxc/',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.slutload.com/watch/TD73btpBqSxc/Virginie-Baisee-En-Cam.html',
+ 'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
- desktop_url = re.sub(r'^(https?://)mobile\.', r'\1', url)
- webpage = self._download_webpage(desktop_url, video_id)
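+ # Try the lightweight embed player first; fall back to parsing the full video page below.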
+ embed_page = self._download_webpage(
+ 'http://www.slutload.com/embed_player/%s' % video_id, video_id,
+ 'Downloading embed page', fatal=False)
- video_title = self._html_search_regex(r'<h1><strong>([^<]+)</strong>',
- webpage, 'title').strip()
+ if embed_page:
+ def extract(what):
+ return self._html_search_regex(
+ r'data-video-%s=(["\'])(?P<url>(?:(?!\1).)+)\1' % what,
+ embed_page, 'video %s' % what, default=None, group='url')
- video_url = self._html_search_regex(
- r'(?s)<div id="vidPlayer"\s+data-url="([^"]+)"',
- webpage, 'video URL')
- thumbnail = self._html_search_regex(
- r'(?s)<div id="vidPlayer"\s+.*?previewer-file="([^"]+)"',
- webpage, 'thumbnail', fatal=False)
+ video_url = extract('url')
+ if video_url:
+ title = self._html_search_regex(
+ r'<title>([^<]+)', embed_page, 'title', default=video_id)
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': title,
+ 'thumbnail': extract('preview'),
+ 'age_limit': 18
+ }
- return {
+ webpage = self._download_webpage(
+ 'http://www.slutload.com/video/_/%s/' % video_id, video_id)
+ title = self._html_search_regex(
+ r'<h1><strong>([^<]+)</strong>', webpage, 'title').strip()
+ info = self._parse_html5_media_entries(url, webpage, video_id)[0]
+ info.update({
'id': video_id,
- 'url': video_url,
- 'title': video_title,
- 'thumbnail': thumbnail,
- 'age_limit': 18
- }
+ 'title': title,
+ 'age_limit': 18,
+ })
+ return info
'title': title,
'url': video_url,
'thumbnail': thumbnail,
+ 'http_headers': {
+ 'Referer': url,
+ },
}
determine_ext,
dict_get,
int_or_none,
+ orderedSet,
+ strip_or_none,
try_get,
urljoin,
compat_str,
class SVTPlayIE(SVTPlayBaseIE):
IE_DESC = 'SVT Play and Öppet arkiv'
- _VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+)'
+ _VALID_URL = r'''(?x)
+ (?:
+ svt:(?P<svt_id>[^/?#&]+)|
+ https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+)
+ )
+ '''
_TESTS = [{
'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2',
'md5': '2b6704fe4a28801e1a098bbf3c5ac611',
}, {
'url': 'https://www.svtplay.se/kanaler/svt1',
'only_matching': True,
+ }, {
+ 'url': 'svt:1376446-003A',
+ 'only_matching': True,
+ }, {
+ 'url': 'svt:14278044',
+ 'only_matching': True,
}]
+ def _adjust_title(self, info):
+ if info['is_live']:
+ info['title'] = self._live_title(info['title'])
+
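+ # Resolve a video through the videoplayer API; when it returns no title,
+ # fall back to episode/series names or the page title.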
+ def _extract_by_video_id(self, video_id, webpage=None):
+ data = self._download_json(
+ 'https://api.svt.se/videoplayer-api/video/%s' % video_id,
+ video_id, headers=self.geo_verification_headers())
+ info_dict = self._extract_video(data, video_id)
+ if not info_dict.get('title'):
+ title = dict_get(info_dict, ('episode', 'series'))
+ if not title and webpage:
+ title = re.sub(
+ r'\s*\|\s*.+?$', '', self._og_search_title(webpage))
+ if not title:
+ title = video_id
+ info_dict['title'] = title
+ self._adjust_title(info_dict)
+ return info_dict
+
def _real_extract(self, url):
- video_id = self._match_id(url)
+ mobj = re.match(self._VALID_URL, url)
+ video_id, svt_id = mobj.group('id', 'svt_id')
+
+ if svt_id:
+ return self._extract_by_video_id(svt_id)
webpage = self._download_webpage(url, video_id)
thumbnail = self._og_search_thumbnail(webpage)
- def adjust_title(info):
- if info['is_live']:
- info['title'] = self._live_title(info['title'])
-
if data:
video_info = try_get(
data, lambda x: x['context']['dispatcher']['stores']['VideoTitlePageStore']['data']['video'],
'title': data['context']['dispatcher']['stores']['MetaStore']['title'],
'thumbnail': thumbnail,
})
- adjust_title(info_dict)
+ self._adjust_title(info_dict)
return info_dict
- video_id = self._search_regex(
+ svt_id = self._search_regex(
r'<video[^>]+data-video-id=["\']([\da-zA-Z-]+)',
- webpage, 'video id', default=None)
+ webpage, 'video id')
- if video_id:
- data = self._download_json(
- 'https://api.svt.se/videoplayer-api/video/%s' % video_id,
- video_id, headers=self.geo_verification_headers())
- info_dict = self._extract_video(data, video_id)
- if not info_dict.get('title'):
- info_dict['title'] = re.sub(
- r'\s*\|\s*.+?$', '',
- info_dict.get('episode') or self._og_search_title(webpage))
- adjust_title(info_dict)
- return info_dict
+ return self._extract_by_video_id(svt_id, webpage)
class SVTSeriesIE(SVTPlayBaseIE):
return self.playlist_result(
entries, series_id, title, metadata.get('description'))
+
+
+class SVTPageIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?svt\.se/(?:[^/]+/)*(?P<id>[^/?&#]+)'
+ _TESTS = [{
+ 'url': 'https://www.svt.se/sport/oseedat/guide-sommartraningen-du-kan-gora-var-och-nar-du-vill',
+ 'info_dict': {
+ 'id': 'guide-sommartraningen-du-kan-gora-var-och-nar-du-vill',
+ 'title': 'GUIDE: Sommarträning du kan göra var och när du vill',
+ },
+ 'playlist_count': 7,
+ }, {
+ 'url': 'https://www.svt.se/nyheter/inrikes/ebba-busch-thor-kd-har-delvis-ratt-om-no-go-zoner',
+ 'info_dict': {
+ 'id': 'ebba-busch-thor-kd-har-delvis-ratt-om-no-go-zoner',
+ 'title': 'Ebba Busch Thor har bara delvis rätt om ”no-go-zoner”',
+ },
+ 'playlist_count': 1,
+ }, {
+ # only programTitle
+ 'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun',
+ 'info_dict': {
+ 'id': '2900353',
+ 'ext': 'mp4',
+ 'title': 'Stjärnorna skojar till det - under SVT-intervjun',
+ 'duration': 27,
+ 'age_limit': 0,
+ },
+ }, {
+ 'url': 'https://www.svt.se/nyheter/lokalt/vast/svt-testar-tar-nagon-upp-skrapet-1',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.svt.se/vader/manadskronikor/maj2018',
+ 'only_matching': True,
+ }]
+
+ @classmethod
+ def suitable(cls, url):
+ return False if SVTIE.suitable(url) else super(SVTPageIE, cls).suitable(url)
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, playlist_id)
+
+ entries = [
+ self.url_result(
+ 'svt:%s' % video_id, ie=SVTPlayIE.ie_key(), video_id=video_id)
+ for video_id in orderedSet(re.findall(
+ r'data-video-id=["\'](\d+)', webpage))]
+
+ title = strip_or_none(self._og_search_title(webpage, default=None))
+
+ return self.playlist_result(entries, playlist_id, title)
from ..compat import compat_str
from ..utils import (
+ float_or_none,
int_or_none,
try_get,
+ url_or_none,
)
'''
_TESTS = [{
'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
- 'md5': '0de43ac406aa3e4ea74b66c9c7789b13',
+ 'md5': 'b0ce2b05ca215042124fbc9e3886493a',
'info_dict': {
'id': '102',
'ext': 'mp4',
'uploader': 'Dan Dennett',
'width': 853,
'duration': 1308,
- }
+ 'view_count': int,
+ 'comment_count': int,
+ 'tags': list,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}, {
- 'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
- 'md5': 'b899ac15e345fb39534d913f7606082b',
+ # missing HTTP bitrates
+ 'url': 'https://www.ted.com/talks/vishal_sikka_the_beauty_and_power_of_algorithms',
'info_dict': {
- 'id': 'tSVI8ta_P4w',
+ 'id': '6069',
'ext': 'mp4',
- 'title': 'Vishal Sikka: The beauty and power of algorithms',
+ 'title': 'The beauty and power of algorithms',
'thumbnail': r're:^https?://.+\.jpg',
- 'description': 'md5:6261fdfe3e02f4f579cbbfc00aff73f4',
- 'upload_date': '20140122',
- 'uploader_id': 'TEDInstitute',
- 'uploader': 'TED Institute',
+ 'description': 'md5:734e352710fb00d840ab87ae31aaf688',
+ 'uploader': 'Vishal Sikka',
+ },
+ 'params': {
+ 'skip_download': True,
},
- 'add_ie': ['Youtube'],
}, {
'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
- 'md5': '71b3ab2f4233012dce09d515c9c39ce2',
+ 'md5': 'e6b9617c01a7970ceac8bb2c92c346c0',
'info_dict': {
'id': '1972',
'ext': 'mp4',
'description': 'md5:5174aed4d0f16021b704120360f72b92',
'duration': 1128,
},
+ 'params': {
+ 'skip_download': True,
+ },
}, {
'url': 'http://www.ted.com/playlists/who_are_the_hackers',
'info_dict': {
'skip_download': True,
},
}, {
- # YouTube video
- 'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
- 'add_ie': ['Youtube'],
+ # no nativeDownloads
+ 'url': 'https://www.ted.com/talks/tom_thum_the_orchestra_in_my_mouth',
'info_dict': {
- 'id': 'aFBIPO-P7LM',
+ 'id': '1792',
'ext': 'mp4',
- 'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
- 'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
- 'uploader': 'TEDx Talks',
- 'uploader_id': 'TEDxTalks',
- 'upload_date': '20111216',
+ 'title': 'The orchestra in my mouth',
+ 'description': 'md5:5d1d78650e2f8dfcbb8ebee2951ac29a',
+ 'uploader': 'Tom Thum',
+ 'view_count': int,
+ 'comment_count': int,
+ 'tags': list,
},
'params': {
'skip_download': True,
info = self._extract_info(webpage)
- talk_info = try_get(
- info, lambda x: x['__INITIAL_DATA__']['talks'][0],
- dict) or info['talks'][0]
+ data = try_get(info, lambda x: x['__INITIAL_DATA__'], dict) or info
+ talk_info = data['talks'][0]
title = talk_info['title'].strip()
- external = talk_info.get('external')
- if external:
- service = external['service']
- self.to_screen('Found video from %s' % service)
- ext_url = None
- if service.lower() == 'youtube':
- ext_url = external.get('code')
- return {
- '_type': 'url',
- 'url': ext_url or external['uri'],
- }
-
native_downloads = try_get(
- talk_info, lambda x: x['downloads']['nativeDownloads'],
- dict) or talk_info['nativeDownloads']
+ talk_info,
+ (lambda x: x['downloads']['nativeDownloads'],
+ lambda x: x['nativeDownloads']),
+ dict) or {}
formats = [{
'url': format_url,
player_talk = talk_info['player_talks'][0]
+ external = player_talk.get('external')
+ if isinstance(external, dict):
+ service = external.get('service')
+ if isinstance(service, compat_str):
+ ext_url = None
+ if service.lower() == 'youtube':
+ ext_url = external.get('code')
+ return {
+ '_type': 'url',
+ 'url': ext_url or external['uri'],
+ }
+
resources_ = player_talk.get('resources') or talk_info.get('resources')
http_url = None
for format_id, resources in resources_.items():
if format_id == 'h264':
for resource in resources:
h264_url = resource.get('file')
'tbr': int_or_none(resource.get('bitrate')),
})
elif format_id == 'hls':
+ if not isinstance(resources, dict):
+ continue
+ stream_url = url_or_none(resources.get('stream'))
+ if not stream_url:
+ continue
formats.extend(self._extract_m3u8_formats(
- resources.get('stream'), video_name, 'mp4', m3u8_id=format_id, fatal=False))
+ stream_url, video_name, 'mp4', m3u8_id=format_id,
+ fatal=False))
m3u8_formats = list(filter(
lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none',
bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None)
if not bitrate:
continue
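+ # Derive a progressive HTTP URL for each HLS bitrate and keep it only if the server confirms it exists.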
+ bitrate_url = re.sub(r'\d+k', bitrate, http_url)
+ if not self._is_valid_url(
+ bitrate_url, video_name, '%s bitrate' % bitrate):
+ continue
f = m3u8_format.copy()
f.update({
- 'url': re.sub(r'\d+k', bitrate, http_url),
+ 'url': bitrate_url,
'format_id': m3u8_format['format_id'].replace('hls', 'http'),
'protocol': 'http',
})
'description': self._og_search_description(webpage),
'subtitles': self._get_subtitles(video_id, talk_info),
'formats': formats,
- 'duration': talk_info.get('duration'),
+ 'duration': float_or_none(talk_info.get('duration')),
+ 'view_count': int_or_none(data.get('viewed_count')),
+ 'comment_count': int_or_none(
+ try_get(data, lambda x: x['comments']['count'])),
+ 'tags': try_get(talk_info, lambda x: x['tags'], list),
}
def _get_subtitles(self, video_id, talk_info):
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .nexx import NexxIE
+from ..compat import compat_urlparse
+
+
+class Tele5IE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?tele5\.de/(?:mediathek|tv)/(?P<id>[^?#&]+)'
+ _TESTS = [{
+ 'url': 'https://www.tele5.de/mediathek/filme-online/videos?vid=1549416',
+ 'info_dict': {
+ 'id': '1549416',
+ 'ext': 'mp4',
+ 'upload_date': '20180814',
+ 'timestamp': 1534290623,
+ 'title': 'Pandorum',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://www.tele5.de/tv/kalkofes-mattscheibe/video-clips/politik-und-gesellschaft?ve_id=1551191',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.tele5.de/tv/dark-matter/videos',
+ 'only_matching': True,
+ }]
+
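+ # The video id normally arrives as a vid/ve_id query parameter; otherwise
+ # scrape the player's data-id and hand off to Nexx.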
+ def _real_extract(self, url):
+ qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+ video_id = (qs.get('vid') or qs.get('ve_id') or [None])[0]
+
+ if not video_id:
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ video_id = self._html_search_regex(
+ r'id\s*=\s*["\']video-player["\'][^>]+data-id\s*=\s*["\'](\d+)',
+ webpage, 'video id')
+
+ return self.url_result(
+ 'https://api.nexx.cloud/v3/759/videos/byid/%s' % video_id,
+ ie=NexxIE.ie_key(), video_id=video_id)
# coding: utf-8
from __future__ import unicode_literals
-from .mitele import MiTeleBaseIE
+import json
+import re
+from .common import InfoExtractor
+from .ooyala import OoyalaIE
+from ..utils import (
+ clean_html,
+ determine_ext,
+ int_or_none,
+ str_or_none,
+ urljoin,
+)
-class TelecincoIE(MiTeleBaseIE):
+
+class TelecincoIE(InfoExtractor):
IE_DESC = 'telecinco.es, cuatro.com and mediaset.es'
_VALID_URL = r'https?://(?:www\.)?(?:telecinco\.es|cuatro\.com|mediaset\.es)/(?:[^/]+/)+(?P<id>.+?)\.html'
_TESTS = [{
'url': 'http://www.telecinco.es/robinfood/temporada-01/t01xp14/Bacalao-cocochas-pil-pil_0_1876350223.html',
- 'md5': '8d7b2d5f699ee2709d992a63d5cd1712',
'info_dict': {
- 'id': 'JEA5ijCnF6p5W08A1rNKn7',
- 'ext': 'mp4',
+ 'id': '1876350223',
'title': 'Bacalao con kokotxas al pil-pil',
'description': 'md5:1382dacd32dd4592d478cbdca458e5bb',
- 'duration': 662,
},
+ 'playlist': [{
+ 'md5': 'adb28c37238b675dad0f042292f209a7',
+ 'info_dict': {
+ 'id': 'JEA5ijCnF6p5W08A1rNKn7',
+ 'ext': 'mp4',
+ 'title': 'Con Martín Berasategui, hacer un bacalao al pil-pil es fácil y divertido',
+ 'duration': 662,
+ },
+ }]
}, {
'url': 'http://www.cuatro.com/deportes/futbol/barcelona/Leo_Messi-Champions-Roma_2_2052780128.html',
- 'md5': '284393e5387b3b947b77c613ef04749a',
+ 'md5': '9468140ebc300fbb8b9d65dc6e5c4b43',
'info_dict': {
'id': 'jn24Od1zGLG4XUZcnUnZB6',
'ext': 'mp4',
},
}, {
'url': 'http://www.mediaset.es/12meses/campanas/doylacara/conlatratanohaytrato/Ayudame-dar-cara-trata-trato_2_1986630220.html',
- 'md5': '749afab6ea5a136a8806855166ae46a2',
+ 'md5': 'ae2dc6b7b50b2392076a51c0f70e01f6',
'info_dict': {
'id': 'aywerkD2Sv1vGNqq9b85Q2',
'ext': 'mp4',
'only_matching': True,
}]
+ def _parse_content(self, content, url):
+ video_id = content['dataMediaId']
+ if content.get('dataCmsId') == 'ooyala':
+ return self.url_result(
+ 'ooyala:%s' % video_id, OoyalaIE.ie_key(), video_id)
+ config_url = urljoin(url, content['dataConfig'])
+ config = self._download_json(
+ config_url, video_id, 'Downloading config JSON')
+ title = config['info']['title']
+
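+ # The config references a single mmc JSON; swapping the flash/html5 suffix yields the other variant.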
+ def mmc_url(mmc_type):
+ return re.sub(
+ r'/(?:flash|html5)\.json', '/%s.json' % mmc_type,
+ config['services']['mmc'])
+
+ duration = None
+ formats = []
+ for mmc_type in ('flash', 'html5'):
+ mmc = self._download_json(
+ mmc_url(mmc_type), video_id,
+ 'Downloading %s mmc JSON' % mmc_type, fatal=False)
+ if not mmc:
+ continue
+ if not duration:
+ duration = int_or_none(mmc.get('duration'))
+ for location in mmc['locations']:
+ gat = self._proto_relative_url(location.get('gat'), 'http:')
+ gcp = location.get('gcp')
+ ogn = location.get('ogn')
+ if None in (gat, gcp, ogn):
+ continue
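+ # Exchange the location's gcp/ogn identifiers at the gat endpoint for the actual stream URL.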
+ token_data = {
+ 'gcp': gcp,
+ 'ogn': ogn,
+ 'sta': 0,
+ }
+ media = self._download_json(
+ gat, video_id, data=json.dumps(token_data).encode('utf-8'),
+ headers={
+ 'Content-Type': 'application/json;charset=utf-8',
+ 'Referer': url,
+ }, fatal=False) or {}
+ stream = media.get('stream') or media.get('file')
+ if not stream:
+ continue
+ ext = determine_ext(stream)
+ if ext == 'f4m':
+ formats.extend(self._extract_f4m_formats(
+ stream + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18',
+ video_id, f4m_id='hds', fatal=False))
+ elif ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ stream, video_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False))
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnail': content.get('dataPoster') or config.get('poster', {}).get('imageUrl'),
+ 'duration': duration,
+ }
+
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
- title = self._html_search_meta(
- ['og:title', 'twitter:title'], webpage, 'title')
- info = self._get_player_info(url, webpage)
+ article = self._parse_json(self._search_regex(
+ r'window\.\$REACTBASE_STATE\.article\s*=\s*({.+})',
+ webpage, 'article'), display_id)['article']
+ title = article.get('title')
+ description = clean_html(article.get('leadParagraph'))
+ if article.get('editorialType') != 'VID':
+ entries = []
+ for p in article.get('body', []):
+ content = p.get('content')
+ if p.get('type') != 'video' or not content:
+ continue
+ entries.append(self._parse_content(content, url))
+ return self.playlist_result(
+ entries, str_or_none(article.get('id')), title, description)
+ content = article['opening']['content']
+ info = self._parse_content(content, url)
info.update({
- 'display_id': display_id,
- 'title': title,
- 'description': self._html_search_meta(
- ['og:description', 'twitter:description'],
- webpage, 'title', fatal=False),
+ 'description': description,
})
return info
class ThePlatformBaseIE(OnceIE):
+ _TP_TLD = 'com'
+
def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'):
meta = self._download_xml(
smil_url, video_id, note=note, query={'format': 'SMIL'},
headers=self.geo_verification_headers())
error_element = find_xpath_attr(meta, _x('.//smil:ref'), 'src')
if error_element is not None and error_element.attrib['src'].startswith(
- 'http://link.theplatform.com/s/errorFiles/Unavailable.'):
+ 'http://link.theplatform.%s/s/errorFiles/Unavailable.' % self._TP_TLD):
raise ExtractorError(error_element.attrib['abstract'], expected=True)
smil_formats = self._parse_smil_formats(
return formats, subtitles
def _download_theplatform_metadata(self, path, video_id):
- info_url = 'http://link.theplatform.com/s/%s?format=preview' % path
+ info_url = 'http://link.theplatform.%s/s/%s?format=preview' % (self._TP_TLD, path)
return self._download_json(info_url, video_id)
def _parse_theplatform_metadata(self, info):
class ThePlatformFeedIE(ThePlatformBaseIE):
_URL_TEMPLATE = '%s//feed.theplatform.com/f/%s/%s?form=json&%s'
- _VALID_URL = r'https?://feed\.theplatform\.com/f/(?P<provider_id>[^/]+)/(?P<feed_id>[^?/]+)\?(?:[^&]+&)*(?P<filter>by(?:Gui|I)d=(?P<id>[\w-]+))'
+ _VALID_URL = r'https?://feed\.theplatform\.com/f/(?P<provider_id>[^/]+)/(?P<feed_id>[^?/]+)\?(?:[^&]+&)*(?P<filter>by(?:Gui|I)d=(?P<id>[^&]+))'
_TESTS = [{
# From http://player.theplatform.com/p/7wvmTC/MSNBCEmbeddedOffSite?guid=n_hardball_5biden_140207
'url': 'http://feed.theplatform.com/f/7wvmTC/msnbc_video-p-test?form=json&pretty=true&range=-40&byGuid=n_hardball_5biden_140207',
'categories': ['MSNBC/Issues/Democrats', 'MSNBC/Issues/Elections/Election 2016'],
'uploader': 'NBCU-NEWS',
},
+ }, {
+ 'url': 'http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews?byGuid=nn_netcast_180306.Copy.01',
+ 'only_matching': True,
}]
def _extract_feed_info(self, provider_id, feed_id, filter_query, video_id, custom_fields=None, asset_types_query={}, account_id=None):
update_url_query,
ExtractorError,
strip_or_none,
+ url_or_none,
)
subtitles = {}
for source in video_data.findall('closedCaptions/source'):
for track in source.findall('track'):
- track_url = track.get('url')
- if not isinstance(track_url, compat_str) or track_url.endswith('/big'):
+ track_url = url_or_none(track.get('url'))
+ if not track_url or track_url.endswith('/big'):
continue
lang = track.get('lang') or track.get('label') or 'en'
subtitles.setdefault(lang, []).append({
import re
from .common import InfoExtractor
-from ..compat import compat_str
from ..utils import (
int_or_none,
unescapeHTML,
+ url_or_none,
)
for stream in self._download_json(data_file, video_id):
if not isinstance(stream, dict):
continue
- stream_url = stream.get('url')
- if (stream_url in stream_urls or not stream_url or
- not isinstance(stream_url, compat_str)):
+ stream_url = url_or_none(stream.get('url'))
+ if stream_url in stream_urls or not stream_url:
continue
stream_urls.add(stream_url)
formats.extend(self._extract_m3u8_formats(
_VIDEO_FIELDS = (
'id', 'title', 'free', 'geoblocked', 'articleLong', 'articleShort',
'broadcastStartDate', 'isDrm', 'duration', 'season', 'episode',
- 'manifest.dashclear', 'format.title', 'format.defaultImage169Format',
- 'format.defaultImage169Logo')
+ 'manifest.dashclear', 'manifest.hlsclear', 'manifest.smoothclear',
+ 'format.title', 'format.defaultImage169Format', 'format.defaultImage169Logo')
def _call_api(self, path, video_id, query):
return self._download_json(
video_id = compat_str(info['id'])
title = info['title']
- mpd_url = info['manifest']['dashclear']
- if not mpd_url:
+ paths = []
+ for manifest_url in (info.get('manifest') or {}).values():
+ if not manifest_url:
+ continue
+ manifest_url = update_url_query(manifest_url, {'filter': ''})
+ path = self._search_regex(r'https?://[^/]+/(.+?)\.ism/', manifest_url, 'path')
+ if path in paths:
+ continue
+ paths.append(path)
+
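+ # Any one manifest URL can be rewritten into the DASH, HLS and Smooth
+ # Streaming variants by swapping the protocol token and the manifest suffix.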
+ def url_repl(proto, suffix):
+ return re.sub(
+ r'(?:hls|dash|hss)([.-])', proto + r'\1', re.sub(
+ r'\.ism/(?:[^.]*\.(?:m3u8|mpd)|[Mm]anifest)',
+ '.ism/' + suffix, manifest_url))
+
+ formats = self._extract_mpd_formats(
+ url_repl('dash', '.mpd'), video_id,
+ mpd_id='dash', fatal=False)
+ formats.extend(self._extract_ism_formats(
+ url_repl('hss', 'Manifest'),
+ video_id, ism_id='mss', fatal=False))
+ formats.extend(self._extract_m3u8_formats(
+ url_repl('hls', '.m3u8'), video_id, 'mp4',
+ 'm3u8_native', m3u8_id='hls', fatal=False))
+ if formats:
+ break
+ else:
if info.get('isDrm'):
raise ExtractorError(
'Video %s is DRM protected' % video_id, expected=True)
if info.get('geoblocked'):
- raise ExtractorError(
- 'Video %s is not available from your location due to geo restriction' % video_id,
- expected=True)
+ self.raise_geo_restricted()
if not info.get('free', True):
raise ExtractorError(
'Video %s is not available for free' % video_id, expected=True)
-
- mpd_url = update_url_query(mpd_url, {'filter': ''})
- formats = self._extract_mpd_formats(mpd_url, video_id, mpd_id='dash', fatal=False)
- formats.extend(self._extract_ism_formats(
- mpd_url.replace('dash.', 'hss.').replace('/.mpd', '/Manifest'),
- video_id, ism_id='mss', fatal=False))
- formats.extend(self._extract_m3u8_formats(
- mpd_url.replace('dash.', 'hls.').replace('/.mpd', '/.m3u8'),
- video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
self._sort_formats(formats)
description = info.get('articleLong') or info.get('articleShort')
class TVNowIE(TVNowBaseIE):
_VALID_URL = r'''(?x)
https?://
- (?:www\.)?tvnow\.(?:de|at|ch)/[^/]+/
+ (?:www\.)?tvnow\.(?:de|at|ch)/(?P<station>[^/]+)/
(?P<show_id>[^/]+)/
(?!(?:list|jahr)(?:/|$))(?P<id>[^/?\#&]+)
'''
}]
def _real_extract(self, url):
- display_id = '%s/%s' % re.match(self._VALID_URL, url).groups()
+ mobj = re.match(self._VALID_URL, url)
+ display_id = '%s/%s' % mobj.group(2, 3)
info = self._call_api(
'movies/' + display_id, display_id, query={
'fields': ','.join(self._VIDEO_FIELDS),
+ 'station': mobj.group(1),
})
return self._extract_video(info, display_id)
try_get,
unsmuggle_url,
update_url_query,
+ url_or_none,
)
https?://
(?:www\.)?
(?:
- tvplay(?:\.skaties)?\.lv/parraides|
- (?:tv3play|play\.tv3)\.lt/programos|
+ tvplay(?:\.skaties)?\.lv(?:/parraides)?|
+ (?:tv3play|play\.tv3)\.lt(?:/programos)?|
tv3play(?:\.tv3)?\.ee/sisu|
(?:tv(?:3|6|8|10)play|viafree)\.se/program|
(?:(?:tv3play|viasat4play|tv6play|viafree)\.no|(?:tv3play|viafree)\.dk)/programmer|
- play\.novatv\.bg/programi
+ play\.nova(?:tv)?\.bg/programi
)
/(?:[^/]+/)+
)
'skip_download': True,
},
},
+ {
+ 'url': 'https://play.nova.bg/programi/zdravei-bulgariya/764300?autostart=true',
+ 'only_matching': True,
+ },
{
'url': 'http://tvplay.skaties.lv/parraides/vinas-melo-labak/418113?autostart=true',
'only_matching': True,
},
+ {
+ 'url': 'https://tvplay.skaties.lv/vinas-melo-labak/418113/?autostart=true',
+ 'only_matching': True,
+ },
{
# views is null
'url': 'http://tvplay.skaties.lv/parraides/tv3-zinas/760183',
quality = qualities(['hls', 'medium', 'high'])
formats = []
for format_id, video_url in streams.get('streams', {}).items():
- if not video_url or not isinstance(video_url, compat_str):
+ video_url = url_or_none(video_url)
+ if not video_url:
continue
ext = determine_ext(video_url)
if ext == 'f4m':
'url': m.group('url'),
'app': m.group('app'),
'play_path': m.group('playpath'),
+ 'preference': -1,
})
else:
fmt.update({
'skip_rtmp': True,
}),
ie=TVPlayIE.ie_key(), video_id=video_id)
+
+
+class TVPlayHomeIE(InfoExtractor):
+ _VALID_URL = r'https?://tvplay\.(?:tv3\.lt|skaties\.lv|tv3\.ee)/[^/]+/[^/?#&]+-(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://tvplay.tv3.lt/aferistai-n-7/aferistai-10047125/',
+ 'info_dict': {
+ 'id': '366367',
+ 'ext': 'mp4',
+ 'title': 'Aferistai',
+ 'description': 'Aferistai. Kalėdinė pasaka.',
+ 'series': 'Aferistai [N-7]',
+ 'season': '1 sezonas',
+ 'season_number': 1,
+ 'duration': 464,
+ 'timestamp': 1394209658,
+ 'upload_date': '20140307',
+ 'age_limit': 18,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'add_ie': [TVPlayIE.ie_key()],
+ }, {
+ 'url': 'https://tvplay.skaties.lv/vinas-melo-labak/vinas-melo-labak-10280317/',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://tvplay.tv3.ee/cool-d-ga-mehhikosse/cool-d-ga-mehhikosse-10044354/',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
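+ # Pages backed by the old MTG platform expose a numeric asset id; delegate those to TVPlayIE.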
+ video_id = self._search_regex(
+ r'data-asset-id\s*=\s*["\'](\d{5,7})\b', webpage, 'video id',
+ default=None)
+
+ if video_id:
+ return self.url_result(
+ 'mtg:%s' % video_id, ie=TVPlayIE.ie_key(), video_id=video_id)
+
+ m3u8_url = self._search_regex(
+ r'data-file\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
+ 'm3u8 url', group='url')
+
+ formats = self._extract_m3u8_formats(
+ m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls')
+ self._sort_formats(formats)
+
+ title = self._search_regex(
+ r'data-title\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage,
+ 'title', default=None, group='value') or self._html_search_meta(
+ 'title', webpage, default=None) or self._og_search_title(
+ webpage)
+
+ description = self._html_search_meta(
+ 'description', webpage,
+ default=None) or self._og_search_description(webpage)
+
+ thumbnail = self._search_regex(
+ r'data-image\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
+ 'thumbnail', default=None, group='url') or self._html_search_meta(
+ 'thumbnail', webpage, default=None) or self._og_search_thumbnail(
+ webpage)
+
+ duration = int_or_none(self._search_regex(
+ r'data-duration\s*=\s*["\'](\d+)', webpage, 'duration',
+ fatal=False))
+
+ season = self._search_regex(
+ (r'data-series-title\s*=\s*(["\'])[^/]+/(?P<value>(?:(?!\1).)+)\1',
+ r'\bseason\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1'), webpage,
+ 'season', default=None, group='value')
+ season_number = int_or_none(self._search_regex(
+ r'(\d+)(?:[.\s]+sezona|\s+HOOAEG)', season or '', 'season number',
+ default=None))
+ episode = self._search_regex(
+ r'\bepisode\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, 'episode',
+ default=None, group='value')
+ episode_number = int_or_none(self._search_regex(
+ r'(?:S[eē]rija|Osa)\s+(\d+)', episode or '', 'episode number',
+ default=None))
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'season': season,
+ 'season_number': season_number,
+ 'episode': episode,
+ 'episode_number': episode_number,
+ 'formats': formats,
+ }
import itertools
import re
import random
+import json
from .common import InfoExtractor
from ..compat import (
- compat_HTTPError,
compat_kwargs,
compat_parse_qs,
compat_str,
try_get,
unified_timestamp,
update_url_query,
- urlencode_postdata,
+ url_or_none,
urljoin,
)
_API_BASE = 'https://api.twitch.tv'
_USHER_BASE = 'https://usher.ttvnw.net'
- _LOGIN_URL = 'https://www.twitch.tv/login'
- _CLIENT_ID = 'jzkbprff40iqj646a697cyrvl0zt2m6'
+ _LOGIN_FORM_URL = 'https://www.twitch.tv/login'
+ _LOGIN_POST_URL = 'https://passport.twitch.tv/login'
+ _CLIENT_ID = 'kimne78kx3ncx6brgo4mv6wki5h1ko'
_NETRC_MACHINE = 'twitch'
def _handle_error(self, response):
page_url = urlh.geturl()
post_url = self._search_regex(
r'<form[^>]+action=(["\'])(?P<url>.+?)\1', page,
- 'post url', default=page_url, group='url')
+ 'post url', default=self._LOGIN_POST_URL, group='url')
post_url = urljoin(page_url, post_url)
- headers = {'Referer': page_url}
+ headers = {
+ 'Referer': page_url,
+ 'Origin': page_url,
+ 'Content-Type': 'text/plain;charset=UTF-8',
+ }
- try:
- response = self._download_json(
- post_url, None, note,
- data=urlencode_postdata(form),
- headers=headers)
- except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
- response = self._parse_json(
- e.cause.read().decode('utf-8'), None)
- fail(response.get('message') or response['errors'][0])
- raise
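+ # passport.twitch.tv takes JSON credentials and reports failures in a 400
+ # JSON body, so parse it via expected_status instead of trapping HTTPError.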
+ response = self._download_json(
+ post_url, None, note, data=json.dumps(form).encode(),
+ headers=headers, expected_status=400)
+ error = response.get('error_description') or response.get('error_code')
+ if error:
+ fail(error)
if 'Authenticated successfully' in response.get('message', ''):
return None, None
headers=headers)
login_page, handle = self._download_webpage_handle(
- self._LOGIN_URL, None, 'Downloading login page')
+ self._LOGIN_FORM_URL, None, 'Downloading login page')
# Some TOR nodes and public proxies are blocked completely
if 'blacklist_message' in login_page:
login_page, handle, 'Logging in', {
'username': username,
'password': password,
+ 'client_id': self._CLIENT_ID,
})
# Successful login
_VALID_URL = r'''(?x)
https?://
(?:
- (?:(?:www|go|m)\.)?twitch\.tv/(?:[^/]+/v|videos)/|
+ (?:(?:www|go|m)\.)?twitch\.tv/(?:[^/]+/v(?:ideo)?|videos)/|
player\.twitch\.tv/\?.*?\bvideo=v
)
(?P<id>\d+)
}, {
'url': 'https://m.twitch.tv/beagsandjam/v/247478721',
'only_matching': True,
+ }, {
+ 'url': 'https://www.twitch.tv/northernlion/video/291940395',
+ 'only_matching': True,
}]
def _real_extract(self, url):
for option in status['quality_options']:
if not isinstance(option, dict):
continue
- source = option.get('source')
- if not source or not isinstance(source, compat_str):
+ source = url_or_none(option.get('source'))
+ if not source:
continue
formats.append({
'url': source,
sanitized_Request,
try_get,
unescapeHTML,
+ url_or_none,
urlencode_postdata,
)
if not isinstance(source_list, list):
return
for source in source_list:
- video_url = source.get('file') or source.get('src')
- if not video_url or not isinstance(video_url, compat_str):
+ video_url = url_or_none(source.get('file') or source.get('src'))
+ if not video_url:
continue
if source.get('type') == 'application/x-mpegURL' or determine_ext(video_url) == 'm3u8':
formats.extend(self._extract_m3u8_formats(
continue
if track.get('kind') != 'captions':
continue
- src = track.get('src')
- if not src or not isinstance(src, compat_str):
+ src = url_or_none(track.get('src'))
+ if not src:
continue
lang = track.get('language') or track.get(
'srclang') or track.get('label')
for cc in captions:
if not isinstance(cc, dict):
continue
- cc_url = cc.get('url')
- if not cc_url or not isinstance(cc_url, compat_str):
+ cc_url = url_or_none(cc.get('url'))
+ if not cc_url:
continue
lang = try_get(cc, lambda x: x['locale']['locale'], compat_str)
sub_dict = (automatic_captions if cc.get('source') == 'auto'
'aftenposten.no/webtv': 'aptv',
'ap.vgtv.no/webtv': 'aptv',
'tv.aftonbladet.se/abtv': 'abtv',
+ 'www.aftonbladet.se/tv': 'abtv',
}
_APP_NAME_TO_VENDOR = {
(?:
(?:\#!/)?(?:video|live)/|
embed?.*id=|
- articles/
+ a(?:rticles)?/
)|
(?P<appname>
%s
'url': 'http://tv.aftonbladet.se/abtv/articles/36015',
'only_matching': True,
},
+ {
+ 'url': 'https://www.aftonbladet.se/tv/a/36015',
+ 'only_matching': True,
+ },
{
'url': 'abtv:140026',
'only_matching': True,
streams = data['streamUrls']
stream_type = data.get('streamType')
-
+ is_live = stream_type == 'live'
formats = []
hls_url = streams.get('hls')
if hls_url:
formats.extend(self._extract_m3u8_formats(
- hls_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
+ hls_url, video_id, 'mp4',
+ entry_protocol='m3u8' if is_live else 'm3u8_native',
+ m3u8_id='hls', fatal=False))
hds_url = streams.get('hds')
if hds_url:
info.update({
'id': video_id,
- 'title': self._live_title(data['title']) if stream_type == 'live' else data['title'],
+ 'title': self._live_title(data['title']) if is_live else data['title'],
'description': data['description'],
'thumbnail': data['images']['main'] + '?t[]=900x506q80',
'timestamp': data['published'],
'duration': float_or_none(data['duration'], 1000),
'view_count': data['displays'],
- 'is_live': True if stream_type == 'live' else False,
+ 'is_live': is_live,
})
return info
import itertools
from .common import InfoExtractor
-from ..compat import (
- compat_HTTPError,
- compat_str,
-)
+from ..compat import compat_HTTPError
from ..utils import (
ExtractorError,
int_or_none,
float_or_none,
parse_iso8601,
+ url_or_none,
)
formats = []
for f in video.get('formats', []):
- format_url = f.get('uri')
- if not format_url or not isinstance(format_url, compat_str):
+ format_url = url_or_none(f.get('uri'))
+ if not format_url:
continue
format_type = f.get('type')
if format_type == 'dash':
class VidziIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?vidzi\.(?:tv|cc|si)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)'
+ _VALID_URL = r'https?://(?:www\.)?vidzi\.(?:tv|cc|si|nu)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)'
_TESTS = [{
'url': 'http://vidzi.tv/cghql9yq6emu.html',
'md5': '4f16c71ca0c8c8635ab6932b5f3f1660',
}, {
'url': 'https://vidzi.si/rph9gztxj1et.html',
'only_matching': True,
+ }, {
+ 'url': 'http://vidzi.nu/cghql9yq6emu.html',
+ 'only_matching': True,
}]
def _real_extract(self, url):
# We try to find out to which variable the config dict is assigned
m_variable_name = re.search(r'(\w)\.video\.id', webpage)
if m_variable_name is not None:
- config_re = r'%s=({[^}].+?});' % re.escape(m_variable_name.group(1))
+ config_re = [r'%s=({[^}].+?});' % re.escape(m_variable_name.group(1))]
else:
config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});']
+ config_re.append(r'\bvar\s+r\s*=\s*({.+?})\s*;')
config = self._search_regex(config_re, webpage, 'info section',
flags=re.DOTALL)
config = json.loads(config)
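# Illustrative (assumed) markup each pattern family targets: the
# m_variable_name branch handles pages that assign the player to a short
# variable, while the newly appended fallback matches configs inlined as:
#
#   var r = {"video": {"id": 123456789}};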
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ str_or_none,
+ url_or_none,
+)
+
+
+class ViqeoIE(InfoExtractor):
+ _VALID_URL = r'''(?x)
+ (?:
+ viqeo:|
+ https?://cdn\.viqeo\.tv/embed/*\?.*?\bvid=|
+ https?://api\.viqeo\.tv/v\d+/data/startup?.*?\bvideo(?:%5B%5D|\[\])=
+ )
+ (?P<id>[\da-f]+)
+ '''
+ _TESTS = [{
+ 'url': 'https://cdn.viqeo.tv/embed/?vid=cde96f09d25f39bee837',
+ 'md5': 'a169dd1a6426b350dca4296226f21e76',
+ 'info_dict': {
+ 'id': 'cde96f09d25f39bee837',
+ 'ext': 'mp4',
+ 'title': 'cde96f09d25f39bee837',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 76,
+ },
+ }, {
+ 'url': 'viqeo:cde96f09d25f39bee837',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://api.viqeo.tv/v1/data/startup?video%5B%5D=71bbec412ade45c3216c&profile=112',
+ 'only_matching': True,
+ }]
+
+ @staticmethod
+ def _extract_urls(webpage):
+ return [
+ mobj.group('url')
+ for mobj in re.finditer(
+ r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//cdn\.viqeo\.tv/embed/*\?.*?\bvid=[\da-f]+.*?)\1',
+ webpage)]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(
+ 'https://cdn.viqeo.tv/embed/?vid=%s' % video_id, video_id)
+
+ data = self._parse_json(
+ self._search_regex(
+ r'SLOT_DATA\s*=\s*({.+?})\s*;', webpage, 'slot data'),
+ video_id)
+
+ formats = []
+ thumbnails = []
+ for media_file in data['mediaFiles']:
+ if not isinstance(media_file, dict):
+ continue
+ media_url = url_or_none(media_file.get('url'))
+ if not media_url or not media_url.startswith(('http', '//')):
+ continue
+ media_type = str_or_none(media_file.get('type'))
+ if not media_type:
+ continue
+ media_kind = media_type.split('/')[0].lower()
+ f = {
+ 'url': media_url,
+ 'width': int_or_none(media_file.get('width')),
+ 'height': int_or_none(media_file.get('height')),
+ }
+ format_id = str_or_none(media_file.get('quality'))
+ if media_kind == 'image':
+ f['id'] = format_id
+ thumbnails.append(f)
+ elif media_kind in ('video', 'audio'):
+ is_audio = media_kind == 'audio'
+ f.update({
+ 'format_id': 'audio' if is_audio else format_id,
+ 'fps': int_or_none(media_file.get('fps')),
+ 'vcodec': 'none' if is_audio else None,
+ })
+ formats.append(f)
+ self._sort_formats(formats)
+
+ duration = int_or_none(data.get('duration'))
+
+ return {
+ 'id': video_id,
+ 'title': video_id,
+ 'duration': duration,
+ 'thumbnails': thumbnails,
+ 'formats': formats,
+ }
'skip': 'Geo-restricted to Hong Kong',
}]
+ _AREA_ID = {
+ 'HK': 1,
+ 'SG': 2,
+ 'TH': 4,
+ 'PH': 5,
+ }
+
def _real_extract(self, url):
country_code, video_id = re.match(self._VALID_URL, url).groups()
+ query = {
+ 'r': 'vod/ajax-detail',
+ 'platform_flag_label': 'web',
+ 'product_id': video_id,
+ }
+
+ area_id = self._AREA_ID.get(country_code.upper())
+ if area_id:
+ query['area_id'] = area_id
+
product_data = self._download_json(
'http://www.viu.com/ott/%s/index.php' % country_code, video_id,
- 'Downloading video info', query={
- 'r': 'vod/ajax-detail',
- 'platform_flag_label': 'web',
- 'product_id': video_id,
- })['data']
+ 'Downloading video info', query=query)['data']
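# Resulting info request (illustrative URL): for a Hong Kong video the
# query now also carries area_id=1, e.g.
#   http://www.viu.com/ott/hk/index.php?r=vod/ajax-detail&platform_flag_label=web&product_id=<video_id>&area_id=1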
video_data = product_data.get('current_product')
if not video_data:
'https://d1k2us671qcoau.cloudfront.net/distribute_web_%s.php' % country_code,
video_id, 'Downloading stream info', query={
'ccs_product_id': video_data['ccs_product_id'],
+ }, headers={
+ 'Referer': url,
+ 'Origin': re.search(r'https?://[^/]+', url).group(0),
})['data']['stream']
stream_sizes = stream_data.get('size', {})
int_or_none,
orderedSet,
remove_start,
+ str_or_none,
str_to_int,
unescapeHTML,
unified_timestamp,
+ url_or_none,
urlencode_postdata,
)
from .dailymotion import DailymotionIE
'ext': 'mp4',
'title': 'ProtivoGunz - Хуёвая песня',
'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*',
+ 'uploader_id': '-77521',
'duration': 195,
- 'timestamp': 1329060660,
+ 'timestamp': 1329049880,
'upload_date': '20120212',
- 'view_count': int,
},
},
{
'info_dict': {
'id': '165548505',
'ext': 'mp4',
- 'uploader': 'Tom Cruise',
'title': 'No name',
+ 'uploader': 'Tom Cruise',
+ 'uploader_id': '205387401',
'duration': 9,
- 'timestamp': 1374374880,
- 'upload_date': '20130721',
- 'view_count': int,
+ 'timestamp': 1374364108,
+ 'upload_date': '20130720',
}
},
{
'id': 'V3K4mi0SYkc',
'ext': 'webm',
'title': "DSWD Awards 'Children's Joy Foundation, Inc.' Certificate of Registration and License to Operate",
- 'description': 'md5:d9903938abdc74c738af77f527ca0596',
- 'duration': 178,
+ 'description': 'md5:bf9c26cfa4acdfb146362682edd3827a',
+ 'duration': 179,
'upload_date': '20130116',
- 'uploader': "Children's Joy Foundation",
+ 'uploader': "Children's Joy Foundation Inc.",
'uploader_id': 'thecjf',
'view_count': int,
},
'id': 'k3lz2cmXyRuJQSjGHUv',
'ext': 'mp4',
'title': 'md5:d52606645c20b0ddbb21655adaa4f56f',
+ # TODO: fix test by fixing dailymotion description extraction
'description': 'md5:c651358f03c56f1150b555c26d90a0fd',
'uploader': 'AniLibria.Tv',
'upload_date': '20160914',
'ext': 'mp4',
'title': 'S-Dance, репетиции к The way show',
'uploader': 'THE WAY SHOW | 17 апреля',
- 'timestamp': 1454870100,
+ 'uploader_id': '-110305615',
+ 'timestamp': 1454859345,
'upload_date': '20160207',
- 'view_count': int,
+ },
+ 'params': {
+ 'skip_download': True,
},
},
{
video_id = mobj.group('videoid')
if video_id:
- info_url = 'https://vk.com/al_video.php?act=show&al=1&module=video&video=%s' % video_id
+ info_url = 'https://vk.com/al_video.php?act=show_inline&al=1&video=' + video_id
# Some videos (removed?) can only be downloaded with list id specified
list_id = mobj.group('list_id')
if list_id:
r'<!>This video is no longer available, because its author has been blocked.':
'Video %s is no longer available, because its author has been blocked.',
+
+ r'<!>This video is no longer available, because it has been deleted.':
+ 'Video %s is no longer available, because it has been deleted.',
}
for error_re, error_msg in ERRORS.items():
if not data:
data = self._parse_json(
self._search_regex(
- r'<!json>\s*({.+?})\s*<!>', info_page, 'json', default='{}'),
+ [r'<!json>\s*({.+?})\s*<!>', r'<!json>\s*({.+})'],
+ info_page, 'json', default='{}'),
video_id)
if data:
data = data['player']['params'][0]
timestamp = unified_timestamp(self._html_search_regex(
r'class=["\']mv_info_date[^>]+>([^<]+)(?:<|from)', info_page,
- 'upload date', fatal=False))
+ 'upload date', default=None)) or int_or_none(data.get('date'))
view_count = str_to_int(self._search_regex(
r'class=["\']mv_views_count[^>]+>\s*([\d,.]+)',
formats = []
for format_id, format_url in data.items():
- if not isinstance(format_url, compat_str) or not format_url.startswith(('http', '//', 'rtmp')):
+ format_url = url_or_none(format_url)
+ if not format_url or not format_url.startswith(('http', '//', 'rtmp')):
continue
if (format_id.startswith(('url', 'cache')) or
format_id in ('extra_data', 'live_mp4', 'postlive_mp4')):
'title': title,
'thumbnail': data.get('jpg'),
'uploader': data.get('md_author'),
+ 'uploader_id': str_or_none(data.get('author_id')),
'duration': data.get('duration'),
'timestamp': timestamp,
'view_count': view_count,
+ 'like_count': int_or_none(data.get('liked')),
+ 'dislike_count': int_or_none(data.get('nolikes')),
'is_live': is_live,
}
video_id = self._match_id(url)
webpage = self._download_webpage(
- 'http://www.vlive.tv/video/%s' % video_id, video_id)
+ 'https://www.vlive.tv/video/%s' % video_id, video_id)
VIDEO_PARAMS_RE = r'\bvlive\.video\.init\(([^)]+)'
VIDEO_PARAMS_FIELD = 'video params'
def _live(self, video_id, webpage):
init_page = self._download_webpage(
- 'http://www.vlive.tv/video/init/view',
+ 'https://www.vlive.tv/video/init/view',
video_id, note='Downloading live webpage',
data=urlencode_postdata({'videoSeq': video_id}),
headers={
- 'Referer': 'http://www.vlive.tv/video/%s' % video_id,
+ 'Referer': 'https://www.vlive.tv/video/%s' % video_id,
'Content-Type': 'application/x-www-form-urlencoded'
})
class VRVIE(VRVBaseIE):
IE_NAME = 'vrv'
_VALID_URL = r'https?://(?:www\.)?vrv\.co/watch/(?P<id>[A-Z0-9]+)'
- _TEST = {
+ _TESTS = [{
'url': 'https://vrv.co/watch/GR9PNZ396/Hidden-America-with-Jonah-Ray:BOSTON-WHERE-THE-PAST-IS-THE-PRESENT',
'info_dict': {
'id': 'GR9PNZ396',
# m3u8 download
'skip_download': True,
},
- }
+ }]
+
+ def _extract_vrv_formats(self, url, video_id, stream_format, audio_lang, hardsub_lang):
+ if not url or stream_format not in ('hls', 'dash'):
+ return []
+ stream_id = hardsub_lang or audio_lang
+ format_id = '%s-%s' % (stream_format, stream_id)
+ if stream_format == 'hls':
+ adaptive_formats = self._extract_m3u8_formats(
+ url, video_id, 'mp4', m3u8_id=format_id,
+ note='Downloading %s m3u8 information' % stream_id,
+ fatal=False)
+ elif stream_format == 'dash':
+ adaptive_formats = self._extract_mpd_formats(
+ url, video_id, mpd_id=format_id,
+ note='Downloading %s MPD information' % stream_id,
+ fatal=False)
+ if audio_lang:
+ for f in adaptive_formats:
+ if f.get('acodec') != 'none':
+ f['language'] = audio_lang
+ return adaptive_formats
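# Hedged usage sketch of the factored-out helper (stream values
# illustrative); _real_extract below calls it the same way:
#
#   formats.extend(self._extract_vrv_formats(
#       stream.get('url'), video_id, 'hls',
#       audio_lang=audio_locale, hardsub_lang=stream.get('hardsub_locale')))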
def _real_extract(self, url):
video_id = self._match_id(url)
for stream_type, streams in streams_json.get('streams', {}).items():
if stream_type in ('adaptive_hls', 'adaptive_dash'):
for stream in streams.values():
- stream_url = stream.get('url')
- if not stream_url:
- continue
- stream_id = stream.get('hardsub_locale') or audio_locale
- format_id = '%s-%s' % (stream_type.split('_')[1], stream_id)
- if stream_type == 'adaptive_hls':
- adaptive_formats = self._extract_m3u8_formats(
- stream_url, video_id, 'mp4', m3u8_id=format_id,
- note='Downloading %s m3u8 information' % stream_id,
- fatal=False)
- else:
- adaptive_formats = self._extract_mpd_formats(
- stream_url, video_id, mpd_id=format_id,
- note='Downloading %s MPD information' % stream_id,
- fatal=False)
- if audio_locale:
- for f in adaptive_formats:
- if f.get('acodec') != 'none':
- f['language'] = audio_locale
- formats.extend(adaptive_formats)
+ formats.extend(self._extract_vrv_formats(
+ stream.get('url'), video_id, stream_type.split('_')[1],
+ audio_locale, stream.get('hardsub_locale')))
self._sort_formats(formats)
subtitles = {}
js_to_json,
strip_or_none,
try_get,
+ unescapeHTML,
unified_timestamp,
)
webpage = self._download_webpage(url, video_id)
- source = self._parse_json(
+ player_config = self._parse_json(
self._search_regex(
- r'(?s)source["\']?\s*:\s*({.+?})\s*[,}]', webpage, 'source',
- default='{}'),
- video_id, transform_source=js_to_json, fatal=False) or {}
+ r'data-player-conf=(["\'])(?P<data>{.+?})\1', webpage,
+ 'player config', default='{}', group='data'),
+ video_id, transform_source=unescapeHTML, fatal=False)
+
+ if not player_config:
+ player_config = self._parse_json(
+ self._search_regex(
+ r'playerConf\s*=\s*({.+?})\s*;', webpage, 'player config',
+ default='{}'),
+ video_id, transform_source=js_to_json, fatal=False) or {}
+
+ source = player_config.get('source') or {}
video_id = compat_str(source.get('videoId') or video_id)
import re
from .common import InfoExtractor
-from ..utils import int_or_none
+from ..utils import (
+ int_or_none,
+ orderedSet,
+)
class WebOfStoriesIE(InfoExtractor):
webpage = self._download_webpage(url, playlist_id)
entries = [
- self.url_result('http://www.webofstories.com/play/%s' % video_number, 'WebOfStories')
- for video_number in set(re.findall(r'href="/playAll/%s\?sId=(\d+)"' % playlist_id, webpage))
+ self.url_result(
+ 'http://www.webofstories.com/play/%s' % video_id,
+ 'WebOfStories', video_id=video_id)
+ for video_id in orderedSet(re.findall(r'\bid=["\']td_(\d+)', webpage))
]
title = self._search_regex(
(r'powerwatch\.pw', 'PowerWatch'),
(r'rapidvideo\.ws', 'Rapidvideo.ws'),
(r'thevideobee\.to', 'TheVideoBee'),
- (r'vidto\.me', 'Vidto'),
+ (r'vidto\.(?:me|se)', 'Vidto'),
(r'streamin\.to', 'Streamin.To'),
(r'xvidstage\.com', 'XVIDSTAGE'),
(r'vidabc\.com', 'Vid ABC'),
'only_matching': True,
}, {
'url': 'http://www.fastvideo.me/k8604r8nk8sn/FAST_FURIOUS_8_-_Trailer_italiano_ufficiale.mp4.html',
- 'only_matching': True
+ 'only_matching': True,
+ }, {
+ 'url': 'http://vidto.se/1tx1pf6t12cg.html',
+ 'only_matching': True,
}]
@staticmethod
parse_duration,
try_get,
unified_strdate,
+ url_or_none,
)
else:
format_url = format_item
filesize = None
- if not isinstance(format_url, compat_str):
+ format_url = url_or_none(format_url)
+ if not format_url:
continue
formats.append({
'format_id': '%s-%s' % (format_id, quality),
default='{}'),
video_id, fatal=False)
for format_id, format_url in sources.items():
- if not isinstance(format_url, compat_str):
+ format_url = url_or_none(format_url)
+ if not format_url:
continue
if format_url in format_urls:
continue
import re
from .common import InfoExtractor
-from ..compat import compat_str
from ..utils import (
ExtractorError,
int_or_none,
qualities,
unescapeHTML,
+ url_or_none,
)
formats = []
for format_id in QUALITIES:
is_hd = format_id == 'hd'
- format_url = playlist.get(
- 'file%s' % ('_hd' if is_hd else ''))
- if not format_url or not isinstance(format_url, compat_str):
+ format_url = url_or_none(playlist.get(
+ 'file%s' % ('_hd' if is_hd else '')))
+ if not format_url:
continue
formats.append({
'url': format_url,
import re
from .common import InfoExtractor
-from ..compat import compat_str
from ..utils import (
determine_ext,
int_or_none,
parse_duration,
+ url_or_none,
)
for encoding in encodings:
if not isinstance(encoding, dict):
continue
- format_url = encoding.get('filename')
- if not isinstance(format_url, compat_str):
+ format_url = url_or_none(encoding.get('filename'))
+ if not format_url:
continue
if determine_ext(format_url) == 'm3u8':
formats.extend(self._extract_m3u8_formats(
import re
from .common import InfoExtractor
-from ..compat import compat_str
from ..utils import (
int_or_none,
sanitized_Request,
str_to_int,
unescapeHTML,
unified_strdate,
+ url_or_none,
)
from ..aes import aes_decrypt_text
for definition in definitions:
if not isinstance(definition, dict):
continue
- video_url = definition.get('videoUrl')
- if isinstance(video_url, compat_str) and video_url:
+ video_url = url_or_none(definition.get('videoUrl'))
+ if video_url:
links.append(video_url)
# Fallback #1, this also contains extra low quality 180p format
--- /dev/null
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import urljoin
+
+
+class YourPornIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?yourporn\.sexy/post/(?P<id>[^/?#&.]+)'
+ _TEST = {
+ 'url': 'https://yourporn.sexy/post/57ffcb2e1179b.html',
+ 'md5': '6f8682b6464033d87acaa7a8ff0c092e',
+ 'info_dict': {
+ 'id': '57ffcb2e1179b',
+ 'ext': 'mp4',
+ 'title': 'md5:c9f43630bd968267672651ba905a7d35',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ video_url = urljoin(url, self._parse_json(
+ self._search_regex(
+ r'data-vnfo=(["\'])(?P<data>{.+?})\1', webpage, 'data info',
+ group='data'),
+ video_id)[video_id])
+
+ title = (self._search_regex(
+ r'<[^>]+\bclass=["\']PostEditTA[^>]+>([^<]+)', webpage, 'title',
+ default=None) or self._og_search_description(webpage)).strip()
+ thumbnail = self._og_search_thumbnail(webpage)
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ }
# If True it will raise an error if no login info is provided
_LOGIN_REQUIRED = False
- _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL)[0-9A-Za-z-_]{10,}'
+ _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)[0-9A-Za-z-_]{10,}'
def _set_language(self):
self._set_cookie(
warn('Unable to extract result entry')
return False
- tfa = try_get(res, lambda x: x[0][0], list)
- if tfa:
- tfa_str = try_get(tfa, lambda x: x[2], compat_str)
- if tfa_str == 'TWO_STEP_VERIFICATION':
+ login_challenge = try_get(res, lambda x: x[0][0], list)
+ if login_challenge:
+ challenge_str = try_get(login_challenge, lambda x: x[2], compat_str)
+ if challenge_str == 'TWO_STEP_VERIFICATION':
# SEND_SUCCESS - TFA code has been successfully sent to phone
# QUOTA_EXCEEDED - reached the limit of TFA codes
- status = try_get(tfa, lambda x: x[5], compat_str)
+ status = try_get(login_challenge, lambda x: x[5], compat_str)
if status == 'QUOTA_EXCEEDED':
warn('Exceeded the limit of TFA codes, try later')
return False
check_cookie_url = try_get(
tfa_results, lambda x: x[0][-1][2], compat_str)
+ else:
+ CHALLENGES = {
+ 'LOGIN_CHALLENGE': "This device isn't recognized. For your security, Google wants to make sure it's really you.",
+ 'USERNAME_RECOVERY': 'Please provide additional information to aid in the recovery process.',
+ 'REAUTH': "There is something unusual about your activity. For your security, Google wants to make sure it's really you.",
+ }
+ challenge = CHALLENGES.get(
+ challenge_str,
+ '%s returned error %s.' % (self.IE_NAME, challenge_str))
+ warn('%s\nGo to https://accounts.google.com/, login and solve a challenge.' % challenge)
+ return False
else:
check_cookie_url = try_get(res, lambda x: x[2], compat_str)
def _parse_sig_js(self, jscode):
funcname = self._search_regex(
(r'(["\'])signature\1\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
- r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\('),
+ r'\.sig\|\|(?P<sig>[a-zA-Z0-9$]+)\(',
+ r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*c\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\(',
+ r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?P<sig>[a-zA-Z0-9$]+)\('),
jscode, 'Initial JS player signature function name', group='sig')
jsi = JSInterpreter(jscode)
youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
)
(
- (?:PL|LL|EC|UU|FL|RD|UL|TL)?[0-9A-Za-z-_]{10,}
+ (?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)?[0-9A-Za-z-_]{10,}
# Top tracks; these can also include dots
|(?:MC)[\w\.]*
)
}, {
'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
'only_matching': True,
+ }, {
+ # music album playlist
+ 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
+ 'only_matching': True,
}]
def _real_initialize(self):
ExtractorError,
int_or_none,
try_get,
+ url_or_none,
urlencode_postdata,
)
for watch in watch_urls:
if not isinstance(watch, dict):
continue
- watch_url = watch.get('url')
- if not watch_url or not isinstance(watch_url, compat_str):
+ watch_url = url_or_none(watch.get('url'))
+ if not watch_url:
continue
format_id_list = [stream_type]
maxrate = watch.get('maxrate')
try_get,
unified_timestamp,
update_url_query,
+ url_or_none,
urljoin,
)
def _extract_subtitles(src):
subtitles = {}
for caption in try_get(src, lambda x: x['captions'], list) or []:
- subtitle_url = caption.get('uri')
- if subtitle_url and isinstance(subtitle_url, compat_str):
+ subtitle_url = url_or_none(caption.get('uri'))
+ if subtitle_url:
lang = caption.get('language', 'deu')
subtitles.setdefault(lang, []).append({
'url': subtitle_url,
return subtitles
def _extract_format(self, video_id, formats, format_urls, meta):
- format_url = meta.get('url')
- if not format_url or not isinstance(format_url, compat_str):
+ format_url = url_or_none(meta.get('url'))
+ if not format_url:
return
if format_url in format_urls:
return
content, lambda x: x['teaserImageRef']['layouts'], dict)
if layouts:
for layout_key, layout_url in layouts.items():
- if not isinstance(layout_url, compat_str):
+ layout_url = url_or_none(layout_url)
+ if not layout_url:
continue
thumbnail = {
'url': layout_url,
postproc.add_option(
'--prefer-avconv',
action='store_false', dest='prefer_ffmpeg',
- help='Prefer avconv over ffmpeg for running the postprocessors (default)')
+ help='Prefer avconv over ffmpeg for running the postprocessors')
postproc.add_option(
'--prefer-ffmpeg',
action='store_true', dest='prefer_ffmpeg',
- help='Prefer ffmpeg over avconv for running the postprocessors')
+ help='Prefer ffmpeg over avconv for running the postprocessors (default)')
postproc.add_option(
'--ffmpeg-location', '--avconv-location', metavar='PATH',
dest='ffmpeg_location',
def _determine_executables(self):
programs = ['avprobe', 'avconv', 'ffmpeg', 'ffprobe']
- prefer_ffmpeg = False
+ prefer_ffmpeg = True
self.basename = None
self.probe_basename = None
self._paths = None
self._versions = None
if self._downloader:
- prefer_ffmpeg = self._downloader.params.get('prefer_ffmpeg', False)
+ prefer_ffmpeg = self._downloader.params.get('prefer_ffmpeg', True)
location = self._downloader.params.get('ffmpeg_location')
if location is not None:
if not os.path.exists(location):
(p, get_exe_version(p, args=['-version'])) for p in programs)
self._paths = dict((p, p) for p in programs)
- if prefer_ffmpeg:
- prefs = ('ffmpeg', 'avconv')
- else:
+ if prefer_ffmpeg is False:
prefs = ('avconv', 'ffmpeg')
+ else:
+ prefs = ('ffmpeg', 'avconv')
for p in prefs:
if self._versions[p]:
self.basename = p
break
- if prefer_ffmpeg:
- prefs = ('ffprobe', 'avprobe')
- else:
+ if prefer_ffmpeg is False:
prefs = ('avprobe', 'ffprobe')
+ else:
+ prefs = ('ffprobe', 'avprobe')
for p in prefs:
if self._versions[p]:
self.probe_basename = p
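# Net effect of the flipped default (no --prefer-* flag given): detection
# now probes ('ffmpeg', 'avconv') and ('ffprobe', 'avprobe') in that
# order; --prefer-avconv restores the previous avconv-first behaviour.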
compat_os_name,
compat_parse_qs,
compat_shlex_quote,
- compat_socket_create_connection,
compat_str,
compat_struct_pack,
compat_struct_unpack,
compiled_regex_type = type(re.compile(''))
std_headers = {
- 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0 (Chrome)',
+ 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate',
])
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
+JSON_LD_RE = r'(?is)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
def preferredencoding():
kwargs['strict'] = True
hc = http_class(*args, **compat_kwargs(kwargs))
source_address = ydl_handler._params.get('source_address')
+
if source_address is not None:
+ # Works around _create_connection() from socket, which tries every
+ # address returned by getaddrinfo(), including IPv6 ones. Filter the
+ # getaddrinfo() result down to the IP version matching the
+ # source_address value.
+ # Based on the CPython socket.create_connection() function:
+ # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
+ def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
+ host, port = address
+ err = None
+ addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
+ af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
+ ip_addrs = [addr for addr in addrs if addr[0] == af]
+ if addrs and not ip_addrs:
+ ip_version = 'v4' if af == socket.AF_INET else 'v6'
+ raise socket.error(
+ "No remote IP%s addresses available for connect, can't use '%s' as source address"
+ % (ip_version, source_address[0]))
+ for res in ip_addrs:
+ af, socktype, proto, canonname, sa = res
+ sock = None
+ try:
+ sock = socket.socket(af, socktype, proto)
+ if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
+ sock.settimeout(timeout)
+ sock.bind(source_address)
+ sock.connect(sa)
+ err = None # Explicitly break reference cycle
+ return sock
+ except socket.error as _:
+ err = _
+ if sock is not None:
+ sock.close()
+ if err is not None:
+ raise err
+ else:
+ raise socket.error('getaddrinfo returns an empty list')
+ if hasattr(hc, '_create_connection'):
+ hc._create_connection = _create_connection
sa = (source_address, 0)
if hasattr(hc, 'source_address'): # Python 2.7+
hc.source_address = sa
else: # Python 2.6
def _hc_connect(self, *args, **kwargs):
- sock = compat_socket_create_connection(
+ sock = _create_connection(
(self.host, self.port), self.timeout, sa)
if is_https:
self.sock = ssl.wrap_socket(
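# A self-contained sketch (illustrative, not part of utils.py) of the same
# address-family filtering idea:
#
#   import socket
#
#   def filter_by_source_family(host, port, source_ip):
#       af = socket.AF_INET if '.' in source_ip else socket.AF_INET6
#       return [ai for ai in socket.getaddrinfo(
#           host, port, 0, socket.SOCK_STREAM) if ai[0] == af]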
return None if v is None else v.strip()
+def url_or_none(url):
+ if not url or not isinstance(url, compat_str):
+ return None
+ url = url.strip()
+ return url if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url) else None
+
+
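# Hedged examples for the new helper (values illustrative):
#
#   url_or_none('https://example.com/v.mp4')  ->  'https://example.com/v.mp4'
#   url_or_none('//example.com/v.mp4')        ->  '//example.com/v.mp4'
#   url_or_none(' rtmp://example.com/live ')  ->  'rtmp://example.com/live'
#   url_or_none('v.mp4')                      ->  None (no scheme or //)
#   url_or_none(None)                         ->  None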
def parse_duration(s):
if not isinstance(s, compat_basestring):
return None
def strip_jsonp(code):
return re.sub(
r'''(?sx)^
- (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]+)
+ (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
(?:\s*&&\s*(?P=func_name))?
\s*\(\s*(?P<callback_data>.*)\);?
\s*?(?://[^\n]*)*$''',
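# With func_name relaxed from + to *, the callback name may now be empty,
# so bare parenthesized payloads are unwrapped too (illustrative):
#
#   strip_jsonp('callback({"status": "ok"});')  ->  '{"status": "ok"}'
#   strip_jsonp('({"status": "ok"});')          ->  '{"status": "ok"}'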
vcodec, acodec = None, None
for full_codec in splited_codecs:
codec = full_codec.split('.')[0]
- if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1'):
+ if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av01'):
if not vcodec:
vcodec = full_codec
elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
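# e.g. (illustrative): parse_codecs('av01.0.05M.08,mp4a.40.2') now yields
# {'vcodec': 'av01.0.05M.08', 'acodec': 'mp4a.40.2'} instead of warning
# about an unknown codec.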
setattr(self, '%s_open' % type,
lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
meth(r, proxy, type))
- return compat_urllib_request.ProxyHandler.__init__(self, proxies)
+ compat_urllib_request.ProxyHandler.__init__(self, proxies)
def proxy_open(self, req, proxy, type):
req_proxy = req.headers.get('Ytdl-request-proxy')
from __future__ import unicode_literals
-__version__ = '2018.06.18'
+__version__ = '2018.09.10'