+version 2019.01.16
+
+Core
++ [test/helper] Add support for maxcount and count collection length checkers
+* [downloader/hls] Fix uplynk ad skipping (#18824)
+* [postprocessor/ffmpeg] Improve ffmpeg version parsing (#18813)
+
+Extractors
+* [youtube] Skip unsupported adaptive stream type (#18804)
++ [youtube] Extract DASH formats from player response (#18804)
+* [funimation] Fix extraction (#14089)
+* [skylinewebcams] Fix extraction (#18853)
++ [curiositystream] Add support for non-app URLs
++ [bitchute] Check formats (#18833)
+* [wistia] Extend URL regular expression (#18823)
++ [playplustv] Add support for playplus.com (#18789)
+
+
+version 2019.01.10
+
+Core
+* [extractor/common] Use episode name as title in _json_ld
++ [extractor/common] Add support for movies in _json_ld
+* [postprocessor/ffmpeg] Embed subtitles with non-standard language codes
+ (#18765)
++ [utils] Add language codes replaced in 1989 revision of ISO 639
+ to ISO639Utils (#18765)
+
+Extractors
+* [youtube] Extract live HLS URL from player response (#18799)
++ [outsidetv] Add support for outsidetv.com (#18774)
+* [jwplatform] Use JW Platform Delivery API V2 and add support for more URLs
++ [fox] Add support for National Geographic (#17985, #15333, #14698)
++ [playplustv] Add support for playplus.tv (#18789)
+* [globo] Set GLBID cookie manually (#17346)
++ [gaia] Add support for gaia.com (#14605)
+* [youporn] Fix title and description extraction (#18748)
++ [hungama] Add support for hungama.com (#17402, #18771)
+* [dtube] Fix extraction (#18741)
+* [tvnow] Fix and rework extractors and prepare for a switch to the new API
+ (#17245, #18499)
+* [carambatv:page] Fix extraction (#18739)
+
+
+version 2019.01.02
+
+Extractors
+* [discovery] Use geo verification headers (#17838)
++ [packtpub] Add support for subscription.packtpub.com (#18718)
+* [yourporn] Fix extraction (#18583)
++ [acast:channel] Add support for play.acast.com (#18587)
++ [extractors] Add missing age limits (#18621)
++ [rmcdecouverte] Add support for live stream
+* [rmcdecouverte] Bypass geo restriction
+* [rmcdecouverte] Update URL regular expression (#18595, #18697)
+* [manyvids] Fix extraction (#18604, #18614)
+* [bitchute] Fix extraction (#18567)
+
+
+version 2018.12.31
+
+Extractors
++ [bbc] Add support for another embed pattern (#18643)
++ [npo:live] Add support for npostart.nl (#18644)
+* [beeg] Fix extraction (#18610, #18626)
+* [youtube] Unescape HTML for series (#18641)
++ [youtube] Extract more format metadata
+* [youtube] Detect DRM protected videos (#1774)
+* [youtube] Relax HTML5 player regular expressions (#18465, #18466)
+* [youtube] Extend HTML5 player regular expression (#17516)
++ [liveleak] Add support for another embed type and restore original
+ format extraction
++ [crackle] Extract ISM and HTTP formats
++ [twitter] Pass Referer with card request (#18579)
+* [mediasite] Extend URL regular expression (#18558)
++ [lecturio] Add support for lecturio.de (#18562)
++ [discovery] Add support for Scripps Networks watch domains (#17947)
+
+
+version 2018.12.17
+
+Extractors
+* [ard:beta] Improve geo restricted videos extraction
+* [ard:beta] Fix subtitles extraction
+* [ard:beta] Improve extraction robustness
+* [ard:beta] Relax URL regular expression (#18441)
+* [acast] Add support for embed.acast.com and play.acast.com (#18483)
+* [iprima] Relax URL regular expression (#18515, #18540)
+* [vrv] Fix initial state extraction (#18553)
+* [youtube] Fix mark watched (#18546)
++ [safari] Add support for learning.oreilly.com (#18510)
+* [youtube] Fix multifeed extraction (#18531)
+* [lecturio] Improve subtitles extraction (#18488)
+* [uol] Fix format URL extraction (#18480)
++ [ard:mediathek] Add support for classic.ardmediathek.de (#18473)
+
+
+version 2018.12.09
+
+Core
+* [YoutubeDL] Keep session cookies in cookie file between runs
+* [YoutubeDL] Recognize session cookies with expired set to 0 (#12929)
+
+Extractors
++ [teachable] Add support for teachable platform sites (#5451, #18150, #18272)
++ [aenetworks] Add support for historyvault.com (#18460)
+* [imgur] Improve gallery and album detection and extraction (#9133, #16577,
+ #17223, #18404)
+* [iprima] Relax URL regular expression (#18453)
+* [hotstar] Fix video data extraction (#18386)
+* [ard:mediathek] Fix title and description extraction (#18349, #18371)
+* [xvideos] Switch to HTTPS (#18422, #18427)
++ [lecturio] Add support for lecturio.com (#18405)
++ [nrktv:series] Add support for extra materials
+* [nrktv:season,series] Fix extraction (#17159, #17258)
+* [nrktv] Relax URL regular expression (#18304, #18387)
+* [yourporn] Fix extraction (#18424, #18425)
+* [tbs] Fix info extraction (#18403)
++ [gamespot] Add support for review URLs
+
+
+version 2018.12.03
+
+Core
+* [utils] Fix random_birthday to generate existing dates only (#18284)
+
+Extractors
++ [tiktok] Add support for tiktok.com (#18108, #18135)
+* [pornhub] Use actual URL host for requests (#18359)
+* [lynda] Fix authentication (#18158, #18217)
+* [gfycat] Update API endpoint (#18333, #18343)
++ [hotstar] Add support for alternative app state layout (#18320)
+* [azmedien] Fix extraction (#18334, #18336)
++ [vimeo] Add support for VHX (Vimeo OTT) (#14835)
+* [joj] Fix extraction (#18280, #18281)
++ [wistia] Add support for fast.wistia.com (#18287)
+
+
+version 2018.11.23
+
+Core
++ [setup.py] Add more relevant classifiers
+
+Extractors
+* [mixcloud] Fallback to hardcoded decryption key (#18016)
+* [nbc:news] Fix article extraction (#16194)
+* [foxsports] Fix extraction (#17543)
+* [loc] Relax regular expression and improve formats extraction
++ [ciscolive] Add support for ciscolive.cisco.com (#17984)
+* [nzz] Relax kaltura regex (#18228)
+* [sixplay] Fix formats extraction
+* [bitchute] Improve title extraction
+* [kaltura] Limit requested MediaEntry fields
++ [americastestkitchen] Add support for zype embeds (#18225)
++ [pornhub] Add pornhub.net alias
+* [nova:embed] Fix extraction (#18222)
+
+
+version 2018.11.18
+
+Extractors
++ [wwe] Extract subtitles
++ [wwe] Add support for playlists (#14781)
++ [wwe] Add support for wwe.com (#14781, #17450)
+* [vk] Detect geo restriction (#17767)
+* [openload] Use original host during extraction (#18211)
+* [atvat] Fix extraction (#18041)
++ [rte] Add support for new API endpoint (#18206)
+* [tnaflixnetwork:embed] Fix extraction (#18205)
+* [picarto] Use API and add token support (#16518)
++ [zype] Add support for player.zype.com (#18143)
+* [vivo] Fix extraction (#18139)
+* [ruutu] Update API endpoint (#18138)
+
+
version 2018.11.07
Extractors
**tl;dr:** [navigate me to examples](#output-template-examples).
-The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dl -o funny_video.flv "https://some/video"`. However, it may contain special sequences that will be replaced when downloading each video. The special sequences may be formatted according to [python string formatting operations](https://docs.python.org/2/library/stdtypes.html#string-formatting). For example, `%(NAME)s` or `%(NAME)05d`. To clarify, that is a percent symbol followed by a name in parentheses, followed by a formatting operations. Allowed names along with sequence type are:
+The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dl -o funny_video.flv "https://some/video"`. However, it may contain special sequences that will be replaced when downloading each video. The special sequences may be formatted according to [python string formatting operations](https://docs.python.org/2/library/stdtypes.html#string-formatting). For example, `%(NAME)s` or `%(NAME)05d`. To clarify, that is a percent symbol followed by a name in parentheses, followed by formatting operations. Allowed names along with sequence type are:
- `id` (string): Video identifier
- `title` (string): Video title
```
5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/extractors.py).
6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with `only_matching` key in test's dict are not counted in.
-7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L74-L252). Add tests and code for as many as you want.
-8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](https://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+.
-9. When the tests pass, [add](https://git-scm.com/docs/git-add) the new files and [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this:
+7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303). Add tests and code for as many as you want.
+8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](http://flake8.pycqa.org/en/latest/index.html#quickstart):
+
+ $ flake8 youtube_dl/extractor/yourextractor.py
+
+9. Make sure your code works under all [Python](https://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+.
+10. When the tests pass, [add](https://git-scm.com/docs/git-add) the new files and [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this:
$ git add youtube_dl/extractor/extractors.py
$ git add youtube_dl/extractor/yourextractor.py
$ git commit -m '[yourextractor] Add new extractor'
$ git push origin yourextractor
-10. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it.
+11. Finally, [create a pull request](https://help.github.com/articles/creating-a-pull-request). We'll then review and merge it.
In any case, thank you very much for your contributions!
### Mandatory and optional metafields
-For extraction to work youtube-dl relies on metadata your extractor extracts and provides to youtube-dl expressed by an [information dictionary](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L75-L257) or simply *info dict*. Only the following meta fields in the *info dict* are considered mandatory for a successful extraction process by youtube-dl:
+For extraction to work youtube-dl relies on metadata your extractor extracts and provides to youtube-dl expressed by an [information dictionary](https://github.com/rg3/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303) or simply *info dict*. Only the following meta fields in the *info dict* are considered mandatory for a successful extraction process by youtube-dl:
- `id` (media identifier)
- `title` (media title)
In fact only the last option is technically mandatory (i.e. if you can't figure out the download location of the media the extraction does not make any sense). But by convention youtube-dl also treats `id` and `title` as mandatory. Thus the aforementioned metafields are the critical data that the extraction does not make any sense without and if any of them fail to be extracted then the extractor is considered completely broken.
-[Any field](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L149-L257) apart from the aforementioned ones are considered **optional**. That means that extraction should be **tolerant** to situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields.
+[Any field](https://github.com/rg3/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L188-L303) apart from the aforementioned ones are considered **optional**. That means that extraction should be **tolerant** to situations when sources for these fields can potentially be unavailable (even if they are always available at the moment) and **future-proof** in order not to break the extraction of general purpose mandatory fields.
#### Example
This code will try to extract from `meta` first and if it fails it will try extracting `og:title` from a `webpage`.
-### Make regular expressions flexible
+### Regular expressions
+
+#### Don't capture groups you don't use
+
+A capturing group must be an indication that the group is used somewhere in the code. Any group that is not used must be non-capturing.
+
+##### Example
+
+Don't capture the `id` attribute name here, since you can't use it for anything anyway.
+
+Correct:
+
+```python
+r'(?:id|ID)=(?P<id>\d+)'
+```
+
+Incorrect:
+
+```python
+r'(id|ID)=(?P<id>\d+)'
+```
+
-When using regular expressions try to write them fuzzy and flexible.
+#### Make regular expressions relaxed and flexible
+
+When using regular expressions try to write them fuzzy, relaxed and flexible, skipping insignificant parts that are more likely to change, allowing both single and double quotes for quoted values and so on.
-#### Example
+##### Example
Say you need to extract `title` from the following HTML code:
webpage, 'title', group='title')
```
+### Long lines policy
+
+There is a soft limit to keep lines of code under 80 characters long. This means the limit should be respected if possible, as long as doing so does not make readability and code maintenance worse.
+
+For example, you should **never** split long string literals, like URLs or other often-copied entities, over multiple lines to fit this limit:
+
+Correct:
+
+```python
+'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4'
+```
+
+Incorrect:
+
+```python
+'https://www.youtube.com/watch?v=FqZTN594JQw&list='
+'PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4'
+```
+
### Use safe conversion functions
Wrap all extracted numeric data into safe functions from [`youtube_dl/utils.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/utils.py): `int_or_none`, `float_or_none`. Use them for string to number conversions as well.
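+
+A minimal sketch of how these helpers behave (assuming the current signatures in `youtube_dl/utils.py`, where the optional `scale` argument divides the value):
+
+```python
+from youtube_dl.utils import int_or_none, float_or_none
+
+meta = {'duration': '211', 'duration_ms': 1274000}
+
+duration = int_or_none(meta.get('duration'))   # '211' -> 211
+views = int_or_none(meta.get('view_count'))    # missing value -> None, no crash
+# scale divides, which is handy for unit conversion (ms -> s)
+duration_s = float_or_none(meta.get('duration_ms'), scale=1000)  # -> 1274.0
+```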
downloading each video. The special sequences may be formatted according
to python string formatting operations. For example, %(NAME)s or
%(NAME)05d. To clarify, that is a percent symbol followed by a name in
-parentheses, followed by a formatting operations. Allowed names along
-with sequence type are:
+parentheses, followed by formatting operations. Allowed names along with
+sequence type are:
- id (string): Video identifier
- title (string): Video title
methods and a detailed description of what your extractor should and
may return. Add tests and code for as many as you want.
8. Make sure your code follows youtube-dl coding conventions and check
- the code with flake8. Also make sure your code works under all
- Python versions claimed supported by youtube-dl, namely 2.6, 2.7,
- and 3.2+.
-9. When the tests pass, add the new files and commit them and push the
+ the code with flake8:
+
+ $ flake8 youtube_dl/extractor/yourextractor.py
+
+9. Make sure your code works under all Python versions claimed
+ supported by youtube-dl, namely 2.6, 2.7, and 3.2+.
+10. When the tests pass, add the new files and commit them and push the
result, like this:
- $ git add youtube_dl/extractor/extractors.py
- $ git add youtube_dl/extractor/yourextractor.py
- $ git commit -m '[yourextractor] Add new extractor'
- $ git push origin yourextractor
+ $ git add youtube_dl/extractor/extractors.py
+ $ git add youtube_dl/extractor/yourextractor.py
+ $ git commit -m '[yourextractor] Add new extractor'
+ $ git push origin yourextractor
-10. Finally, create a pull request. We'll then review and merge it.
+11. Finally, create a pull request. We'll then review and merge it.
In any case, thank you very much for your contributions!
This code will try to extract from meta first and if it fails it will
try extracting og:title from a webpage.
-Make regular expressions flexible
+Regular expressions
+
+Don't capture groups you don't use
-When using regular expressions try to write them fuzzy and flexible.
+A capturing group must be an indication that the group is used somewhere
+in the code. Any group that is not used must be non-capturing.
+
+Example
+
+Don't capture the id attribute name here, since you can't use it for
+anything anyway.
+
+Correct:
+
+ r'(?:id|ID)=(?P<id>\d+)'
+
+Incorrect:
+
+ r'(id|ID)=(?P<id>\d+)'
+
+Make regular expressions relaxed and flexible
+
+When using regular expressions try to write them fuzzy, relaxed and
+flexible, skipping insignificant parts that are more likely to change,
+allowing both single and double quotes for quoted values and so on.
Example
r'<span style="position: absolute; left: 910px; width: 90px; float: right; z-index: 9999;" class="title">(.*?)</span>',
webpage, 'title', group='title')
+Long lines policy
+
+There is a soft limit to keep lines of code under 80 characters long.
+This means the limit should be respected if possible, as long as doing
+so does not make readability and code maintenance worse.
+
+For example, you should NEVER split long string literals, like URLs or
+other often-copied entities, over multiple lines to fit this limit:
+
+Correct:
+
+ 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4'
+
+Incorrect:
+
+ 'https://www.youtube.com/watch?v=FqZTN594JQw&list='
+ 'PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4'
+
Use safe conversion functions
Wrap all extracted numeric data into safe functions from
- **AdobeTVShow**
- **AdobeTVVideo**
- **AdultSwim**
- - **aenetworks**: A+E Networks: A&E, Lifetime, History.com, FYI Network
+ - **aenetworks**: A+E Networks: A&E, Lifetime, History.com, FYI Network and History Vault
- **afreecatv**: afreecatv.com
- **AirMozilla**
- **AliExpressLive**
- **chirbit**
- **chirbit:profile**
- **Cinchcast**
+ - **CiscoLiveSearch**
+ - **CiscoLiveSession**
- **CJSW**
- **cliphunter**
- **Clippit**
- **Fusion**
- **Fux**
- **FXNetworks**
+ - **Gaia**
- **GameInformer**
- **GameOne**
- **gameone:playlist**
- **HRTiPlaylist**
- **Huajiao**: 花椒直播
- **HuffPost**: Huffington Post
+ - **Hungama**
+ - **HungamaSong**
- **Hypem**
- **Iconosquare**
- **ign.com**
- **imdb**: Internet Movie Database trailers
- **imdb:list**: Internet Movie Database lists
- **Imgur**
- - **ImgurAlbum**
+ - **imgur:album**
+ - **imgur:gallery**
- **Ina**
- **Inc**
- **IndavideoEmbed**
- **Le**: 乐视网
- **Learnr**
- **Lecture2Go**
+ - **Lecturio**
+ - **LecturioCourse**
+ - **LecturioDeCourse**
- **LEGO**
- **Lemonde**
- **Lenta**
- **MyviEmbed**
- **MyVisionTV**
- **n-tv.de**
- - **natgeo**
- - **natgeo:episodeguide**
- **natgeo:video**
- **Naver**
- **NBA**
- **orf:oe1**: Radio Österreich 1
- **orf:tvthek**: ORF TVthek
- **OsnatelTV**
+ - **OutsideTV**
- **PacktPub**
- **PacktPubCourse**
- **PandaTV**: 熊猫TV
- **Pinkbike**
- **Pladform**
- **play.fm**
+ - **PlayPlusTV**
- **PlaysTV**
- **Playtvak**: Playtvak.cz, iDNES.cz and Lidovky.cz
- **Playvid**
- **TastyTrade**
- **TBS**
- **TDSLifeway**
+ - **Teachable**
+ - **TeachableCourse**
- **teachertube**: teachertube.com videos
- **teachertube:user:collection**: teachertube.com user and collection videos
- **TeachingChannel**
- **ThisAmericanLife**
- **ThisAV**
- **ThisOldHouse**
+ - **TikTok**
+ - **TikTokUser**
- **tinypic**: tinypic.com videos
- **TMZ**
- **TMZArticle**
- **TVNet**
- **TVNoe**
- **TVNow**
- - **TVNowList**
+ - **TVNowAnnual**
+ - **TVNowNew**
+ - **TVNowSeason**
- **TVNowShow**
- **tvp**: Telewizja Polska
- **tvp:embed**: Telewizja Polska
- **uol.com.br**
- **uplynk**
- **uplynk:preplay**
- - **Upskill**
- - **UpskillCourse**
- **Urort**: NRK P3 Urørt
- **URPlay**
- **USANetwork**
- **VevoPlaylist**
- **VGTV**: VGTV, BTTV, FTV, Aftenposten and Aftonbladet
- **vh1.com**
+ - **vhx:embed**
- **Viafree**
- **vice**
- **vice:article**
- **wrzuta.pl:playlist**
- **WSJ**: Wall Street Journal
- **WSJArticle**
+ - **WWE**
- **XBef**
- **XboxClips**
- **XFileShare**: XFileShare based sites: DaClips, FileHoot, GorillaVid, MovPod, PowerWatch, Rapidvideo.ws, TheVideoBee, Vidto, Streamin.To, XVIDSTAGE, Vid ABC, VidBom, vidlo, RapidVideo.TV, FastVideo.me
- **ZDF**
- **ZDFChannel**
- **zingmp3**: mp3.zing.vn
+ - **Zype**
'Development Status :: 5 - Production/Stable',
'Environment :: Console',
'License :: Public Domain',
+ 'Programming Language :: Python',
+ 'Programming Language :: Python :: 2',
'Programming Language :: Python :: 2.6',
'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3',
'Programming Language :: Python :: 3.4',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
+ 'Programming Language :: Python :: 3.7',
+ 'Programming Language :: Python :: 3.8',
+ 'Programming Language :: Python :: Implementation',
+ 'Programming Language :: Python :: Implementation :: CPython',
+ 'Programming Language :: Python :: Implementation :: IronPython',
+ 'Programming Language :: Python :: Implementation :: Jython',
+ 'Programming Language :: Python :: Implementation :: PyPy',
],
cmdclass={'build_lazy_extractors': build_lazy_extractors},
isinstance(got, compat_str),
'Expected field %s to be a unicode object, but got value %r of type %r' % (field, got, type(got)))
got = 'md5:' + md5(got)
- elif isinstance(expected, compat_str) and expected.startswith('mincount:'):
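+    # expected values like 'mincount:5', 'maxcount:10' or 'count:3' assert
+    # the size of a list/dict field, e.g. (hypothetical test info_dict
+    # entry) 'tags': 'mincount:5'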
+ elif isinstance(expected, compat_str) and re.match(r'^(?:min|max)?count:\d+', expected):
self.assertTrue(
isinstance(got, (list, dict)),
'Expected field %s to be a list or a dict, but it is of type %s' % (
field, type(got).__name__))
- expected_num = int(expected.partition(':')[2])
- assertGreaterEqual(
+ op, _, expected_num = expected.partition(':')
+ expected_num = int(expected_num)
+ if op == 'mincount':
+ assert_func = assertGreaterEqual
+ msg_tmpl = 'Expected %d items in field %s, but only got %d'
+ elif op == 'maxcount':
+ assert_func = assertLessEqual
+ msg_tmpl = 'Expected maximum %d items in field %s, but got %d'
+ elif op == 'count':
+ assert_func = assertEqual
+ msg_tmpl = 'Expected exactly %d items in field %s, but got %d'
+ else:
+ assert False
+ assert_func(
self, len(got), expected_num,
- 'Expected %d items in field %s, but only got %d' % (expected_num, field, len(got)))
+ msg_tmpl % (expected_num, field, len(got)))
return
self.assertEqual(
expected, got,
self.assertTrue(got >= expected, msg)
+def assertLessEqual(self, got, expected, msg=None):
+ if not (got <= expected):
+ if msg is None:
+ msg = '%r not less than or equal to %r' % (got, expected)
+ self.assertTrue(got <= expected, msg)
+
+
+def assertEqual(self, got, expected, msg=None):
+ if not (got == expected):
+ if msg is None:
+ msg = '%r not equal to %r' % (got, expected)
+ self.assertTrue(got == expected, msg)
+
+
def expect_warnings(ydl, warnings_re):
real_warning = ydl.report_warning
--- /dev/null
+#!/usr/bin/env python
+# coding: utf-8
+
+from __future__ import unicode_literals
+
+import os
+import re
+import sys
+import tempfile
+import unittest
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from youtube_dl.utils import YoutubeDLCookieJar
+
+
+class TestYoutubeDLCookieJar(unittest.TestCase):
+ def test_keep_session_cookies(self):
+ cookiejar = YoutubeDLCookieJar('./test/testdata/cookies/session_cookies.txt')
+ cookiejar.load(ignore_discard=True, ignore_expires=True)
+ tf = tempfile.NamedTemporaryFile(delete=False)
+ try:
+ cookiejar.save(filename=tf.name, ignore_discard=True, ignore_expires=True)
+ temp = tf.read().decode('utf-8')
+ self.assertTrue(re.search(
+ r'www\.foobar\.foobar\s+FALSE\s+/\s+TRUE\s+0\s+YoutubeDLExpiresEmpty\s+YoutubeDLExpiresEmptyValue', temp))
+ self.assertTrue(re.search(
+ r'www\.foobar\.foobar\s+FALSE\s+/\s+TRUE\s+0\s+YoutubeDLExpires0\s+YoutubeDLExpires0Value', temp))
+ finally:
+ tf.close()
+ os.remove(tf.name)
+
+
+if __name__ == '__main__':
+ unittest.main()
def test_compat_expanduser(self):
old_home = os.environ.get('HOME')
- test_str = 'C:\Documents and Settings\тест\Application Data'
+ test_str = r'C:\Documents and Settings\тест\Application Data'
compat_setenv('HOME', test_str)
self.assertEqual(compat_expanduser('~'), test_str)
compat_setenv('HOME', old_home or '')
class TestMetadataFromTitle(unittest.TestCase):
def test_format_to_regex(self):
pp = MetadataFromTitlePP(None, '%(title)s - %(artist)s')
- self.assertEqual(pp._titleregex, '(?P<title>.+)\ \-\ (?P<artist>.+)')
+ self.assertEqual(pp._titleregex, r'(?P<title>.+)\ \-\ (?P<artist>.+)')
--- /dev/null
+# Netscape HTTP Cookie File
+# http://curl.haxx.se/rfc/cookie_spec.html
+# This is a generated file! Do not edit.
+
+www.foobar.foobar FALSE / TRUE YoutubeDLExpiresEmpty YoutubeDLExpiresEmptyValue
+www.foobar.foobar FALSE / TRUE 0 YoutubeDLExpires0 YoutubeDLExpires0Value
operations (https://docs.python.org/2/library/stdtypes.html#string-formatting).
For example, \f[C]%(NAME)s\f[] or \f[C]%(NAME)05d\f[].
To clarify, that is a percent symbol followed by a name in parentheses,
-followed by a formatting operations.
+followed by formatting operations.
Allowed names along with sequence type are:
.IP \[bu] 2
\f[C]id\f[] (string): Video identifier
\f[C]youtube_dl/extractor/common.py\f[] (https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py)
for possible helper methods and a detailed description of what your
extractor should and may
-return (https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L74-L252).
+return (https://github.com/rg3/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303).
Add tests and code for as many as you want.
.IP " 8." 4
Make sure your code follows youtube\-dl coding conventions and check the
-code with flake8 (https://pypi.python.org/pypi/flake8).
-Also make sure your code works under all
-Python (https://www.python.org/) versions claimed supported by
-youtube\-dl, namely 2.6, 2.7, and 3.2+.
+code with
+flake8 (http://flake8.pycqa.org/en/latest/index.html#quickstart):
+.RS 4
+.IP
+.nf
+\f[C]
+\ $\ flake8\ youtube_dl/extractor/yourextractor.py
+\f[]
+.fi
+.RE
.IP " 9." 4
+Make sure your code works under all Python (https://www.python.org/)
+versions claimed supported by youtube\-dl, namely 2.6, 2.7, and 3.2+.
+.IP "10." 4
When the tests pass, add (https://git-scm.com/docs/git-add) the new
files and commit (https://git-scm.com/docs/git-commit) them and
push (https://git-scm.com/docs/git-push) the result, like this:
.IP
.nf
\f[C]
-\ $\ git\ add\ youtube_dl/extractor/extractors.py
-\ $\ git\ add\ youtube_dl/extractor/yourextractor.py
-\ $\ git\ commit\ \-m\ \[aq][yourextractor]\ Add\ new\ extractor\[aq]
-\ $\ git\ push\ origin\ yourextractor
+$\ git\ add\ youtube_dl/extractor/extractors.py
+$\ git\ add\ youtube_dl/extractor/yourextractor.py
+$\ git\ commit\ \-m\ \[aq][yourextractor]\ Add\ new\ extractor\[aq]
+$\ git\ push\ origin\ yourextractor
\f[]
.fi
.RE
-.IP "10." 4
+.IP "11." 4
Finally, create a pull
request (https://help.github.com/articles/creating-a-pull-request).
We\[aq]ll then review and merge it.
.PP
For extraction to work youtube\-dl relies on metadata your extractor
extracts and provides to youtube\-dl expressed by an information
-dictionary (https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L75-L257)
+dictionary (https://github.com/rg3/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L94-L303)
or simply \f[I]info dict\f[].
Only the following meta fields in the \f[I]info dict\f[] are considered
mandatory for a successful extraction process by youtube\-dl:
extracted then the extractor is considered completely broken.
.PP
Any
-field (https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L149-L257)
+field (https://github.com/rg3/youtube-dl/blob/7f41a598b3fba1bcab2817de64a08941200aa3c8/youtube_dl/extractor/common.py#L188-L303)
apart from the aforementioned ones are considered \f[B]optional\f[].
That means that extraction should be \f[B]tolerant\f[] to situations
when sources for these fields can potentially be unavailable (even if
.PP
This code will try to extract from \f[C]meta\f[] first and if it fails
it will try extracting \f[C]og:title\f[] from a \f[C]webpage\f[].
-.SS Make regular expressions flexible
+.SS Regular expressions
+.SS Don\[aq]t capture groups you don\[aq]t use
+.PP
+A capturing group must be an indication that the group is used somewhere
+in the code.
+Any group that is not used must be non\-capturing.
+.SS Example
+.PP
+Don\[aq]t capture the id attribute name here, since you can\[aq]t use it
+for anything anyway.
+.PP
+Correct:
+.IP
+.nf
+\f[C]
+r\[aq](?:id|ID)=(?P<id>\\d+)\[aq]
+\f[]
+.fi
+.PP
+Incorrect:
+.IP
+.nf
+\f[C]
+r\[aq](id|ID)=(?P<id>\\d+)\[aq]
+\f[]
+.fi
+.SS Make regular expressions relaxed and flexible
.PP
-When using regular expressions try to write them fuzzy and flexible.
+When using regular expressions try to write them fuzzy, relaxed and
+flexible, skipping insignificant parts that are more likely to change,
+allowing both single and double quotes for quoted values and so on.
.SS Example
.PP
Say you need to extract \f[C]title\f[] from the following HTML code:
\ \ \ \ webpage,\ \[aq]title\[aq],\ group=\[aq]title\[aq])
\f[]
.fi
+.SS Long lines policy
+.PP
+There is a soft limit to keep lines of code under 80 characters long.
+This means the limit should be respected if possible, as long as doing so
+does not make readability and code maintenance worse.
+.PP
+For example, you should \f[B]never\f[] split long string literals, like
+URLs or other often\-copied entities, over multiple lines to fit this
+limit:
+.PP
+Correct:
+.IP
+.nf
+\f[C]
+\[aq]https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4\[aq]
+\f[]
+.fi
+.PP
+Incorrect:
+.IP
+.nf
+\f[C]
+\[aq]https://www.youtube.com/watch?v=FqZTN594JQw&list=\[aq]
+\[aq]PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4\[aq]
+\f[]
+.fi
.SS Use safe conversion functions
.PP
Wrap all extracted numeric data into safe functions from
version_tuple,
write_json_file,
write_string,
+ YoutubeDLCookieJar,
YoutubeDLCookieProcessor,
YoutubeDLHandler,
)
self.restore_console_title()
if self.params.get('cookiefile') is not None:
- self.cookiejar.save()
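+        # ignore_discard keeps session cookies and ignore_expires keeps
+        # cookies with expired set to 0, so both survive between runs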
+ self.cookiejar.save(ignore_discard=True, ignore_expires=True)
def trouble(self, message=None, tb=None):
"""Determine action to take when a download problem appears.
self.cookiejar = compat_cookiejar.CookieJar()
else:
opts_cookiefile = expand_path(opts_cookiefile)
- self.cookiejar = compat_cookiejar.MozillaCookieJar(
- opts_cookiefile)
+ self.cookiejar = YoutubeDLCookieJar(opts_cookiefile)
if os.access(opts_cookiefile, os.R_OK):
- self.cookiejar.load()
+ self.cookiejar.load(ignore_discard=True, ignore_expires=True)
cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
if opts_proxy is not None:
fd.add_progress_hook(ph)
return fd.real_download(filename, info_dict)
- def is_ad_fragment(s):
+ def is_ad_fragment_start(s):
return (s.startswith('#ANVATO-SEGMENT-INFO') and 'type=ad' in s or
s.startswith('#UPLYNK-SEGMENT') and s.endswith(',ad'))
+ def is_ad_fragment_end(s):
+ return (s.startswith('#ANVATO-SEGMENT-INFO') and 'type=master' in s or
+ s.startswith('#UPLYNK-SEGMENT') and s.endswith(',segment'))
+
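+    # an ad break spans every fragment between an ad start tag and the
+    # matching end tag, so ad_frag_next stays set until the end tag is seen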
media_frags = 0
ad_frags = 0
ad_frag_next = False
if not line:
continue
if line.startswith('#'):
- if is_ad_fragment(line):
- ad_frags += 1
+ if is_ad_fragment_start(line):
ad_frag_next = True
+ elif is_ad_fragment_end(line):
+ ad_frag_next = False
continue
if ad_frag_next:
- ad_frag_next = False
+ ad_frags += 1
continue
media_frags += 1
if line:
if not line.startswith('#'):
if ad_frag_next:
- ad_frag_next = False
continue
frag_index += 1
if frag_index <= ctx['fragment_index']:
'start': sub_range_start,
'end': sub_range_start + int(splitted_byte_range[0]),
}
- elif is_ad_fragment(line):
+ elif is_ad_fragment_start(line):
ad_frag_next = True
+ elif is_ad_fragment_end(line):
+ ad_frag_next = False
self._finish_frag_download(ctx)
class ACastIE(InfoExtractor):
IE_NAME = 'acast'
- _VALID_URL = r'https?://(?:www\.)?acast\.com/(?P<channel>[^/]+)/(?P<id>[^/#?]+)'
+ _VALID_URL = r'''(?x)
+ https?://
+ (?:
+ (?:(?:embed|www)\.)?acast\.com/|
+ play\.acast\.com/s/
+ )
+ (?P<channel>[^/]+)/(?P<id>[^/#?]+)
+ '''
_TESTS = [{
- # test with one bling
- 'url': 'https://www.acast.com/condenasttraveler/-where-are-you-taipei-101-taiwan',
- 'md5': 'ada3de5a1e3a2a381327d749854788bb',
- 'info_dict': {
- 'id': '57de3baa-4bb0-487e-9418-2692c1277a34',
- 'ext': 'mp3',
- 'title': '"Where Are You?": Taipei 101, Taiwan',
- 'description': 'md5:a0b4ef3634e63866b542e5b1199a1a0e',
- 'timestamp': 1196172000,
- 'upload_date': '20071127',
- 'duration': 211,
- 'creator': 'Concierge',
- 'series': 'Condé Nast Traveler Podcast',
- 'episode': '"Where Are You?": Taipei 101, Taiwan',
- }
- }, {
- # test with multiple blings
'url': 'https://www.acast.com/sparpodcast/2.raggarmordet-rosterurdetforflutna',
'md5': 'a02393c74f3bdb1801c3ec2695577ce0',
'info_dict': {
'series': 'Spår',
'episode': '2. Raggarmordet - Röster ur det förflutna',
}
+ }, {
+ 'url': 'http://embed.acast.com/adambuxton/ep.12-adam-joeschristmaspodcast2015',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://play.acast.com/s/rattegangspodden/s04e09-styckmordet-i-helenelund-del-22',
+ 'only_matching': True,
}]
def _real_extract(self, url):
class ACastChannelIE(InfoExtractor):
IE_NAME = 'acast:channel'
- _VALID_URL = r'https?://(?:www\.)?acast\.com/(?P<id>[^/#?]+)'
- _TEST = {
- 'url': 'https://www.acast.com/condenasttraveler',
+ _VALID_URL = r'''(?x)
+ https?://
+ (?:
+ (?:www\.)?acast\.com/|
+ play\.acast\.com/s/
+ )
+ (?P<id>[^/#?]+)
+ '''
+ _TESTS = [{
+ 'url': 'https://www.acast.com/todayinfocus',
'info_dict': {
- 'id': '50544219-29bb-499e-a083-6087f4cb7797',
- 'title': 'Condé Nast Traveler Podcast',
- 'description': 'md5:98646dee22a5b386626ae31866638fbd',
+ 'id': '4efc5294-5385-4847-98bd-519799ce5786',
+ 'title': 'Today in Focus',
+ 'description': 'md5:9ba5564de5ce897faeb12963f4537a64',
},
- 'playlist_mincount': 20,
- }
- _API_BASE_URL = 'https://www.acast.com/api/'
+ 'playlist_mincount': 35,
+ }, {
+ 'url': 'http://play.acast.com/s/ft-banking-weekly',
+ 'only_matching': True,
+ }]
+ _API_BASE_URL = 'https://play.acast.com/api/'
_PAGE_SIZE = 10
@classmethod
channel_slug, note='Download page %d of channel data' % page)
for cast in casts:
yield self.url_result(
- 'https://www.acast.com/%s/%s' % (channel_slug, cast['url']),
+ 'https://play.acast.com/s/%s/%s' % (channel_slug, cast['url']),
'ACast', cast['id'])
def _real_extract(self, url):
class AENetworksIE(AENetworksBaseIE):
IE_NAME = 'aenetworks'
- IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network'
+ IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network and History Vault'
_VALID_URL = r'''(?x)
https?://
(?:www\.)?
(?P<domain>
- (?:history|aetv|mylifetime|lifetimemovieclub)\.com|
+ (?:history(?:vault)?|aetv|mylifetime|lifetimemovieclub)\.com|
fyi\.tv
)/
(?:
shows/(?P<show_path>[^/]+(?:/[^/]+){0,2})|
movies/(?P<movie_display_id>[^/]+)(?:/full-movie)?|
- specials/(?P<special_display_id>[^/]+)/full-special
+ specials/(?P<special_display_id>[^/]+)/full-special|
+ collections/[^/]+/(?P<collection_display_id>[^/]+)
)
'''
_TESTS = [{
}, {
'url': 'http://www.history.com/specials/sniper-into-the-kill-zone/full-special',
'only_matching': True
+ }, {
+ 'url': 'https://www.historyvault.com/collections/america-the-story-of-us/westward',
+ 'only_matching': True
}]
_DOMAIN_TO_REQUESTOR_ID = {
'history.com': 'HISTORY',
}
def _real_extract(self, url):
- domain, show_path, movie_display_id, special_display_id = re.match(self._VALID_URL, url).groups()
- display_id = show_path or movie_display_id or special_display_id
- webpage = self._download_webpage(url, display_id)
+ domain, show_path, movie_display_id, special_display_id, collection_display_id = re.match(self._VALID_URL, url).groups()
+ display_id = show_path or movie_display_id or special_display_id or collection_display_id
+ webpage = self._download_webpage(url, display_id, headers=self.geo_verification_headers())
if show_path:
url_parts = show_path.split('/')
url_parts_len = len(url_parts)
webpage = self._download_webpage(url, video_id)
- partner_id = self._search_regex(
- r'src=["\'](?:https?:)?//(?:[^/]+\.)kaltura\.com/(?:[^/]+/)*(?:p|partner_id)/(\d+)',
- webpage, 'kaltura partner id')
-
video_data = self._parse_json(
self._search_regex(
r'window\.__INITIAL_STATE__\s*=\s*({.+?})\s*;\s*</script>',
(lambda x: x['episodeDetail']['content']['data'],
lambda x: x['videoDetail']['content']['data']), dict)
ep_meta = ep_data.get('full_video', {})
- external_id = ep_data.get('external_id') or ep_meta['external_id']
+
+ zype_id = ep_meta.get('zype_id')
+ if zype_id:
+ embed_url = 'https://player.zype.com/embed/%s.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ' % zype_id
+ ie_key = 'Zype'
+ else:
+ partner_id = self._search_regex(
+ r'src=["\'](?:https?:)?//(?:[^/]+\.)kaltura\.com/(?:[^/]+/)*(?:p|partner_id)/(\d+)',
+ webpage, 'kaltura partner id')
+ external_id = ep_data.get('external_id') or ep_meta['external_id']
+ embed_url = 'kaltura:%s:%s' % (partner_id, external_id)
+ ie_key = 'Kaltura'
title = ep_data.get('title') or ep_meta.get('title')
description = clean_html(ep_meta.get('episode_description') or ep_data.get(
return {
'_type': 'url_transparent',
- 'url': 'kaltura:%s:%s' % (partner_id, external_id),
- 'ie_key': 'Kaltura',
+ 'url': embed_url,
+ 'ie_key': ie_key,
'title': title,
'description': description,
'thumbnail': thumbnail,
from ..utils import (
determine_ext,
ExtractorError,
- qualities,
int_or_none,
parse_duration,
+ qualities,
+ str_or_none,
+ try_get,
unified_strdate,
- xpath_text,
+ unified_timestamp,
update_url_query,
url_or_none,
+ xpath_text,
)
from ..compat import compat_etree_fromstring
class ARDMediathekIE(InfoExtractor):
IE_NAME = 'ARD:mediathek'
- _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'
+ _VALID_URL = r'^https?://(?:(?:(?:www|classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'
_TESTS = [{
# available till 26.07.2022
# audio
'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158',
'only_matching': True,
+ }, {
+ 'url': 'https://classic.ardmediathek.de/tv/Panda-Gorilla-Co/Panda-Gorilla-Co-Folge-274/Das-Erste/Video?bcastId=16355486&documentId=58234698',
+ 'only_matching': True,
}]
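+    # defer to ARDBetaMediathekIE for URLs both extractors match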
+ @classmethod
+ def suitable(cls, url):
+ return False if ARDBetaMediathekIE.suitable(url) else super(ARDMediathekIE, cls).suitable(url)
+
def _extract_media_info(self, media_info_url, webpage, video_id):
media_info = self._download_json(
media_info_url, video_id, 'Downloading media JSON')
title = self._html_search_regex(
[r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
r'<meta name="dcterms\.title" content="(.*?)"/>',
- r'<h4 class="headline">(.*?)</h4>'],
+ r'<h4 class="headline">(.*?)</h4>',
+ r'<title[^>]*>(.*?)</title>'],
webpage, 'title')
description = self._html_search_meta(
'dcterms.abstract', webpage, 'description', default=None)
if description is None:
description = self._html_search_meta(
- 'description', webpage, 'meta description')
+ 'description', webpage, 'meta description', default=None)
+ if description is None:
+ description = self._html_search_regex(
+ r'<p\s+class="teasertext">(.+?)</p>',
+ webpage, 'teaser text', default=None)
# Thumbnail is sometimes not present.
# It is in the mobile version, but that seems to use a different URL
class ARDBetaMediathekIE(InfoExtractor):
- _VALID_URL = r'https://beta\.ardmediathek\.de/[a-z]+/player/(?P<video_id>[a-zA-Z0-9]+)/(?P<display_id>[^/?#]+)'
+ _VALID_URL = r'https://(?:beta|www)\.ardmediathek\.de/[^/]+/(?:player|live)/(?P<video_id>[a-zA-Z0-9]+)(?:/(?P<display_id>[^/?#]+))?'
_TESTS = [{
'url': 'https://beta.ardmediathek.de/ard/player/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE/die-robuste-roswita',
'md5': '2d02d996156ea3c397cfc5036b5d7f8f',
'upload_date': '20180826',
'ext': 'mp4',
},
+ }, {
+ 'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3N3ci5kZS9hZXgvbzEwNzE5MTU/',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.ardmediathek.de/swr/live/Y3JpZDovL3N3ci5kZS8xMzQ4MTA0Mg',
+ 'only_matching': True,
}]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('video_id')
- display_id = mobj.group('display_id')
+ display_id = mobj.group('display_id') or video_id
webpage = self._download_webpage(url, display_id)
data_json = self._search_regex(r'window\.__APOLLO_STATE__\s*=\s*(\{.*);\n', webpage, 'json')
'display_id': display_id,
}
formats = []
+ subtitles = {}
+ geoblocked = False
for widget in data.values():
- if widget.get('_geoblocked'):
- raise ExtractorError('This video is not available due to geoblocking', expected=True)
-
+ if widget.get('_geoblocked') is True:
+ geoblocked = True
if '_duration' in widget:
- res['duration'] = widget['_duration']
+ res['duration'] = int_or_none(widget['_duration'])
if 'clipTitle' in widget:
res['title'] = widget['clipTitle']
if '_previewImage' in widget:
res['thumbnail'] = widget['_previewImage']
if 'broadcastedOn' in widget:
- res['upload_date'] = unified_strdate(widget['broadcastedOn'])
+ res['timestamp'] = unified_timestamp(widget['broadcastedOn'])
if 'synopsis' in widget:
res['description'] = widget['synopsis']
- if '_subtitleUrl' in widget:
- res['subtitles'] = {'de': [{
+ subtitle_url = url_or_none(widget.get('_subtitleUrl'))
+ if subtitle_url:
+ subtitles.setdefault('de', []).append({
'ext': 'ttml',
- 'url': widget['_subtitleUrl'],
- }]}
+ 'url': subtitle_url,
+ })
if '_quality' in widget:
- format_url = widget['_stream']['json'][0]
-
- if format_url.endswith('.f4m'):
+ format_url = url_or_none(try_get(
+ widget, lambda x: x['_stream']['json'][0]))
+ if not format_url:
+ continue
+ ext = determine_ext(format_url)
+ if ext == 'f4m':
formats.extend(self._extract_f4m_formats(
format_url + '?hdcore=3.11.0',
video_id, f4m_id='hds', fatal=False))
- elif format_url.endswith('m3u8'):
+ elif ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
- format_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
+ format_url, video_id, 'mp4', m3u8_id='hls',
+ fatal=False))
else:
+ # HTTP formats are not available when geoblocked is True,
+ # other formats are fine though
+ if geoblocked:
+ continue
+ quality = str_or_none(widget.get('_quality'))
formats.append({
- 'format_id': 'http-' + widget['_quality'],
+ 'format_id': ('http-' + quality) if quality else 'http',
'url': format_url,
'preference': 10, # Plain HTTP, that's nice
})
+ if not formats and geoblocked:
+ self.raise_geo_restricted(
+ msg='This video is not available due to geoblocking',
+ countries=['DE'])
+
self._sort_formats(formats)
- res['formats'] = formats
+ res.update({
+ 'subtitles': subtitles,
+ 'formats': formats,
+ })
return res
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
video_data = self._parse_json(unescapeHTML(self._search_regex(
- r'class="[^"]*jsb_video/FlashPlayer[^"]*"[^>]+data-jsb="([^"]+)"',
- webpage, 'player data')), display_id)['config']['initial_video']
+ [r'flashPlayerOptions\s*=\s*(["\'])(?P<json>(?:(?!\1).)+)\1',
+ r'class="[^"]*jsb_video/FlashPlayer[^"]*"[^>]+data-jsb="(?P<json>[^"]+)"'],
+ webpage, 'player data', group='json')),
+ display_id)['config']['initial_video']
video_id = video_data['id']
video_title = video_data['title']
# Audiomack wraps a lot of soundcloud tracks in their branded wrapper
# if so, pass the work off to the soundcloud extractor
if SoundcloudIE.suitable(api_response['url']):
- return {'_type': 'url', 'url': api_response['url'], 'ie_key': 'Soundcloud'}
+ return self.url_result(api_response['url'], SoundcloudIE.ie_key())
return {
'id': compat_str(api_response.get('id', album_url_tag)),
'id': '1_anruz3wy',
'ext': 'mp4',
'title': 'Bundesrats-Vakanzen / EU-Rahmenabkommen',
- 'description': 'md5:dd9f96751ec9c35e409a698a328402f3',
'uploader_id': 'TVOnline',
'upload_date': '20180930',
'timestamp': 1538328802,
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
+ host = mobj.group('host')
video_id = mobj.group('id')
entry_id = mobj.group('kaltura_id')
if not entry_id:
- webpage = self._download_webpage(url, video_id)
- api_path = self._search_regex(
- r'["\']apiPath["\']\s*:\s*["\']([^"^\']+)["\']',
- webpage, 'api path')
- api_url = 'https://www.%s%s' % (mobj.group('host'), api_path)
+ api_url = 'https://www.%s/api/pub/gql/%s' % (host, host.split('.')[0])
payload = {
'query': '''query VideoContext($articleId: ID!) {
article: node(id: $articleId) {
'uploader': 'Radio 3',
'uploader_id': 'bbc_radio_three',
},
+ }, {
+ 'url': 'http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227',
+ 'info_dict': {
+ 'id': 'p06w9tws',
+ 'ext': 'mp4',
+ 'title': 'md5:2fabf12a726603193a2879a055f72514',
+ 'description': 'Learn English words and phrases from this story',
+ },
+ 'add_ie': [BBCCoUkIE.ie_key()],
}]
@classmethod
if entries:
return self.playlist_result(entries, playlist_id, playlist_title, playlist_description)
+ # http://www.bbc.co.uk/learningenglish/chinese/features/lingohack/ep-181227
+ group_id = self._search_regex(
+ r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX,
+ webpage, 'group id', default=None)
+    if group_id:
+ return self.url_result(
+ 'https://www.bbc.co.uk/programmes/%s' % group_id,
+ ie=BBCCoUkIE.ie_key())
+
# single video story (e.g. http://www.bbc.com/travel/story/20150625-sri-lankas-spicy-secret)
programme_id = self._search_regex(
[r'data-(?:video-player|media)-vpid="(%s)"' % self._ID_REGEX,
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import (
- compat_chr,
- compat_ord,
- compat_urllib_parse_unquote,
-)
+from ..compat import compat_str
from ..utils import (
int_or_none,
- parse_iso8601,
- urljoin,
+ unified_timestamp,
)
webpage = self._download_webpage(url, video_id)
- cpl_url = self._search_regex(
- r'<script[^>]+src=(["\'])(?P<url>(?:/static|(?:https?:)?//static\.beeg\.com)/cpl/\d+\.js.*?)\1',
- webpage, 'cpl', default=None, group='url')
-
- cpl_url = urljoin(url, cpl_url)
-
- beeg_version, beeg_salt = [None] * 2
-
- if cpl_url:
- cpl = self._download_webpage(
- self._proto_relative_url(cpl_url), video_id,
- 'Downloading cpl JS', fatal=False)
- if cpl:
- beeg_version = int_or_none(self._search_regex(
- r'beeg_version\s*=\s*([^\b]+)', cpl,
- 'beeg version', default=None)) or self._search_regex(
- r'/(\d+)\.js', cpl_url, 'beeg version', default=None)
- beeg_salt = self._search_regex(
- r'beeg_salt\s*=\s*(["\'])(?P<beeg_salt>.+?)\1', cpl, 'beeg salt',
- default=None, group='beeg_salt')
-
- beeg_version = beeg_version or '2185'
- beeg_salt = beeg_salt or 'pmweAkq8lAYKdfWcFCUj0yoVgoPlinamH5UE1CB3H'
+ beeg_version = self._search_regex(
+ r'beeg_version\s*=\s*([\da-zA-Z_-]+)', webpage, 'beeg version',
+ default='1546225636701')
for api_path in ('', 'api.'):
video = self._download_json(
if video:
break
- def split(o, e):
- def cut(s, x):
- n.append(s[:x])
- return s[x:]
- n = []
- r = len(o) % e
- if r > 0:
- o = cut(o, r)
- while len(o) > e:
- o = cut(o, e)
- n.append(o)
- return n
-
- def decrypt_key(key):
- # Reverse engineered from http://static.beeg.com/cpl/1738.js
- a = beeg_salt
- e = compat_urllib_parse_unquote(key)
- o = ''.join([
- compat_chr(compat_ord(e[n]) - compat_ord(a[n % len(a)]) % 21)
- for n in range(len(e))])
- return ''.join(split(o, 3)[::-1])
-
- def decrypt_url(encrypted_url):
- encrypted_url = self._proto_relative_url(
- encrypted_url.replace('{DATA_MARKERS}', ''), 'https:')
- key = self._search_regex(
- r'/key=(.*?)%2Cend=', encrypted_url, 'key', default=None)
- if not key:
- return encrypted_url
- return encrypted_url.replace(key, decrypt_key(key))
-
formats = []
for format_id, video_url in video.items():
if not video_url:
if not height:
continue
formats.append({
- 'url': decrypt_url(video_url),
+ 'url': self._proto_relative_url(
+ video_url.replace('{DATA_MARKERS}', 'data=pc_XX__%s_0' % beeg_version), 'https:'),
'format_id': format_id,
'height': int(height),
})
self._sort_formats(formats)
title = video['title']
- video_id = video.get('id') or video_id
+ video_id = compat_str(video.get('id') or video_id)
display_id = video.get('code')
description = video.get('desc')
+ series = video.get('ps_name')
- timestamp = parse_iso8601(video.get('date'), ' ')
+ timestamp = unified_timestamp(video.get('date'))
duration = int_or_none(video.get('duration'))
tags = [tag.strip() for tag in video['tags'].split(',')] if video.get('tags') else None
'display_id': display_id,
'title': title,
'description': description,
+ 'series': series,
'timestamp': timestamp,
'duration': duration,
'tags': tags,
import re
from .common import InfoExtractor
-from ..utils import urlencode_postdata
+from ..utils import (
+ orderedSet,
+ urlencode_postdata,
+)
class BitChuteIE(InfoExtractor):
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.57 Safari/537.36',
})
- title = self._search_regex(
+ title = self._html_search_regex(
(r'<[^>]+\bid=["\']video-title[^>]+>([^<]+)', r'<title>([^<]+)'),
webpage, 'title', default=None) or self._html_search_meta(
'description', webpage, 'title',
default=None) or self._og_search_description(webpage)
+ format_urls = []
+ for mobj in re.finditer(
+ r'addWebSeed\s*\(\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage):
+ format_urls.append(mobj.group('url'))
+ format_urls.extend(re.findall(r'as=(https?://[^&"\']+)', webpage))
+
formats = [
- {'url': mobj.group('url')}
- for mobj in re.finditer(
- r'addWebSeed\s*\(\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage)]
+ {'url': format_url}
+ for format_url in orderedSet(format_urls)]
+ self._check_formats(formats, video_id)
self._sort_formats(formats)
description = self._html_search_regex(
_TESTS = [{
'url': 'https://www.cammodels.com/cam/AutumnKnight/',
'only_matching': True,
+ 'age_limit': 18
}]
def _real_extract(self, url):
'title': self._live_title(user_id),
'is_live': True,
'formats': formats,
+ 'age_limit': 18
}
'duration': 1274,
'timestamp': 1528018608,
'upload_date': '20180603',
+ 'age_limit': 18
},
'params': {
'skip_download': True,
'like_count': like_count,
'creator': creator,
'formats': formats,
+ 'age_limit': 18
}
'comment_count': int,
'uploader': 'MileenaK',
'upload_date': '20160322',
+ 'age_limit': 18,
},
'params': {
'skip_download': True,
'comment_count': comment_count,
'uploader': uploader,
'upload_date': upload_date,
+ 'age_limit': 18
}
webpage = self._download_webpage(url, video_id)
videomore_url = VideomoreIE._extract_url(webpage)
+ if not videomore_url:
+ videomore_id = self._search_regex(
+ r'getVMCode\s*\(\s*["\']?(\d+)', webpage, 'videomore id',
+ default=None)
+ if videomore_id:
+ videomore_url = 'videomore:%s' % videomore_id
if videomore_url:
title = self._og_search_title(webpage)
return {
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_parse_qs,
+ compat_urllib_parse_urlparse,
+)
+from ..utils import (
+ clean_html,
+ float_or_none,
+ int_or_none,
+ try_get,
+ urlencode_postdata,
+)
+
+
+class CiscoLiveBaseIE(InfoExtractor):
+ # These appear to be constant across all Cisco Live presentations
+ # and are not tied to any user session or event
+ RAINFOCUS_API_URL = 'https://events.rainfocus.com/api/%s'
+ RAINFOCUS_API_PROFILE_ID = 'Na3vqYdAlJFSxhYTYQGuMbpafMqftalz'
+ RAINFOCUS_WIDGET_ID = 'n6l4Lo05R8fiy3RpUBm447dZN8uNWoye'
+ BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5647924234001/SyK2FdqjM_default/index.html?videoId=%s'
+
+ HEADERS = {
+ 'Origin': 'https://ciscolive.cisco.com',
+ 'rfApiProfileId': RAINFOCUS_API_PROFILE_ID,
+ 'rfWidgetId': RAINFOCUS_WIDGET_ID,
+ }
+
+ def _call_api(self, ep, rf_id, query, referrer, note=None):
+ headers = self.HEADERS.copy()
+ headers['Referer'] = referrer
+ return self._download_json(
+ self.RAINFOCUS_API_URL % ep, rf_id, note=note,
+ data=urlencode_postdata(query), headers=headers)
+
+ def _parse_rf_item(self, rf_item):
+ event_name = rf_item.get('eventName')
+ title = rf_item['title']
+ description = clean_html(rf_item.get('abstract'))
+ presenter_name = try_get(rf_item, lambda x: x['participants'][0]['fullName'])
+ bc_id = rf_item['videos'][0]['url']
+ bc_url = self.BRIGHTCOVE_URL_TEMPLATE % bc_id
+ duration = float_or_none(try_get(rf_item, lambda x: x['times'][0]['length']))
+ location = try_get(rf_item, lambda x: x['times'][0]['room'])
+
+ if duration:
+ duration = duration * 60
+
+ return {
+ '_type': 'url_transparent',
+ 'url': bc_url,
+ 'ie_key': 'BrightcoveNew',
+ 'title': title,
+ 'description': description,
+ 'duration': duration,
+ 'creator': presenter_name,
+ 'location': location,
+ 'series': event_name,
+ }
+
+
+class CiscoLiveSessionIE(CiscoLiveBaseIE):
+ _VALID_URL = r'https?://ciscolive\.cisco\.com/on-demand-library/\??[^#]*#/session/(?P<id>[^/?&]+)'
+ _TEST = {
+ 'url': 'https://ciscolive.cisco.com/on-demand-library/?#/session/1423353499155001FoSs',
+ 'md5': 'c98acf395ed9c9f766941c70f5352e22',
+ 'info_dict': {
+ 'id': '5803694304001',
+ 'ext': 'mp4',
+ 'title': '13 Smart Automations to Monitor Your Cisco IOS Network',
+ 'description': 'md5:ec4a436019e09a918dec17714803f7cc',
+ 'timestamp': 1530305395,
+ 'upload_date': '20180629',
+ 'uploader_id': '5647924234001',
+ 'location': '16B Mezz.',
+ },
+ }
+
+ def _real_extract(self, url):
+ rf_id = self._match_id(url)
+ rf_result = self._call_api('session', rf_id, {'id': rf_id}, url)
+ return self._parse_rf_item(rf_result['items'][0])
+
+
+class CiscoLiveSearchIE(CiscoLiveBaseIE):
+ _VALID_URL = r'https?://ciscolive\.cisco\.com/on-demand-library/'
+ _TESTS = [{
+ 'url': 'https://ciscolive.cisco.com/on-demand-library/?search.event=ciscoliveus2018&search.technicallevel=scpsSkillLevel_aintroductory&search.focus=scpsSessionFocus_designAndDeployment#/',
+ 'info_dict': {
+ 'title': 'Search query',
+ },
+ 'playlist_count': 5,
+ }, {
+ 'url': 'https://ciscolive.cisco.com/on-demand-library/?search.technology=scpsTechnology_applicationDevelopment&search.technology=scpsTechnology_ipv6&search.focus=scpsSessionFocus_troubleshootingTroubleshooting#/',
+ 'only_matching': True,
+ }]
+
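+    # a library URL with a #/session/ fragment is a single session;
+    # let CiscoLiveSessionIE handle it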
+ @classmethod
+ def suitable(cls, url):
+ return False if CiscoLiveSessionIE.suitable(url) else super(CiscoLiveSearchIE, cls).suitable(url)
+
+ @staticmethod
+ def _check_bc_id_exists(rf_item):
+ return int_or_none(try_get(rf_item, lambda x: x['videos'][0]['url'])) is not None
+
+ def _entries(self, query, url):
+ query['size'] = 50
+ query['from'] = 0
+ for page_num in itertools.count(1):
+ results = self._call_api(
+ 'search', None, query, url,
+ 'Downloading search JSON page %d' % page_num)
+ sl = try_get(results, lambda x: x['sectionList'][0], dict)
+ if sl:
+ results = sl
+ items = results.get('items')
+ if not items or not isinstance(items, list):
+ break
+ for item in items:
+ if not isinstance(item, dict):
+ continue
+ if not self._check_bc_id_exists(item):
+ continue
+ yield self._parse_rf_item(item)
+ size = int_or_none(results.get('size'))
+ if size is not None:
+ query['size'] = size
+ total = int_or_none(results.get('total'))
+ if total is not None and query['from'] + query['size'] > total:
+ break
+ query['from'] += query['size']
+
+ def _real_extract(self, url):
+ query = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+ query['type'] = 'session'
+ return self.playlist_result(
+ self._entries(query, url), playlist_title='Search query')
def _real_extract(self, url):
webpage = self._download_webpage(url, url_basename(url))
cnn_url = self._html_search_regex(r'data-url="(.+?)"', webpage, 'cnn url')
- return {
- '_type': 'url',
- 'url': cnn_url,
- 'ie_key': CNNIE.ie_key(),
- }
+ return self.url_result(cnn_url, CNNIE.ie_key())
class CNNArticleIE(InfoExtractor):
def _real_extract(self, url):
webpage = self._download_webpage(url, url_basename(url))
cnn_url = self._html_search_regex(r"video:\s*'([^']+)'", webpage, 'cnn url')
- return {
- '_type': 'url',
- 'url': 'http://cnn.com/video/?/video/' + cnn_url,
- 'ie_key': CNNIE.ie_key(),
- }
+ return self.url_result('http://cnn.com/video/?/video/' + cnn_url, CNNIE.ie_key())
if expected_type is not None and expected_type != item_type:
return info
if item_type in ('TVEpisode', 'Episode'):
+ episode_name = unescapeHTML(e.get('name'))
info.update({
- 'episode': unescapeHTML(e.get('name')),
+ 'episode': episode_name,
'episode_number': int_or_none(e.get('episodeNumber')),
'description': unescapeHTML(e.get('description')),
})
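+ # prefer an explicitly extracted title, falling back to the episode name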
+ if not info.get('title') and episode_name:
+ info['title'] = episode_name
part_of_season = e.get('partOfSeason')
if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
info['series'] = unescapeHTML(part_of_series.get('name'))
+ elif item_type == 'Movie':
+ info.update({
+ 'title': unescapeHTML(e.get('name')),
+ 'description': unescapeHTML(e.get('description')),
+ 'duration': parse_duration(e.get('duration')),
+ 'timestamp': unified_timestamp(e.get('dateCreated')),
+ })
elif item_type in ('Article', 'NewsArticle'):
info.update({
'timestamp': parse_iso8601(e.get('datePublished')),
'only_matching': True,
}]
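+ # known media file slots and the dimensions of the video they carry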
+ _MEDIA_FILE_SLOTS = {
+ '360p.mp4': {
+ 'width': 640,
+ 'height': 360,
+ },
+ '480p.mp4': {
+ 'width': 768,
+ 'height': 432,
+ },
+ '480p_1mbps.mp4': {
+ 'width': 852,
+ 'height': 480,
+ },
+ }
+
def _real_extract(self, url):
video_id = self._match_id(url)
elif ext == 'mpd':
formats.extend(self._extract_mpd_formats(
format_url, video_id, mpd_id='dash', fatal=False))
+ elif format_url.endswith('.ism/Manifest'):
+ formats.extend(self._extract_ism_formats(
+ format_url, video_id, ism_id='mss', fatal=False))
+ else:
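+ # plain HTTP media file; dimensions are looked up from _MEDIA_FILE_SLOTS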
+ mfs_path = e.get('Type')
+ mfs_info = self._MEDIA_FILE_SLOTS.get(mfs_path)
+ if not mfs_info:
+ continue
+ formats.append({
+ 'url': format_url,
+ 'format_id': 'http-' + mfs_path.split('.')[0],
+ 'width': mfs_info['width'],
+ 'height': mfs_info['height'],
+ })
self._sort_formats(formats)
description = media.get('Description')
self._handle_errors(result)
self._auth_token = result['message']['auth_token']
- def _extract_media_info(self, media):
- video_id = compat_str(media['id'])
+
+class CuriosityStreamIE(CuriosityStreamBaseIE):
+ IE_NAME = 'curiositystream'
+ _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/video/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'https://app.curiositystream.com/video/2',
+ 'md5': '262bb2f257ff301115f1973540de8983',
+ 'info_dict': {
+ 'id': '2',
+ 'ext': 'mp4',
+ 'title': 'How Did You Develop The Internet?',
+ 'description': 'Vint Cerf, Google\'s Chief Internet Evangelist, describes how he and Bob Kahn created the internet.',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ media = self._call_api('media/' + video_id, video_id)
title = media['title']
formats = []
}
-class CuriosityStreamIE(CuriosityStreamBaseIE):
- IE_NAME = 'curiositystream'
- _VALID_URL = r'https?://app\.curiositystream\.com/video/(?P<id>\d+)'
- _TEST = {
- 'url': 'https://app.curiositystream.com/video/2',
- 'md5': '262bb2f257ff301115f1973540de8983',
- 'info_dict': {
- 'id': '2',
- 'ext': 'mp4',
- 'title': 'How Did You Develop The Internet?',
- 'description': 'Vint Cerf, Google\'s Chief Internet Evangelist, describes how he and Bob Kahn created the internet.',
- }
- }
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- media = self._call_api('media/' + video_id, video_id)
- return self._extract_media_info(media)
-
-
class CuriosityStreamCollectionIE(CuriosityStreamBaseIE):
IE_NAME = 'curiositystream:collection'
- _VALID_URL = r'https?://app\.curiositystream\.com/collection/(?P<id>\d+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/(?:collection|series)/(?P<id>\d+)'
+ _TESTS = [{
'url': 'https://app.curiositystream.com/collection/2',
'info_dict': {
'id': '2',
'title': 'Curious Minds: The Internet',
'description': 'How is the internet shaping our lives in the 21st Century?',
},
- 'playlist_mincount': 12,
- }
+ 'playlist_mincount': 17,
+ }, {
+ 'url': 'https://curiositystream.com/series/2',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
collection_id = self._match_id(url)
'collections/' + collection_id, collection_id)
entries = []
for media in collection.get('media', []):
- entries.append(self._extract_media_info(media))
+ media_id = compat_str(media.get('id'))
+ entries.append(self.url_result(
+ 'https://curiositystream.com/video/' + media_id,
+ CuriosityStreamIE.ie_key(), media_id))
return self.playlist_result(
entries, collection_id,
collection.get('title'), collection.get('description'))
class DiscoveryIE(DiscoveryGoBaseIE):
- _VALID_URL = r'''(?x)https?://(?:www\.)?(?P<site>
- discovery|
- investigationdiscovery|
- discoverylife|
- animalplanet|
- ahctv|
- destinationamerica|
- sciencechannel|
- tlc|
- velocity
+ _VALID_URL = r'''(?x)https?://
+ (?P<site>
+ (?:www\.)?
+ (?:
+ discovery|
+ investigationdiscovery|
+ discoverylife|
+ animalplanet|
+ ahctv|
+ destinationamerica|
+ sciencechannel|
+ tlc|
+ velocity
+ )|
+ watch\.
+ (?:
+ hgtv|
+ foodnetwork|
+ travelchannel|
+ diynetwork|
+ cookingchanneltv|
+ motortrend
+ )
)\.com(?P<path>/tv-shows/[^/]+/(?:video|full-episode)s/(?P<id>[^./?#]+))'''
_TESTS = [{
'url': 'https://www.discovery.com/tv-shows/cash-cab/videos/dave-foley',
if not access_token:
access_token = self._download_json(
- 'https://www.%s.com/anonymous' % site, display_id, query={
+ 'https://%s.com/anonymous' % site, display_id, query={
'authRel': 'authorization',
'client_id': try_get(
react_data, lambda x: x['application']['apiClientId'],
})['access_token']
try:
+ headers = self.geo_verification_headers()
+ headers['Authorization'] = 'Bearer ' + access_token
+
stream = self._download_json(
'https://api.discovery.com/v1/streaming/video/' + video_id,
- display_id, headers={
- 'Authorization': 'Bearer ' + access_token,
- })
+ display_id, headers=headers)
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403):
e_description = self._parse_json(
class DTubeIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?d\.tube/(?:#!/)?v/(?P<uploader_id>[0-9a-z.-]+)/(?P<id>[0-9a-z]{8})'
_TEST = {
- 'url': 'https://d.tube/#!/v/benswann/zqd630em',
- 'md5': 'a03eaa186618ffa7a3145945543a251e',
+ 'url': 'https://d.tube/#!/v/broncnutz/x380jtr1',
+ 'md5': '9f29088fa08d699a7565ee983f56a06e',
'info_dict': {
- 'id': 'zqd630em',
+ 'id': 'x380jtr1',
'ext': 'mp4',
- 'title': 'Reality Check: FDA\'s Disinformation Campaign on Kratom',
- 'description': 'md5:700d164e066b87f9eac057949e4227c2',
- 'uploader_id': 'benswann',
- 'upload_date': '20180222',
- 'timestamp': 1519328958,
+ 'title': 'Lefty 3-Rings is Back Baby!! NCAA Picks',
+ 'description': 'md5:60be222088183be3a42f196f34235776',
+ 'uploader_id': 'broncnutz',
+ 'upload_date': '20190107',
+ 'timestamp': 1546854054,
},
'params': {
'format': '480p',
def canonical_url(h):
if not h:
return None
- return 'https://ipfs.io/ipfs/' + h
+ return 'https://video.dtube.top/ipfs/' + h
formats = []
for q in ('240', '480', '720', '1080', ''):
ChirbitProfileIE,
)
from .cinchcast import CinchcastIE
+from .ciscolive import (
+ CiscoLiveSessionIE,
+ CiscoLiveSearchIE,
+)
from .cjsw import CJSWIE
from .cliphunter import CliphunterIE
from .clippit import ClippitIE
from .funnyordie import FunnyOrDieIE
from .fusion import FusionIE
from .fxnetworks import FXNetworksIE
+from .gaia import GaiaIE
from .gameinformer import GameInformerIE
from .gameone import (
GameOneIE,
)
from .huajiao import HuajiaoIE
from .huffpost import HuffPostIE
+from .hungama import (
+ HungamaIE,
+ HungamaSongIE,
+)
from .hypem import HypemIE
from .iconosquare import IconosquareIE
from .ign import (
from .imgur import (
ImgurIE,
ImgurAlbumIE,
+ ImgurGalleryIE,
)
from .ina import InaIE
from .inc import IncIE
)
from .learnr import LearnrIE
from .lecture2go import Lecture2GoIE
+from .lecturio import (
+ LecturioIE,
+ LecturioCourseIE,
+ LecturioDeCourseIE,
+)
from .leeco import (
LeIE,
LePlaylistIE,
MyviEmbedIE,
)
from .myvidster import MyVidsterIE
-from .nationalgeographic import (
- NationalGeographicVideoIE,
- NationalGeographicIE,
- NationalGeographicEpisodeGuideIE,
-)
+from .nationalgeographic import NationalGeographicVideoIE
from .naver import NaverIE
from .nba import NBAIE
from .nbc import (
ORFOE1IE,
ORFIPTVIE,
)
+from .outsidetv import OutsideTVIE
from .packtpub import (
PacktPubIE,
PacktPubCourseIE,
from .pinkbike import PinkbikeIE
from .pladform import PladformIE
from .playfm import PlayFMIE
+from .playplustv import PlayPlusTVIE
from .plays import PlaysTVIE
from .playtvak import PlaytvakIE
from .playvid import PlayvidIE
from .tastytrade import TastyTradeIE
from .tbs import TBSIE
from .tdslifeway import TDSLifewayIE
+from .teachable import (
+ TeachableIE,
+ TeachableCourseIE,
+)
from .teachertube import (
TeacherTubeIE,
TeacherTubeUserIE,
from .thisav import ThisAVIE
from .thisoldhouse import ThisOldHouseIE
from .threeqsdn import ThreeQSDNIE
+from .tiktok import (
+ TikTokIE,
+ TikTokUserIE,
+)
from .tinypic import TinyPicIE
from .tmz import (
TMZIE,
from .tvnoe import TVNoeIE
from .tvnow import (
TVNowIE,
- TVNowListIE,
+ TVNowNewIE,
+ TVNowSeasonIE,
+ TVNowAnnualIE,
TVNowShowIE,
)
from .tvp import (
UplynkIE,
UplynkPreplayIE,
)
-from .upskill import (
- UpskillIE,
- UpskillCourseIE,
-)
from .urort import UrortIE
from .urplay import URPlayIE
from .usanetwork import USANetworkIE
VimeoReviewIE,
VimeoUserIE,
VimeoWatchLaterIE,
+ VHXEmbedIE,
)
from .vimple import VimpleIE
from .vine import (
WSJIE,
WSJArticleIE,
)
+from .wwe import WWEIE
from .xbef import XBefIE
from .xboxclips import XboxClipsIE
from .xfileshare import XFileShareIE
)
from .zdf import ZDFIE, ZDFChannelIE
from .zingmp3 import ZingMp3IE
+from .zype import ZypeIE
# coding: utf-8
from __future__ import unicode_literals
+# import json
+# import uuid
+
from .adobepass import AdobePassIE
-from .uplynk import UplynkPreplayIE
-from ..compat import compat_str
from ..utils import (
- HEADRequest,
int_or_none,
parse_age_limit,
parse_duration,
class FOXIE(AdobePassIE):
- _VALID_URL = r'https?://(?:www\.)?fox\.com/watch/(?P<id>[\da-fA-F]+)'
+ _VALID_URL = r'https?://(?:www\.)?(?:fox\.com|nationalgeographic\.com/tv)/watch/(?P<id>[\da-fA-F]+)'
_TESTS = [{
# clip
'url': 'https://www.fox.com/watch/4b765a60490325103ea69888fb2bd4e8/',
# episode, geo-restricted, tv provided required
'url': 'https://www.fox.com/watch/30056b295fb57f7452aeeb4920bc3024/',
'only_matching': True,
+ }, {
+ 'url': 'https://www.nationalgeographic.com/tv/watch/f690e05ebbe23ab79747becd0cc223d1/',
+ 'only_matching': True,
}]
+ # _access_token = None
+
+ # def _call_api(self, path, video_id, data=None):
+ # headers = {
+ # 'X-Api-Key': '238bb0a0c2aba67922c48709ce0c06fd',
+ # }
+ # if self._access_token:
+ # headers['Authorization'] = 'Bearer ' + self._access_token
+ # return self._download_json(
+ # 'https://api2.fox.com/v2.0/' + path, video_id, data=data, headers=headers)
+
+ # def _real_initialize(self):
+ # self._access_token = self._call_api(
+ # 'login', None, json.dumps({
+ # 'deviceId': compat_str(uuid.uuid4()),
+ # }).encode())['accessToken']
def _real_extract(self, url):
video_id = self._match_id(url)
video = self._download_json(
- 'https://api.fox.com/fbc-content/v1_4/video/%s' % video_id,
+ 'https://api.fox.com/fbc-content/v1_5/video/%s' % video_id,
video_id, headers={
'apikey': 'abdcbed02c124d393b39e818a4312055',
'Content-Type': 'application/json',
'Referer': url,
})
+ # video = self._call_api('vodplayer/' + video_id, video_id)
title = video['name']
release_url = video['videoRelease']['url']
-
- description = video.get('description')
- duration = int_or_none(video.get('durationInSeconds')) or int_or_none(
- video.get('duration')) or parse_duration(video.get('duration'))
- timestamp = unified_timestamp(video.get('datePublished'))
- rating = video.get('contentRating')
- age_limit = parse_age_limit(rating)
+ # release_url = video['url']
data = try_get(
video, lambda x: x['trackingData']['properties'], dict) or {}
- creator = data.get('brand') or data.get('network') or video.get('network')
-
- series = video.get('seriesName') or data.get(
- 'seriesName') or data.get('show')
- season_number = int_or_none(video.get('seasonNumber'))
- episode = video.get('name')
- episode_number = int_or_none(video.get('episodeNumber'))
- release_year = int_or_none(video.get('releaseYear'))
-
+ rating = video.get('contentRating')
if data.get('authRequired'):
resource = self._get_mvpd_resource(
'fbc-fox', title, video.get('guid'), rating)
'auth': self._extract_mvpd_auth(
url, video_id, 'fbc-fox', resource)
})
+ m3u8_url = self._download_json(release_url, video_id)['playURL']
+ formats = self._extract_m3u8_formats(
+ m3u8_url, video_id, 'mp4',
+ entry_protocol='m3u8_native', m3u8_id='hls')
+ self._sort_formats(formats)
+
+ duration = int_or_none(video.get('durationInSeconds')) or int_or_none(
+ video.get('duration')) or parse_duration(video.get('duration'))
+ timestamp = unified_timestamp(video.get('datePublished'))
+ creator = data.get('brand') or data.get('network') or video.get('network')
+ series = video.get('seriesName') or data.get(
+ 'seriesName') or data.get('show')
subtitles = {}
for doc_rel in video.get('documentReleases', []):
}]
break
- info = {
+ return {
'id': video_id,
'title': title,
- 'description': description,
+ 'formats': formats,
+ 'description': video.get('description'),
'duration': duration,
'timestamp': timestamp,
- 'age_limit': age_limit,
+ 'age_limit': parse_age_limit(rating),
'creator': creator,
'series': series,
- 'season_number': season_number,
- 'episode': episode,
- 'episode_number': episode_number,
- 'release_year': release_year,
+ 'season_number': int_or_none(video.get('seasonNumber')),
+ 'episode': video.get('name'),
+ 'episode_number': int_or_none(video.get('episodeNumber')),
+ 'release_year': int_or_none(video.get('releaseYear')),
'subtitles': subtitles,
}
-
- urlh = self._request_webpage(HEADRequest(release_url), video_id)
- video_url = compat_str(urlh.geturl())
-
- if UplynkPreplayIE.suitable(video_url):
- info.update({
- '_type': 'url_transparent',
- 'url': video_url,
- 'ie_key': UplynkPreplayIE.ie_key(),
- })
- else:
- m3u8_url = self._download_json(release_url, video_id)['playURL']
- formats = self._extract_m3u8_formats(
- m3u8_url, video_id, 'mp4',
- entry_protocol='m3u8_native', m3u8_id='hls')
- self._sort_formats(formats)
- info['formats'] = formats
- return info
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import (
- smuggle_url,
- update_url_query,
-)
class FoxSportsIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?foxsports\.com/(?:[^/]+/)*(?P<id>[^/]+)'
+ _VALID_URL = r'https?://(?:www\.)?foxsports\.com/(?:[^/]+/)*video/(?P<id>\d+)'
_TEST = {
'url': 'http://www.foxsports.com/tennessee/video/432609859715',
'md5': 'b49050e955bebe32c301972e4012ac17',
'info_dict': {
- 'id': 'bwduI3X_TgUB',
+ 'id': '432609859715',
'ext': 'mp4',
'title': 'Courtney Lee on going up 2-0 in series vs. Blazers',
'description': 'Courtney Lee talks about Memphis being focused.',
- 'upload_date': '20150423',
- 'timestamp': 1429761109,
+ # TODO: fix timestamp
+ 'upload_date': '19700101', # '20150423',
+ # 'timestamp': 1429761109,
'uploader': 'NEWA-FNG-FOXSPORTS',
},
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
'add_ie': ['ThePlatform'],
}
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
-
- config = self._parse_json(
- self._html_search_regex(
- r"""class="[^"]*(?:fs-player|platformPlayer-wrapper)[^"]*".+?data-player-config='([^']+)'""",
- webpage, 'data player config'),
- video_id)
-
- return self.url_result(smuggle_url(update_url_query(
- config['releaseURL'], {
- 'mbr': 'true',
- 'switch': 'http',
- }), {'force_smil_url': True}))
+ return self.url_result(
+ 'https://feed.theplatform.com/f/BKQ29B/foxsports-all?byId=' + video_id, 'ThePlatformFeed')
from __future__ import unicode_literals
from .common import InfoExtractor
+from .youtube import YoutubeIE
class FreespeechIE(InfoExtractor):
r'data-video-url="([^"]+)"',
webpage, 'youtube url')
- return {
- '_type': 'url',
- 'url': youtube_url,
- 'ie_key': 'Youtube',
- }
+ return self.url_result(youtube_url, YoutubeIE.ie_key())
# coding: utf-8
from __future__ import unicode_literals
+import random
+import string
+
from .common import InfoExtractor
from ..compat import compat_HTTPError
from ..utils import (
video_id = title_data.get('id') or self._search_regex([
r"KANE_customdimensions.videoID\s*=\s*'(\d+)';",
- r'<iframe[^>]+src="/player/(\d+)"',
+ r'<iframe[^>]+src="/player/(\d+)',
], webpage, 'video_id', default=None)
if not video_id:
player_url = self._html_search_meta([
if self._TOKEN:
headers['Authorization'] = 'Token %s' % self._TOKEN
sources = self._download_json(
- 'https://prod-api-funimationnow.dadcdigital.com/api/source/catalog/video/%s/signed/' % video_id,
- video_id, headers=headers)['items']
+ 'https://www.funimation.com/api/showexperience/%s/' % video_id,
+ video_id, headers=headers, query={
+ 'pinst_id': ''.join([random.choice(string.digits + string.ascii_letters) for _ in range(8)]),
+ })['items']
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
error = self._parse_json(e.cause.read(), video_id)['errors'][0]
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ int_or_none,
+ str_or_none,
+ strip_or_none,
+ try_get,
+)
+
+
+class GaiaIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?gaia\.com/video/(?P<id>[^/?]+).*?\bfullplayer=(?P<type>feature|preview)'
+ _TESTS = [{
+ 'url': 'https://www.gaia.com/video/connecting-universal-consciousness?fullplayer=feature',
+ 'info_dict': {
+ 'id': '89356',
+ 'ext': 'mp4',
+ 'title': 'Connecting with Universal Consciousness',
+ 'description': 'md5:844e209ad31b7d31345f5ed689e3df6f',
+ 'upload_date': '20151116',
+ 'timestamp': 1447707266,
+ 'duration': 936,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://www.gaia.com/video/connecting-universal-consciousness?fullplayer=preview',
+ 'info_dict': {
+ 'id': '89351',
+ 'ext': 'mp4',
+ 'title': 'Connecting with Universal Consciousness',
+ 'description': 'md5:844e209ad31b7d31345f5ed689e3df6f',
+ 'upload_date': '20151116',
+ 'timestamp': 1447707266,
+ 'duration': 53,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }]
+
+ def _real_extract(self, url):
+ display_id, vtype = re.search(self._VALID_URL, url).groups()
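+ # resolve the display slug to a numeric node id via the pathinfo endpoint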
+ node_id = self._download_json(
+ 'https://brooklyn.gaia.com/pathinfo', display_id, query={
+ 'path': 'video/' + display_id,
+ })['id']
+ node = self._download_json(
+ 'https://brooklyn.gaia.com/node/%d' % node_id, node_id)
+ vdata = node[vtype]
+ media_id = compat_str(vdata['nid'])
+ title = node['title']
+
+ media = self._download_json(
+ 'https://brooklyn.gaia.com/media/' + media_id, media_id)
+ formats = self._extract_m3u8_formats(
+ media['mediaUrls']['bcHLS'], media_id, 'mp4')
+ self._sort_formats(formats)
+
+ subtitles = {}
+ text_tracks = media.get('textTracks', {})
+ for key in ('captions', 'subtitles'):
+ for lang, sub_url in text_tracks.get(key, {}).items():
+ subtitles.setdefault(lang, []).append({
+ 'url': sub_url,
+ })
+
+ fivestar = node.get('fivestar', {})
+ fields = node.get('fields', {})
+
+ def get_field_value(key, value_key='value'):
+ return try_get(fields, lambda x: x[key][0][value_key])
+
+ return {
+ 'id': media_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'formats': formats,
+ 'description': strip_or_none(get_field_value('body') or get_field_value('teaser')),
+ 'timestamp': int_or_none(node.get('created')),
+ 'subtitles': subtitles,
+ 'duration': int_or_none(vdata.get('duration')),
+ 'like_count': int_or_none(try_get(fivestar, lambda x: x['up_count']['value'])),
+ 'dislike_count': int_or_none(try_get(fivestar, lambda x: x['down_count']['value'])),
+ 'comment_count': int_or_none(node.get('comment_count')),
+ 'series': try_get(node, lambda x: x['series']['title'], compat_str),
+ 'season_number': int_or_none(get_field_value('season')),
+ 'season_id': str_or_none(get_field_value('series_nid', 'nid')),
+ 'episode_number': int_or_none(get_field_value('episode')),
+ }
class GameSpotIE(OnceIE):
- _VALID_URL = r'https?://(?:www\.)?gamespot\.com/(?:video|article)s/(?:[^/]+/\d+-|embed/)(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?gamespot\.com/(?:video|article|review)s/(?:[^/]+/\d+-|embed/)(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/',
'md5': 'b2a30deaa8654fcccd43713a6b6a4825',
}, {
'url': 'https://www.gamespot.com/articles/the-last-of-us-2-receives-new-ps4-trailer/1100-6454469/',
'only_matching': True,
+ }, {
+ 'url': 'https://www.gamespot.com/reviews/gears-of-war-review/1900-6161188/',
+ 'only_matching': True,
}]
def _real_extract(self, url):
from .xfileshare import XFileShareIE
from .cloudflarestream import CloudflareStreamIE
from .peertube import PeerTubeIE
+from .teachable import TeachableIE
from .indavideo import IndavideoEmbedIE
from .apa import APAIE
from .foxnews import FoxNewsIE
from .viqeo import ViqeoIE
from .expressen import ExpressenIE
+from .zype import ZypeIE
class GenericIE(InfoExtractor):
},
'playlist_count': 6,
},
+ {
+ # Zype embed
+ 'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites',
+ 'info_dict': {
+ 'id': '5b400b834b32992a310622b9',
+ 'ext': 'mp4',
+ 'title': 'Smoky Barbecue Favorites',
+ 'thumbnail': r're:^https?://.*\.jpe?g',
+ },
+ 'add_ie': [ZypeIE.ie_key()],
+ 'params': {
+ 'skip_download': True,
+ },
+ },
{
# videojs embed
'url': 'https://video.sibnet.ru/shell.php?videoid=3422904',
def _real_extract(self, url):
if url.startswith('//'):
- return {
- '_type': 'url',
- 'url': self.http_scheme() + url,
- }
+ return self.url_result(self.http_scheme() + url)
parsed_url = compat_urlparse.urlparse(url)
if not parsed_url.scheme:
return self.playlist_from_matches(
peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key())
+ teachable_url = TeachableIE._extract_url(webpage, url)
+ if teachable_url:
+ return self.url_result(teachable_url)
+
indavideo_urls = IndavideoEmbedIE._extract_urls(webpage)
if indavideo_urls:
return self.playlist_from_matches(
return self.playlist_from_matches(
expressen_urls, video_id, video_title, ie=ExpressenIE.ie_key())
+ zype_urls = ZypeIE._extract_urls(webpage)
+ if zype_urls:
+ return self.playlist_from_matches(
+ zype_urls, video_id, video_title, ie=ZypeIE.ie_key())
+
# Look for HTML5 media
entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
if entries:
video_id = self._match_id(url)
gfy = self._download_json(
- 'http://gfycat.com/cajax/get/%s' % video_id,
+ 'https://api.gfycat.com/v1/gfycats/%s' % video_id,
video_id, 'Downloading video info')
if 'error' in gfy:
raise ExtractorError('Gfycat said: ' + gfy['error'], expected=True)
return
try:
- self._download_json(
+ glb_id = (self._download_json(
'https://login.globo.com/api/authentication', None, data=json.dumps({
'payload': {
'email': email,
},
}).encode(), headers={
'Content-Type': 'application/json; charset=utf-8',
- })
+ }) or {}).get('glbId')
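+ # set the GLBID cookie manually so that subsequent requests are authenticated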
+ if glb_id:
+ self._set_cookie('.globo.com', 'GLBID', glb_id)
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
resp = self._parse_json(e.cause.read(), None)
IE_NAME = 'hotstar'
_VALID_URL = r'https?://(?:www\.)?hotstar\.com/(?:.+?[/-])?(?P<id>\d{10})'
_TESTS = [{
+ # contentData
'url': 'https://www.hotstar.com/can-you-not-spread-rumours/1000076273',
'info_dict': {
'id': '1000076273',
# m3u8 download
'skip_download': True,
}
+ }, {
+ # contentDetail
+ 'url': 'https://www.hotstar.com/movies/radha-gopalam/1000057157',
+ 'only_matching': True,
}, {
'url': 'http://www.hotstar.com/sports/cricket/rajitha-sizzles-on-debut-with-329/2001477583',
'only_matching': True,
r'<script>window\.APP_STATE\s*=\s*({.+?})</script>',
webpage, 'app state'), video_id)
video_data = {}
+ getters = list(
+ lambda x, k=k: x['initialState']['content%s' % k]['content']
+ for k in ('Data', 'Detail')
+ )
for v in app_state.values():
- content = try_get(v, lambda x: x['initialState']['contentData']['content'], dict)
+ content = try_get(v, getters, dict)
if content and content.get('contentId') == video_id:
video_data = content
+ break
title = video_data['title']
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ urlencode_postdata,
+)
+
+
+class HungamaIE(InfoExtractor):
+ _VALID_URL = r'''(?x)
+ https?://
+ (?:www\.)?hungama\.com/
+ (?:
+ (?:video|movie)/[^/]+/|
+ tv-show/(?:[^/]+/){2}\d+/episode/[^/]+/
+ )
+ (?P<id>\d+)
+ '''
+ _TESTS = [{
+ 'url': 'http://www.hungama.com/video/krishna-chants/39349649/',
+ 'md5': 'a845a6d1ebd08d80c1035126d49bd6a0',
+ 'info_dict': {
+ 'id': '2931166',
+ 'ext': 'mp4',
+ 'title': 'Lucky Ali - Kitni Haseen Zindagi',
+ 'track': 'Kitni Haseen Zindagi',
+ 'artist': 'Lucky Ali',
+ 'album': 'Aks',
+ 'release_year': 2000,
+ }
+ }, {
+ 'url': 'https://www.hungama.com/movie/kahaani-2/44129919/',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.hungama.com/tv-show/padded-ki-pushup/season-1/44139461/episode/ep-02-training-sasu-pathlaag-karing/44139503/',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ info = self._search_json_ld(webpage, video_id)
+
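+ # the stream URL is served by an XHR endpoint, so mimic the player's request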
+ m3u8_url = self._download_json(
+ 'https://www.hungama.com/index.php', video_id,
+ data=urlencode_postdata({'content_id': video_id}), headers={
+ 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
+ 'X-Requested-With': 'XMLHttpRequest',
+ }, query={
+ 'c': 'common',
+ 'm': 'get_video_mdn_url',
+ })['stream_url']
+
+ formats = self._extract_m3u8_formats(
+ m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls')
+ self._sort_formats(formats)
+
+ info.update({
+ 'id': video_id,
+ 'formats': formats,
+ })
+ return info
+
+
+class HungamaSongIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?hungama\.com/song/[^/]+/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'https://www.hungama.com/song/kitni-haseen-zindagi/2931166/',
+ 'md5': 'a845a6d1ebd08d80c1035126d49bd6a0',
+ 'info_dict': {
+ 'id': '2931166',
+ 'ext': 'mp4',
+ 'title': 'Lucky Ali - Kitni Haseen Zindagi',
+ 'track': 'Kitni Haseen Zindagi',
+ 'artist': 'Lucky Ali',
+ 'album': 'Aks',
+ 'release_year': 2000,
+ }
+ }
+
+ def _real_extract(self, url):
+ audio_id = self._match_id(url)
+
+ data = self._download_json(
+ 'https://www.hungama.com/audio-player-data/track/%s' % audio_id,
+ audio_id, query={'_country': 'IN'})[0]
+
+ track = data['song_name']
+ artist = data.get('singer_name')
+
+ m3u8_url = self._download_json(
+ data.get('file') or data['preview_link'],
+ audio_id)['response']['media_url']
+
+ formats = self._extract_m3u8_formats(
+ m3u8_url, audio_id, ext='mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls')
+ self._sort_formats(formats)
+
+ title = '%s - %s' % (artist, track) if artist else track
+ thumbnail = data.get('img_src') or data.get('album_image')
+
+ return {
+ 'id': audio_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'track': track,
+ 'artist': artist,
+ 'album': data.get('album_name'),
+ 'release_year': int_or_none(data.get('date')),
+ 'formats': formats,
+ }
class ImgurIE(InfoExtractor):
- _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:gallery|(?:topic|r)/[^/]+)/)?(?P<id>[a-zA-Z0-9]{6,})(?:[/?#&]+|\.[a-z0-9]+)?$'
+ _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?!(?:a|gallery|(?:t(?:opic)?|r)/[^/]+)/)(?P<id>[a-zA-Z0-9]+)'
_TESTS = [{
'url': 'https://i.imgur.com/A61SaA1.gifv',
'id': 'A61SaA1',
'ext': 'mp4',
'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$',
- 'description': 'Imgur: The magic of the Internet',
},
}, {
'url': 'https://imgur.com/A61SaA1',
- 'info_dict': {
- 'id': 'A61SaA1',
- 'ext': 'mp4',
- 'title': 're:Imgur GIF$|MRW gifv is up and running without any bugs$',
- 'description': 'Imgur: The magic of the Internet',
- },
- }, {
- 'url': 'https://imgur.com/gallery/YcAQlkx',
- 'info_dict': {
- 'id': 'YcAQlkx',
- 'ext': 'mp4',
- 'title': 'Classic Steve Carell gif...cracks me up everytime....damn the repost downvotes....',
- }
- }, {
- 'url': 'http://imgur.com/topic/Funny/N8rOudd',
- 'only_matching': True,
- }, {
- 'url': 'http://imgur.com/r/aww/VQcQPhM',
'only_matching': True,
}, {
'url': 'https://i.imgur.com/crGpqCV.mp4',
def _real_extract(self, url):
video_id = self._match_id(url)
- gifv_url = 'https://i.imgur.com/{id}.gifv'.format(id=video_id)
- webpage = self._download_webpage(gifv_url, video_id)
+ webpage = self._download_webpage(
+ 'https://i.imgur.com/{id}.gifv'.format(id=video_id), video_id)
width = int_or_none(self._og_search_property(
'video:width', webpage, default=None))
'format_id': m.group('type').partition('/')[2],
'url': self._proto_relative_url(m.group('src')),
'ext': mimetype2ext(m.group('type')),
- 'acodec': 'none',
'width': width,
'height': height,
'http_headers': {
return {
'id': video_id,
'formats': formats,
- 'description': self._og_search_description(webpage, default=None),
'title': self._og_search_title(webpage),
}
-class ImgurAlbumIE(InfoExtractor):
- _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:(?:a|gallery|topic/[^/]+)/)?(?P<id>[a-zA-Z0-9]{5})(?:[/?#&]+)?$'
+class ImgurGalleryIE(InfoExtractor):
+ IE_NAME = 'imgur:gallery'
+ _VALID_URL = r'https?://(?:i\.)?imgur\.com/(?:gallery|(?:t(?:opic)?|r)/[^/]+)/(?P<id>[a-zA-Z0-9]+)'
_TESTS = [{
'url': 'http://imgur.com/gallery/Q95ko',
'info_dict': {
'id': 'Q95ko',
+ 'title': 'Adding faces make every GIF better',
},
'playlist_count': 25,
}, {
- 'url': 'http://imgur.com/a/j6Orj',
+ 'url': 'http://imgur.com/topic/Aww/ll5Vk',
'only_matching': True,
}, {
- 'url': 'http://imgur.com/topic/Aww/ll5Vk',
+ 'url': 'https://imgur.com/gallery/YcAQlkx',
+ 'info_dict': {
+ 'id': 'YcAQlkx',
+ 'ext': 'mp4',
+ 'title': 'Classic Steve Carell gif...cracks me up everytime....damn the repost downvotes....',
+ }
+ }, {
+ 'url': 'http://imgur.com/topic/Funny/N8rOudd',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://imgur.com/r/aww/VQcQPhM',
'only_matching': True,
}]
def _real_extract(self, url):
- album_id = self._match_id(url)
-
- album_images = self._download_json(
- 'http://imgur.com/gallery/%s/album_images/hit.json?all=true' % album_id,
- album_id, fatal=False)
-
- if album_images:
- data = album_images.get('data')
- if data and isinstance(data, dict):
- images = data.get('images')
- if images and isinstance(images, list):
- entries = [
- self.url_result('http://imgur.com/%s' % image['hash'])
- for image in images if image.get('hash')]
- return self.playlist_result(entries, album_id)
-
- # Fallback to single video
- return self.url_result('http://imgur.com/%s' % album_id, ImgurIE.ie_key())
+ gallery_id = self._match_id(url)
+
+ data = self._download_json(
+ 'https://imgur.com/gallery/%s.json' % gallery_id,
+ gallery_id)['data']['image']
+
+ if data.get('is_album'):
+ entries = [
+ self.url_result('http://imgur.com/%s' % image['hash'], ImgurIE.ie_key(), image['hash'])
+ for image in data['album_images']['images'] if image.get('hash')]
+ return self.playlist_result(entries, gallery_id, data.get('title'), data.get('description'))
+
+ return self.url_result('http://imgur.com/%s' % gallery_id, ImgurIE.ie_key(), gallery_id)
+
+
+class ImgurAlbumIE(ImgurGalleryIE):
+ IE_NAME = 'imgur:album'
+ _VALID_URL = r'https?://(?:i\.)?imgur\.com/a/(?P<id>[a-zA-Z0-9]+)'
+
+ _TESTS = [{
+ 'url': 'http://imgur.com/a/j6Orj',
+ 'info_dict': {
+ 'id': 'j6Orj',
+ 'title': 'A Literary Analysis of "Star Wars: The Force Awakens"',
+ },
+ 'playlist_count': 12,
+ }]
class IPrimaIE(InfoExtractor):
- _VALID_URL = r'https?://(?:play|prima)\.iprima\.cz/(?:.+/)?(?P<id>[^?#]+)'
+ _VALID_URL = r'https?://(?:[^/]+)\.iprima\.cz/(?:[^/]+/)*(?P<id>[^/?#&]+)'
_GEO_BYPASS = False
_TESTS = [{
# iframe prima.iprima.cz
'url': 'https://prima.iprima.cz/porady/jak-se-stavi-sen/rodina-rathousova-praha',
'only_matching': True,
+ }, {
+ 'url': 'http://www.iprima.cz/filmy/desne-rande',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://zoom.iprima.cz/10-nejvetsich-tajemstvi-zahad/posvatna-mista-a-stavby',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://krimi.iprima.cz/mraz-0/sebevrazdy',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://cool.iprima.cz/derava-silnice-nevadi',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://love.iprima.cz/laska-az-za-hrob/slib-dany-bratrovi',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://autosalon.iprima.cz/motorsport/7-epizoda-1',
+ 'only_matching': True,
}]
def _real_extract(self, url):

bitrates = self._parse_json(
self._search_regex(
- r'(?s)bitrates\s*=\s*({.+?});', webpage, 'bitrates',
+ r'(?s)(?:src|bitrates)\s*=\s*({.+?});', webpage, 'bitrates',
default='{}'),
video_id, transform_source=js_to_json, fatal=False)

class JWPlatformIE(InfoExtractor):
- _VALID_URL = r'(?:https?://content\.jwplatform\.com/(?:feeds|players|jw6)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})'
- _TEST = {
+ _VALID_URL = r'(?:https?://(?:content\.jwplatform|cdn\.jwplayer)\.com/(?:(?:feed|player|thumb|preview|video|manifest)s|jw6|v2/media)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})'
+ _TESTS = [{
'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js',
'md5': 'fa8899fa601eb7c83a64e9d568bdf325',
'info_dict': {
'upload_date': '20081127',
'timestamp': 1227796140,
}
- }
+ }, {
+ 'url': 'https://cdn.jwplayer.com/players/nPripu9l-ALJ3XQCI.js',
+ 'only_matching': True,
+ }]
@staticmethod
def _extract_url(webpage):
def _real_extract(self, url):
video_id = self._match_id(url)
- json_data = self._download_json('http://content.jwplatform.com/feeds/%s.json' % video_id, video_id)
+ json_data = self._download_json('https://cdn.jwplayer.com/v2/media/' + video_id, video_id)
return self._parse_jwplayer_data(json_data, video_id)
'entryId': video_id,
'service': 'baseentry',
'ks': '{1:result:ks}',
+ 'responseProfile:fields': 'createdAt,dataUrl,duration,name,plays,thumbnailUrl,userId',
+ 'responseProfile:type': 1,
},
{
'action': 'getbyentryid',
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ determine_ext,
+ extract_attributes,
+ ExtractorError,
+ float_or_none,
+ int_or_none,
+ str_or_none,
+ url_or_none,
+ urlencode_postdata,
+ urljoin,
+)
+
+
+class LecturioBaseIE(InfoExtractor):
+ _LOGIN_URL = 'https://app.lecturio.com/en/login'
+ _NETRC_MACHINE = 'lecturio'
+
+ def _real_initialize(self):
+ self._login()
+
+ def _login(self):
+ username, password = self._get_login_info()
+ if username is None:
+ return
+
+ # Sets some cookies
+ _, urlh = self._download_webpage_handle(
+ self._LOGIN_URL, None, 'Downloading login popup')
+
+ def is_logged(url_handle):
+ return self._LOGIN_URL not in compat_str(url_handle.geturl())
+
+ # Already logged in
+ if is_logged(urlh):
+ return
+
+ login_form = {
+ 'signin[email]': username,
+ 'signin[password]': password,
+ 'signin[remember]': 'on',
+ }
+
+ response, urlh = self._download_webpage_handle(
+ self._LOGIN_URL, None, 'Logging in',
+ data=urlencode_postdata(login_form))
+
+ # Logged in successfully
+ if is_logged(urlh):
+ return
+
+ errors = self._html_search_regex(
+ r'(?s)<ul[^>]+class=["\']error_list[^>]+>(.+?)</ul>', response,
+ 'errors', default=None)
+ if errors:
+ raise ExtractorError('Unable to login: %s' % errors, expected=True)
+ raise ExtractorError('Unable to log in')
+
+
+class LecturioIE(LecturioBaseIE):
+ _VALID_URL = r'''(?x)
+ https://
+ (?:
+ app\.lecturio\.com/[^/]+/(?P<id>[^/?#&]+)\.lecture|
+ (?:www\.)?lecturio\.de/[^/]+/(?P<id_de>[^/?#&]+)\.vortrag
+ )
+ '''
+ _TESTS = [{
+ 'url': 'https://app.lecturio.com/medical-courses/important-concepts-and-terms-introduction-to-microbiology.lecture#tab/videos',
+ 'md5': 'f576a797a5b7a5e4e4bbdfc25a6a6870',
+ 'info_dict': {
+ 'id': '39634',
+ 'ext': 'mp4',
+ 'title': 'Important Concepts and Terms – Introduction to Microbiology',
+ },
+ 'skip': 'Requires lecturio account credentials',
+ }, {
+ 'url': 'https://www.lecturio.de/jura/oeffentliches-recht-staatsexamen.vortrag',
+ 'only_matching': True,
+ }]
+
+ _CC_LANGS = {
+ 'German': 'de',
+ 'English': 'en',
+ 'Spanish': 'es',
+ 'French': 'fr',
+ 'Polish': 'pl',
+ 'Russian': 'ru',
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ display_id = mobj.group('id') or mobj.group('id_de')
+
+ webpage = self._download_webpage(
+ 'https://app.lecturio.com/en/lecture/%s/player.html' % display_id,
+ display_id)
+
+ lecture_id = self._search_regex(
+ r'lecture_id\s*=\s*(?:L_)?(\d+)', webpage, 'lecture id')
+
+ api_url = self._search_regex(
+ r'lectureDataLink\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
+ 'api url', group='url')
+
+ video = self._download_json(api_url, display_id)
+
+ title = video['title'].strip()
+
+ formats = []
+ for format_ in video['content']['media']:
+ if not isinstance(format_, dict):
+ continue
+ file_ = format_.get('file')
+ if not file_:
+ continue
+ ext = determine_ext(file_)
+ if ext == 'smil':
+ # smil contains only broken RTMP formats anyway
+ continue
+ file_url = url_or_none(file_)
+ if not file_url:
+ continue
+ label = str_or_none(format_.get('label'))
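+ # fileSize appears to be reported in kB, hence the invscale below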
+ filesize = int_or_none(format_.get('fileSize'))
+ formats.append({
+ 'url': file_url,
+ 'format_id': label,
+ 'filesize': float_or_none(filesize, invscale=1000)
+ })
+ self._sort_formats(formats)
+
+ subtitles = {}
+ automatic_captions = {}
+ cc = self._parse_json(
+ self._search_regex(
+ r'subtitleUrls\s*:\s*({.+?})\s*,', webpage, 'subtitles',
+ default='{}'), display_id, fatal=False) or {}
+ for cc_label, cc_url in cc.items():
+ cc_url = url_or_none(cc_url)
+ if not cc_url:
+ continue
+ lang = self._search_regex(
+ r'/([a-z]{2})_', cc_url, 'lang',
+ default=cc_label.split()[0] if cc_label else 'en')
+ original_lang = self._search_regex(
+ r'/[a-z]{2}_([a-z]{2})_', cc_url, 'original lang',
+ default=None)
+ sub_dict = (automatic_captions
+ if 'auto-translated' in cc_label or original_lang
+ else subtitles)
+ sub_dict.setdefault(self._CC_LANGS.get(lang, lang), []).append({
+ 'url': cc_url,
+ })
+
+ return {
+ 'id': lecture_id,
+ 'title': title,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'automatic_captions': automatic_captions,
+ }
+
+
+class LecturioCourseIE(LecturioBaseIE):
+ _VALID_URL = r'https://app\.lecturio\.com/[^/]+/(?P<id>[^/?#&]+)\.course'
+ _TEST = {
+ 'url': 'https://app.lecturio.com/medical-courses/microbiology-introduction.course#/',
+ 'info_dict': {
+ 'id': 'microbiology-introduction',
+ 'title': 'Microbiology: Introduction',
+ },
+ 'playlist_count': 45,
+ 'skip': 'Requires lecturio account credentials',
+ }
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, display_id)
+
+ entries = []
+ for mobj in re.finditer(
+ r'(?s)<[^>]+\bdata-url=(["\'])(?:(?!\1).)+\.lecture\b[^>]+>',
+ webpage):
+ params = extract_attributes(mobj.group(0))
+ lecture_url = urljoin(url, params.get('data-url'))
+ lecture_id = params.get('data-id')
+ entries.append(self.url_result(
+ lecture_url, ie=LecturioIE.ie_key(), video_id=lecture_id))
+
+ title = self._search_regex(
+ r'<span[^>]+class=["\']content-title[^>]+>([^<]+)', webpage,
+ 'title', default=None)
+
+ return self.playlist_result(entries, display_id, title)
+
+
+class LecturioDeCourseIE(LecturioBaseIE):
+ _VALID_URL = r'https://(?:www\.)?lecturio\.de/[^/]+/(?P<id>[^/?#&]+)\.kurs'
+ _TEST = {
+ 'url': 'https://www.lecturio.de/jura/grundrechte.kurs',
+ 'only_matching': True,
+ }
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, display_id)
+
+ entries = []
+ for mobj in re.finditer(
+ r'(?s)<td[^>]+\bdata-lecture-id=["\'](?P<id>\d+).+?\bhref=(["\'])(?P<url>(?:(?!\2).)+\.vortrag)\b[^>]+>',
+ webpage):
+ lecture_url = urljoin(url, mobj.group('url'))
+ lecture_id = mobj.group('id')
+ entries.append(self.url_result(
+ lecture_url, ie=LecturioIE.ie_key(), video_id=lecture_id))
+
+ title = self._search_regex(
+ r'<h1[^>]*>([^<]+)', webpage, 'title', default=None)
+
+ return self.playlist_result(entries, display_id, title)
class LibraryOfCongressIE(InfoExtractor):
IE_NAME = 'loc'
IE_DESC = 'Library of Congress'
- _VALID_URL = r'https?://(?:www\.)?loc\.gov/(?:item/|today/cyberlc/feature_wdesc\.php\?.*\brec=)(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?loc\.gov/(?:item/|today/cyberlc/feature_wdesc\.php\?.*\brec=)(?P<id>[0-9a-z_.]+)'
_TESTS = [{
# embedded via <div class="media-player"
'url': 'http://loc.gov/item/90716351/',
- 'md5': '353917ff7f0255aa6d4b80a034833de8',
+ 'md5': '6ec0ae8f07f86731b1b2ff70f046210a',
'info_dict': {
'id': '90716351',
'ext': 'mp4',
'title': "Pa's trip to Mars",
- 'thumbnail': r're:^https?://.*\.jpg$',
'duration': 0,
'view_count': int,
},
'params': {
'skip_download': True,
},
+ }, {
+ 'url': 'https://www.loc.gov/item/ihas.200197114/',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.loc.gov/item/afc1981005_afs20503/',
+ 'only_matching': True,
}]
def _real_extract(self, url):
(r'id=(["\'])media-player-(?P<id>.+?)\1',
r'<video[^>]+id=(["\'])uuid-(?P<id>.+?)\1',
r'<video[^>]+data-uuid=(["\'])(?P<id>.+?)\1',
- r'mediaObjectId\s*:\s*(["\'])(?P<id>.+?)\1'),
+ r'mediaObjectId\s*:\s*(["\'])(?P<id>.+?)\1',
+ r'data-tab="share-media-(?P<id>[0-9A-F]{32})"'),
webpage, 'media id', group='id')
data = self._download_json(
'https://media.loc.gov/services/v1/media?id=%s&context=json' % media_id,
- video_id)['mediaObject']
+ media_id)['mediaObject']
derivative = data['derivatives'][0]
media_url = derivative['derivativeUrl']
if ext not in ('mp4', 'mp3'):
media_url += '.mp4' if is_video else '.mp3'
- if 'vod/mp4:' in media_url:
- formats = [{
- 'url': media_url.replace('vod/mp4:', 'hls-vod/media/') + '.m3u8',
+ formats = []
+ if '/vod/mp4:' in media_url:
+ formats.append({
+ 'url': media_url.replace('/vod/mp4:', '/hls-vod/media/') + '.m3u8',
'format_id': 'hls',
'ext': 'mp4',
'protocol': 'm3u8_native',
'quality': 1,
- }]
- elif 'vod/mp3:' in media_url:
- formats = [{
- 'url': media_url.replace('vod/mp3:', ''),
- 'vcodec': 'none',
- }]
+ })
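+ # strip the streaming application path (e.g. vod/mp4:) to get a direct HTTP URL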
+ http_format = {
+ 'url': re.sub(r'(://[^/]+/)(?:[^/]+/)*(?:mp4|mp3):', r'\1', media_url),
+ 'format_id': 'http',
+ 'quality': 1,
+ }
+ if not is_video:
+ http_format['vcodec'] = 'none'
+ formats.append(http_format)
download_urls = set()
for m in re.finditer(
r'<option[^>]+value=(["\'])(?P<url>.+?)\1[^>]+data-file-download=[^>]+>\s*(?P<id>.+?)(?:(?: |\s+)\((?P<size>.+?)\))?\s*<', webpage):
format_id = m.group('id').lower()
- if format_id == 'gif':
+ if format_id in ('gif', 'jpeg'):
continue
download_url = m.group('url')
if download_url in download_urls:
@staticmethod
def _extract_urls(webpage):
return re.findall(
- r'<iframe[^>]+src="(https?://(?:\w+\.)?liveleak\.com/ll_embed\?[^"]*[if]=[\w_]+[^"]+)"',
+ r'<iframe[^>]+src="(https?://(?:\w+\.)?liveleak\.com/ll_embed\?[^"]*[ift]=[\w_]+[^"]+)"',
webpage)
def _real_extract(self, url):
}
for idx, info_dict in enumerate(entries):
+ formats = []
for a_format in info_dict['formats']:
if not a_format.get('height'):
a_format['height'] = int_or_none(self._search_regex(
r'([0-9]+)p\.mp4', a_format['url'], 'height label',
default=None))
-
- self._sort_formats(info_dict['formats'])
+ formats.append(a_format)
+
+ # Removing '.*.mp4' gives the raw video, which is essentially
+ # the same video without the LiveLeak logo at the top (see
+ # https://github.com/rg3/youtube-dl/pull/4768)
+ orig_url = re.sub(r'\.mp4\.[^.]+', '', a_format['url'])
+ if a_format['url'] != orig_url:
+ format_id = a_format.get('format_id')
+ formats.append({
+ 'format_id': 'original' + ('-' + format_id if format_id else ''),
+ 'url': orig_url,
+ 'preference': 1,
+ })
+ self._sort_formats(formats)
+ info_dict['formats'] = formats
# Don't append entry ID for one-video pages to keep backward compatibility
if len(entries) > 1:
class LiveLeakEmbedIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?liveleak\.com/ll_embed\?.*?\b(?P<kind>[if])=(?P<id>[\w_]+)'
+ _VALID_URL = r'https?://(?:www\.)?liveleak\.com/ll_embed\?.*?\b(?P<kind>[ift])=(?P<id>[\w_]+)'
# See generic.py for actual test cases
_TESTS = [{
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- kind, video_id = mobj.group('kind', 'id')
+ kind, video_id = re.match(self._VALID_URL, url).groups()
if kind == 'f':
webpage = self._download_webpage(url, video_id)
liveleak_url = self._search_regex(
- r'logourl\s*:\s*(?P<q1>[\'"])(?P<url>%s)(?P=q1)' % LiveLeakIE._VALID_URL,
+ r'(?:logourl\s*:\s*|window\.open\()(?P<q1>[\'"])(?P<url>%s)(?P=q1)' % LiveLeakIE._VALID_URL,
webpage, 'LiveLeak URL', group='url')
- elif kind == 'i':
- liveleak_url = 'http://www.liveleak.com/view?i=%s' % video_id
+ else:
+ liveleak_url = 'http://www.liveleak.com/view?%s=%s' % (kind, video_id)
return self.url_result(liveleak_url, ie=LiveLeakIE.ie_key())
id = mobj.group('id')
webpage = self._download_webpage(url, id)
- return {
- '_type': 'url',
- 'url': self._og_search_url(webpage),
- }
+ return self.url_result(self._og_search_url(webpage))
class LyndaBaseIE(InfoExtractor):
- _SIGNIN_URL = 'https://www.lynda.com/signin'
+ _SIGNIN_URL = 'https://www.lynda.com/signin/lynda'
_PASSWORD_URL = 'https://www.lynda.com/signin/password'
_USER_URL = 'https://www.lynda.com/signin/user'
_ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.'
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import int_or_none
+from ..utils import (
+ determine_ext,
+ int_or_none,
+ str_to_int,
+ urlencode_postdata,
+)
class ManyVidsIE(InfoExtractor):
_VALID_URL = r'(?i)https?://(?:www\.)?manyvids\.com/video/(?P<id>\d+)'
- _TEST = {
+ _TESTS = [{
+ # preview video
'url': 'https://www.manyvids.com/Video/133957/everthing-about-me/',
'md5': '03f11bb21c52dd12a05be21a5c7dcc97',
'info_dict': {
'view_count': int,
'like_count': int,
},
- }
+ }, {
+ # full video
+ 'url': 'https://www.manyvids.com/Video/935718/MY-FACE-REVEAL/',
+ 'md5': 'f3e8f7086409e9b470e2643edb96bdcc',
+ 'info_dict': {
+ 'id': '935718',
+ 'ext': 'mp4',
+ 'title': 'MY FACE REVEAL',
+ 'view_count': int,
+ 'like_count': int,
+ },
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
r'data-(?:video-filepath|meta-video)\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1',
webpage, 'video URL', group='url')
- title = '%s (Preview)' % self._html_search_regex(
- r'<h2[^>]+class="m-a-0"[^>]*>([^<]+)', webpage, 'title')
+ title = self._html_search_regex(
+ (r'<span[^>]+class=["\']item-title[^>]+>([^<]+)',
+ r'<h2[^>]+class=["\']h2 m-0["\'][^>]*>([^<]+)'),
+ webpage, 'title', default=None) or self._html_search_meta(
+ 'twitter:title', webpage, 'title', fatal=True)
+
+ if any(p in webpage for p in ('preview_videos', '_preview.mp4')):
+ title += ' (Preview)'
+
+ mv_token = self._search_regex(
+ r'data-mvtoken=(["\'])(?P<value>(?:(?!\1).)+)\1', webpage,
+ 'mv token', default=None, group='value')
+
+ if mv_token:
+ # Sets some cookies
+ self._download_webpage(
+ 'https://www.manyvids.com/includes/ajax_repository/you_had_me_at_hello.php',
+ video_id, fatal=False, data=urlencode_postdata({
+ 'mvtoken': mv_token,
+ 'vid': video_id,
+ }), headers={
+ 'Referer': url,
+ 'X-Requested-With': 'XMLHttpRequest'
+ })
+
+ if determine_ext(video_url) == 'm3u8':
+ formats = self._extract_m3u8_formats(
+ video_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls')
+ else:
+ formats = [{'url': video_url}]
like_count = int_or_none(self._search_regex(
r'data-likes=["\'](\d+)', webpage, 'like count', default=None))
- view_count = int_or_none(self._html_search_regex(
+ view_count = str_to_int(self._html_search_regex(
r'(?s)<span[^>]+class="views-wrapper"[^>]*>(.+?)</span', webpage,
'view count', default=None))
'title': title,
'view_count': view_count,
'like_count': like_count,
- 'formats': [{
- 'url': video_url,
- }],
+ 'formats': formats,
}
class MediasiteIE(InfoExtractor):
- _VALID_URL = r'(?xi)https?://[^/]+/Mediasite/Play/(?P<id>[0-9a-f]{32,34})(?P<query>\?[^#]+|)'
+ _VALID_URL = r'(?xi)https?://[^/]+/Mediasite/(?:Play|Showcase/(?:default|livebroadcast)/Presentation)/(?P<id>[0-9a-f]{32,34})(?P<query>\?[^#]+|)'
_TESTS = [
{
'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271681e4f199af3c60d1f82869b1d',
'timestamp': 1333983600,
'duration': 7794,
}
- }
+ },
+ {
+ 'url': 'https://collegerama.tudelft.nl/Mediasite/Showcase/livebroadcast/Presentation/ada7020854f743c49fbb45c9ec7dbb351d',
+ 'only_matching': True,
+ },
+ {
+ 'url': 'https://mediasite.ntnu.no/Mediasite/Showcase/default/Presentation/7d8b913259334b688986e970fae6fcb31d',
+ 'only_matching': True,
+ },
]
# look in Mediasite.Core.js (Mediasite.ContentStreamType[*])
stream_info = info_json['streamInfo']
formats = []
+ def decrypt_url(f_url):
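+ # try the page-provided key first, then the known static fallback key, and
+ # accept the result only if it decodes to a plausible URL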
+ for k in (key, 'IFYOUWANTTHEARTISTSTOGETPAIDDONOTDOWNLOADFROMMIXCLOUD'):
+ decrypted_url = self._decrypt_xor_cipher(k, f_url)
+ if re.search(r'^https?://[0-9a-z.]+/[0-9A-Za-z/.?=&_-]+$', decrypted_url):
+ return decrypted_url
+
for url_key in ('url', 'hlsUrl', 'dashUrl'):
format_url = stream_info.get(url_key)
if not format_url:
continue
- decrypted = self._decrypt_xor_cipher(key, compat_b64decode(format_url))
+ decrypted = decrypt_url(compat_b64decode(format_url))
if not decrypted:
continue
if url_key == 'hlsUrl':
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from .adobepass import AdobePassIE
-from .theplatform import ThePlatformIE
from ..utils import (
smuggle_url,
url_basename,
- update_url_query,
- get_element_by_class,
)
{'force_smil_url': True}),
'id': guid,
}
-
-
-class NationalGeographicIE(ThePlatformIE, AdobePassIE):
- IE_NAME = 'natgeo'
- _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:(?:(?:wild/)?[^/]+/)?(?:videos|episodes)|u)/(?P<id>[^/?]+)'
-
- _TESTS = [
- {
- 'url': 'http://channel.nationalgeographic.com/u/kdi9Ld0PN2molUUIMSBGxoeDhD729KRjQcnxtetilWPMevo8ZwUBIDuPR0Q3D2LVaTsk0MPRkRWDB8ZhqWVeyoxfsZZm36yRp1j-zPfsHEyI_EgAeFY/',
- 'md5': '518c9aa655686cf81493af5cc21e2a04',
- 'info_dict': {
- 'id': 'vKInpacll2pC',
- 'ext': 'mp4',
- 'title': 'Uncovering a Universal Knowledge',
- 'description': 'md5:1a89148475bf931b3661fcd6ddb2ae3a',
- 'timestamp': 1458680907,
- 'upload_date': '20160322',
- 'uploader': 'NEWA-FNG-NGTV',
- },
- 'add_ie': ['ThePlatform'],
- },
- {
- 'url': 'http://channel.nationalgeographic.com/u/kdvOstqYaBY-vSBPyYgAZRUL4sWUJ5XUUPEhc7ISyBHqoIO4_dzfY3K6EjHIC0hmFXoQ7Cpzm6RkET7S3oMlm6CFnrQwSUwo/',
- 'md5': 'c4912f656b4cbe58f3e000c489360989',
- 'info_dict': {
- 'id': 'Pok5lWCkiEFA',
- 'ext': 'mp4',
- 'title': 'The Stunning Red Bird of Paradise',
- 'description': 'md5:7bc8cd1da29686be4d17ad1230f0140c',
- 'timestamp': 1459362152,
- 'upload_date': '20160330',
- 'uploader': 'NEWA-FNG-NGTV',
- },
- 'add_ie': ['ThePlatform'],
- },
- {
- 'url': 'http://channel.nationalgeographic.com/the-story-of-god-with-morgan-freeman/episodes/the-power-of-miracles/',
- 'only_matching': True,
- },
- {
- 'url': 'http://channel.nationalgeographic.com/videos/treasures-rediscovered/',
- 'only_matching': True,
- },
- {
- 'url': 'http://channel.nationalgeographic.com/the-story-of-god-with-morgan-freeman/videos/uncovering-a-universal-knowledge/',
- 'only_matching': True,
- },
- {
- 'url': 'http://channel.nationalgeographic.com/wild/destination-wild/videos/the-stunning-red-bird-of-paradise/',
- 'only_matching': True,
- }
- ]
-
- def _real_extract(self, url):
- display_id = self._match_id(url)
- webpage = self._download_webpage(url, display_id)
- release_url = self._search_regex(
- r'video_auth_playlist_url\s*=\s*"([^"]+)"',
- webpage, 'release url')
- theplatform_path = self._search_regex(r'https?://link\.theplatform\.com/s/([^?]+)', release_url, 'theplatform path')
- video_id = theplatform_path.split('/')[-1]
- query = {
- 'mbr': 'true',
- }
- is_auth = self._search_regex(r'video_is_auth\s*=\s*"([^"]+)"', webpage, 'is auth', fatal=False)
- if is_auth == 'auth':
- auth_resource_id = self._search_regex(
- r"video_auth_resourceId\s*=\s*'([^']+)'",
- webpage, 'auth resource id')
- query['auth'] = self._extract_mvpd_auth(url, video_id, 'natgeo', auth_resource_id)
-
- formats = []
- subtitles = {}
- for key, value in (('switch', 'http'), ('manifest', 'm3u')):
- tp_query = query.copy()
- tp_query.update({
- key: value,
- })
- tp_formats, tp_subtitles = self._extract_theplatform_smil(
- update_url_query(release_url, tp_query), video_id, 'Downloading %s SMIL data' % value)
- formats.extend(tp_formats)
- subtitles = self._merge_subtitles(subtitles, tp_subtitles)
- self._sort_formats(formats)
-
- info = self._extract_theplatform_metadata(theplatform_path, display_id)
- info.update({
- 'id': video_id,
- 'formats': formats,
- 'subtitles': subtitles,
- 'display_id': display_id,
- })
- return info
-
-
-class NationalGeographicEpisodeGuideIE(InfoExtractor):
- IE_NAME = 'natgeo:episodeguide'
- _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:wild/)?(?P<id>[^/]+)/episode-guide'
- _TESTS = [
- {
- 'url': 'http://channel.nationalgeographic.com/the-story-of-god-with-morgan-freeman/episode-guide/',
- 'info_dict': {
- 'id': 'the-story-of-god-with-morgan-freeman-season-1',
- 'title': 'The Story of God with Morgan Freeman - Season 1',
- },
- 'playlist_mincount': 6,
- },
- {
- 'url': 'http://channel.nationalgeographic.com/underworld-inc/episode-guide/?s=2',
- 'info_dict': {
- 'id': 'underworld-inc-season-2',
- 'title': 'Underworld, Inc. - Season 2',
- },
- 'playlist_mincount': 7,
- },
- ]
-
- def _real_extract(self, url):
- display_id = self._match_id(url)
- webpage = self._download_webpage(url, display_id)
- show = get_element_by_class('show', webpage)
- selected_season = self._search_regex(
- r'<div[^>]+class="select-seasons[^"]*".*?<a[^>]*>(.*?)</a>',
- webpage, 'selected season')
- entries = [
- self.url_result(self._proto_relative_url(entry_url), 'NationalGeographic')
- for entry_url in re.findall('(?s)<div[^>]+class="col-inner"[^>]*?>.*?<a[^>]+href="([^"]+)"', webpage)]
- return self.playlist_result(
- entries, '%s-%s' % (display_id, selected_season.lower().replace(' ', '-')),
- '%s - %s' % (show, selected_season))
from .adobepass import AdobePassIE
from ..compat import compat_urllib_parse_unquote
from ..utils import (
- find_xpath_attr,
smuggle_url,
try_get,
- unescapeHTML,
update_url_query,
int_or_none,
)
class NBCNewsIE(ThePlatformIE):
- _VALID_URL = r'''(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/
- (?:video/.+?/(?P<id>\d+)|
- ([^/]+/)*(?:.*-)?(?P<mpx_id>[^/?]+))
- '''
+ _VALID_URL = r'(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/([^/]+/)*(?:.*-)?(?P<id>[^/?]+)'
_TESTS = [
- {
- 'url': 'http://www.nbcnews.com/video/nbc-news/52753292',
- 'md5': '47abaac93c6eaf9ad37ee6c4463a5179',
- 'info_dict': {
- 'id': '52753292',
- 'ext': 'flv',
- 'title': 'Crew emerges after four-month Mars food study',
- 'description': 'md5:24e632ffac72b35f8b67a12d1b6ddfc1',
- },
- },
{
'url': 'http://www.nbcnews.com/watch/nbcnews-com/how-twitter-reacted-to-the-snowden-interview-269389891880',
'md5': 'af1adfa51312291a017720403826bb64',
'info_dict': {
- 'id': 'p_tweet_snow_140529',
+ 'id': '269389891880',
'ext': 'mp4',
'title': 'How Twitter Reacted To The Snowden Interview',
'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64',
'url': 'http://www.nbcnews.com/nightly-news/video/nightly-news-with-brian-williams-full-broadcast-february-4-394064451844',
'md5': '73135a2e0ef819107bbb55a5a9b2a802',
'info_dict': {
- 'id': 'nn_netcast_150204',
+ 'id': '394064451844',
'ext': 'mp4',
'title': 'Nightly News with Brian Williams Full Broadcast (February 4)',
'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5',
'url': 'http://www.nbcnews.com/business/autos/volkswagen-11-million-vehicles-could-have-suspect-software-emissions-scandal-n431456',
'md5': 'a49e173825e5fcd15c13fc297fced39d',
'info_dict': {
- 'id': 'x_lon_vwhorn_150922',
+ 'id': '529953347624',
'ext': 'mp4',
'title': 'Volkswagen U.S. Chief:\xa0 We Have Totally Screwed Up',
'description': 'md5:c8be487b2d80ff0594c005add88d8351',
'url': 'http://www.today.com/video/see-the-aurora-borealis-from-space-in-stunning-new-nasa-video-669831235788',
'md5': '118d7ca3f0bea6534f119c68ef539f71',
'info_dict': {
- 'id': 'tdy_al_space_160420',
+ 'id': '669831235788',
'ext': 'mp4',
'title': 'See the aurora borealis from space in stunning new NASA video',
'description': 'md5:74752b7358afb99939c5f8bb2d1d04b1',
'url': 'http://www.msnbc.com/all-in-with-chris-hayes/watch/the-chaotic-gop-immigration-vote-314487875924',
'md5': '6d236bf4f3dddc226633ce6e2c3f814d',
'info_dict': {
- 'id': 'n_hayes_Aimm_140801_272214',
+ 'id': '314487875924',
'ext': 'mp4',
'title': 'The chaotic GOP immigration vote',
'description': 'The Republican House votes on a border bill that has no chance of getting through the Senate or signed by the President and is drawing criticism from all sides.',
]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- if video_id is not None:
- all_info = self._download_xml('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id)
- info = all_info.find('video')
-
- return {
- 'id': video_id,
- 'title': info.find('headline').text,
- 'ext': 'flv',
- 'url': find_xpath_attr(info, 'media', 'type', 'flashVideo').text,
- 'description': info.find('caption').text,
- 'thumbnail': find_xpath_attr(info, 'media', 'type', 'thumbnail').text,
- }
- else:
- # "feature" and "nightly-news" pages use theplatform.com
- video_id = mobj.group('mpx_id')
+ video_id = self._match_id(url)
+ if not video_id.isdigit():
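+ # Non-numeric IDs are article slugs; resolve the numeric MPX id from the page's bootstrap JSON below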
webpage = self._download_webpage(url, video_id)
- filter_param = 'byId'
- bootstrap_json = self._search_regex(
- [r'(?m)(?:var\s+(?:bootstrapJson|playlistData)|NEWS\.videoObj)\s*=\s*({.+});?\s*$',
- r'videoObj\s*:\s*({.+})', r'data-video="([^"]+)"',
- r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);'],
- webpage, 'bootstrap json', default=None)
- if bootstrap_json:
- bootstrap = self._parse_json(
- bootstrap_json, video_id, transform_source=unescapeHTML)
-
- info = None
- if 'results' in bootstrap:
- info = bootstrap['results'][0]['video']
- elif 'video' in bootstrap:
- info = bootstrap['video']
- elif 'msnbcVideoInfo' in bootstrap:
- info = bootstrap['msnbcVideoInfo']['meta']
- elif 'msnbcThePlatform' in bootstrap:
- info = bootstrap['msnbcThePlatform']['videoPlayer']['video']
- else:
- info = bootstrap
-
- if 'guid' in info:
- video_id = info['guid']
- filter_param = 'byGuid'
- elif 'mpxId' in info:
- video_id = info['mpxId']
-
- return {
- '_type': 'url_transparent',
- 'id': video_id,
- # http://feed.theplatform.com/f/2E2eJC/nbcnews also works
- 'url': update_url_query('http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews', {filter_param: video_id}),
- 'ie_key': 'ThePlatformFeed',
- }
+ data = self._parse_json(self._search_regex(
+ r'window\.__data\s*=\s*({.+});', webpage,
+ 'bootstrap json'), video_id)
+ video_id = data['article']['content'][0]['primaryMedia']['video']['mpxMetadata']['id']
+
+ return {
+ '_type': 'url_transparent',
+ 'id': video_id,
+ # http://feed.theplatform.com/f/2E2eJC/nbcnews also works
+ 'url': update_url_query('http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews', {'byId': video_id}),
+ 'ie_key': 'ThePlatformFeed',
+ }
class NBCOlympicsIE(InfoExtractor):
bitrates = self._parse_json(
self._search_regex(
- r'(?s)bitrates\s*=\s*({.+?})\s*;', webpage, 'formats'),
+ r'(?s)(?:src|bitrates)\s*=\s*({.+?})\s*;', webpage, 'formats'),
video_id, transform_source=js_to_json)
QUALITIES = ('lq', 'mq', 'hq', 'hd')
class NPOLiveIE(NPOBaseIE):
IE_NAME = 'npo.nl:live'
- _VALID_URL = r'https?://(?:www\.)?npo\.nl/live(?:/(?P<id>[^/?#&]+))?'
+ _VALID_URL = r'https?://(?:www\.)?npo(?:start)?\.nl/live(?:/(?P<id>[^/?#&]+))?'
_TESTS = [{
'url': 'http://www.npo.nl/live/npo-1',
}, {
'url': 'http://www.npo.nl/live',
'only_matching': True,
+ }, {
+ 'url': 'https://www.npostart.nl/live/npo-1',
+ 'only_matching': True,
}]
def _real_extract(self, url):
_TESTS = [{
# video
'url': 'http://www.nrk.no/video/PS*150533',
- 'md5': '2f7f6eeb2aacdd99885f355428715cfa',
+ 'md5': '706f34cdf1322577589e369e522b50ef',
'info_dict': {
'id': '150533',
'ext': 'mp4',
'title': 'Dompap og andre fugler i Piip-Show',
'description': 'md5:d9261ba34c43b61c812cb6b0269a5c8f',
- 'duration': 263,
+ 'duration': 262,
}
}, {
# audio
_VALID_URL = r'''(?x)
https?://
(?:tv|radio)\.nrk(?:super)?\.no/
- (?:serie/[^/]+|program)/
+ (?:serie(?:/[^/]+){1,2}|program)/
(?![Ee]pisodes)%s
(?:/\d{2}-\d{2}-\d{4})?
(?:\#del=(?P<part_id>\d+))?
_API_HOSTS = ('psapi-ne.nrk.no', 'psapi-we.nrk.no')
_TESTS = [{
'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014',
- 'md5': '4e9ca6629f09e588ed240fb11619922a',
+ 'md5': '9a167e54d04671eb6317a37b7bc8a280',
'info_dict': {
'id': 'MUHH48000314AA',
'ext': 'mp4',
'title': '20 spørsmål 23.05.2014',
'description': 'md5:bdea103bc35494c143c6a9acdd84887a',
'duration': 1741,
- 'series': '20 spørsmål - TV',
+ 'series': '20 spørsmål',
'episode': '23.05.2014',
},
}, {
'id': 'MSPO40010515AH',
'ext': 'mp4',
'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 1)',
- 'description': 'md5:c03aba1e917561eface5214020551b7a',
+ 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d',
'duration': 772,
'series': 'Tour de Ski',
'episode': '06.01.2015',
'id': 'MSPO40010515BH',
'ext': 'mp4',
'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015 (Part 2)',
- 'description': 'md5:c03aba1e917561eface5214020551b7a',
+ 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d',
'duration': 6175,
'series': 'Tour de Ski',
'episode': '06.01.2015',
'info_dict': {
'id': 'MSPO40010515',
'title': 'Sprint fri teknikk, kvinner og menn 06.01.2015',
- 'description': 'md5:c03aba1e917561eface5214020551b7a',
+ 'description': 'md5:1f97a41f05a9486ee00c56f35f82993d',
},
'expected_warnings': ['Video is geo restricted'],
}, {
}, {
'url': 'https://radio.nrk.no/serie/dagsnytt/NPUB21019315/12-07-2015#',
'only_matching': True,
+ }, {
+ 'url': 'https://tv.nrk.no/serie/lindmo/2018/MUHU11006318/avspiller',
+ 'only_matching': True,
}]
def _extract_series(self, webpage, display_id, fatal=True):
config = self._parse_json(
self._search_regex(
- r'({.+?})\s*,\s*"[^"]+"\s*\)\s*</script>', webpage, 'config',
- default='{}' if not fatal else NO_DEFAULT),
+ (r'INITIAL_DATA_*\s*=\s*({.+?})\s*;',
+ r'({.+?})\s*,\s*"[^"]+"\s*\)\s*</script>'),
+ webpage, 'config', default='{}' if not fatal else NO_DEFAULT),
display_id, fatal=False)
if not config:
return
- return try_get(config, lambda x: x['series'], dict)
+ return try_get(
+ config,
+ (lambda x: x['initialState']['series'], lambda x: x['series']),
+ dict)
+
+ def _extract_seasons(self, seasons):
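+ # Collect episode entries from every season dict, tolerating malformed API data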
+ if not isinstance(seasons, list):
+ return []
+ entries = []
+ for season in seasons:
+ entries.extend(self._extract_episodes(season))
+ return entries
def _extract_episodes(self, season):
- entries = []
if not isinstance(season, dict):
- return entries
- episodes = season.get('episodes')
- if not isinstance(episodes, list):
- return entries
- for episode in episodes:
+ return []
+ return self._extract_entries(season.get('episodes'))
+
+ def _extract_entries(self, entry_list):
+ if not isinstance(entry_list, list):
+ return []
+ entries = []
+ for episode in entry_list:
nrk_id = episode.get('prfId')
if not nrk_id or not isinstance(nrk_id, compat_str):
continue
_VALID_URL = r'https?://(?:tv|radio)\.nrk(?:super)?\.no/serie/(?P<id>[^/]+)'
_ITEM_RE = r'(?:data-season=["\']|id=["\']season-)(?P<id>\d+)'
_TESTS = [{
- # new layout
+ # new layout, seasons
'url': 'https://tv.nrk.no/serie/backstage',
'info_dict': {
'id': 'backstage',
},
'playlist_mincount': 60,
}, {
- # old layout
+ # new layout, instalments
'url': 'https://tv.nrk.no/serie/groenn-glede',
'info_dict': {
'id': 'groenn-glede',
'title': 'Grønn glede',
'description': 'md5:7576e92ae7f65da6993cf90ee29e4608',
},
- 'playlist_mincount': 9,
+ 'playlist_mincount': 10,
}, {
- 'url': 'http://tv.nrksuper.no/serie/labyrint',
+ # old layout
+ 'url': 'https://tv.nrksuper.no/serie/labyrint',
'info_dict': {
'id': 'labyrint',
'title': 'Labyrint',
- 'description': 'md5:58afd450974c89e27d5a19212eee7115',
+ 'description': 'md5:318b597330fdac5959247c9b69fdb1ec',
},
'playlist_mincount': 3,
}, {
description = try_get(
series, lambda x: x['titles']['subtitle'], compat_str)
entries = []
- for season in series['seasons']:
- entries.extend(self._extract_episodes(season))
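+ # The new layout can expose entries in three places: seasons, instalments and extra material; collect from all of them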
+ entries.extend(self._extract_seasons(series.get('seasons')))
+ entries.extend(self._extract_entries(series.get('instalments')))
+ entries.extend(self._extract_episodes(series.get('extraMaterial')))
return self.playlist_result(entries, series_id, title, description)
- # Old layout (e.g. https://tv.nrk.no/serie/groenn-glede)
+ # Old layout (e.g. https://tv.nrksuper.no/serie/labyrint)
entries = [
self.url_result(
'https://tv.nrk.no/program/Episodes/{series}/{season}'.format(
'seriestitle', webpage,
'title', default=None) or self._og_search_title(
webpage, fatal=False)
+ if title:
+ title = self._search_regex(
+ r'NRK (?:Super )?TV\s*[-–]\s*(.+)', title, 'title', default=title)
description = self._html_search_meta(
'series_description', webpage,
'title': 'Rivertonprisen til Karin Fossum',
'description': 'Første kvinne på 15 år til å vinne krimlitteraturprisen.',
},
- 'playlist_count': 5,
+ 'playlist_count': 2,
}]
def _extract_title(self, webpage):
class NZZIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?nzz\.ch/(?:[^/]+/)*[^/?#]+-ld\.(?P<id>\d+)'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.nzz.ch/zuerich/gymizyte/gymizyte-schreiben-schueler-heute-noch-diktate-ld.9153',
'info_dict': {
'id': '9153',
},
'playlist_mincount': 6,
- }
+ }, {
+ 'url': 'https://www.nzz.ch/video/nzz-standpunkte/cvp-auf-der-suche-nach-dem-mass-der-mitte-ld.1368112',
+ 'info_dict': {
+ 'id': '1368112',
+ },
+ 'playlist_count': 1,
+ }]
def _real_extract(self, url):
page_id = self._match_id(url)
webpage = self._download_webpage(url, page_id)
entries = []
- for player_element in re.findall(r'(<[^>]+class="kalturaPlayer"[^>]*>)', webpage):
+ for player_element in re.findall(
+ r'(<[^>]+class="kalturaPlayer[^"]*"[^>]*>)', webpage):
player_params = extract_attributes(player_element)
if player_params.get('data-type') not in ('kaltura_singleArticle',):
self.report_warning('Unsupported player type')
class OpenloadIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?(?:openload\.(?:co|io|link)|oload\.(?:tv|stream|site|xyz|win|download|cloud|cc|icu|fun))/(?:f|embed)/(?P<id>[a-zA-Z0-9-_]+)'
+ _VALID_URL = r'''(?x)
+ https?://
+ (?P<host>
+ (?:www\.)?
+ (?:
+ openload\.(?:co|io|link)|
+ oload\.(?:tv|stream|site|xyz|win|download|cloud|cc|icu|fun)
+ )
+ )/
+ (?:f|embed)/
+ (?P<id>[a-zA-Z0-9-_]+)
+ '''
_TESTS = [{
'url': 'https://openload.co/f/kUEfGclsU9o',
webpage)
def _real_extract(self, url):
- video_id = self._match_id(url)
- url_pattern = 'https://openload.co/%%s/%s/' % video_id
+ mobj = re.match(self._VALID_URL, url)
+ host = mobj.group('host')
+ video_id = mobj.group('id')
+
+ url_pattern = 'https://%s/%%s/%s/' % (host, video_id)
headers = {
'User-Agent': self._USER_AGENT,
}
r'>\s*([\w~-]+~[a-f0-9:]+~[\w~-]+)'), webpage,
'stream URL'))
- video_url = 'https://openload.co/stream/%s?mime=true' % decoded_id
+ video_url = 'https://%s/stream/%s?mime=true' % (host, decoded_id)
title = self._og_search_title(webpage, default=None) or self._search_regex(
r'<span[^>]+class=["\']title["\'][^>]*>([^<]+)', webpage,
entry = entries[0] if entries else {}
subtitles = entry.get('subtitles')
- info_dict = {
+ return {
'id': video_id,
'title': title,
'thumbnail': entry.get('thumbnail') or self._og_search_thumbnail(webpage, default=None),
'subtitles': subtitles,
'http_headers': headers,
}
- return info_dict
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class OutsideTVIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?outsidetv\.com/(?:[^/]+/)*?play/[a-zA-Z0-9]{8}/\d+/\d+/(?P<id>[a-zA-Z0-9]{8})'
+ _TESTS = [{
+ 'url': 'http://www.outsidetv.com/category/snow/play/ZjQYboH6/1/10/Hdg0jukV/4',
+ 'md5': '192d968fedc10b2f70ec31865ffba0da',
+ 'info_dict': {
+ 'id': 'Hdg0jukV',
+ 'ext': 'mp4',
+ 'title': 'Home - Jackson Ep 1 | Arbor Snowboards',
+ 'description': 'md5:41a12e94f3db3ca253b04bb1e8d8f4cd',
+ 'upload_date': '20181225',
+ 'timestamp': 1545742800,
+ }
+ }, {
+ 'url': 'http://www.outsidetv.com/home/play/ZjQYboH6/1/10/Hdg0jukV/4',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ jw_media_id = self._match_id(url)
+ return self.url_result(
+ 'jwplatform:' + jw_media_id, 'JWPlatform', jw_media_id)
class PacktPubIE(PacktPubBaseIE):
- _VALID_URL = r'https?://(?:www\.)?packtpub\.com/mapt/video/[^/]+/(?P<course_id>\d+)/(?P<chapter_id>\d+)/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:(?:www\.)?packtpub\.com/mapt|subscription\.packtpub\.com)/video/[^/]+/(?P<course_id>\d+)/(?P<chapter_id>\d+)/(?P<id>\d+)'
- _TEST = {
+ _TESTS = [{
'url': 'https://www.packtpub.com/mapt/video/web-development/9781787122215/20528/20530/Project+Intro',
'md5': '1e74bd6cfd45d7d07666f4684ef58f70',
'info_dict': {
'timestamp': 1490918400,
'upload_date': '20170331',
},
- }
+ }, {
+ 'url': 'https://subscription.packtpub.com/video/web_development/9781787122215/20528/20530/project-intro',
+ 'only_matching': True,
+ }]
_NETRC_MACHINE = 'packtpub'
_TOKEN = None
class PacktPubCourseIE(PacktPubBaseIE):
- _VALID_URL = r'(?P<url>https?://(?:www\.)?packtpub\.com/mapt/video/[^/]+/(?P<id>\d+))'
- _TEST = {
+ _VALID_URL = r'(?P<url>https?://(?:(?:www\.)?packtpub\.com/mapt|subscription\.packtpub\.com)/video/[^/]+/(?P<id>\d+))'
+ _TESTS = [{
'url': 'https://www.packtpub.com/mapt/video/web-development/9781787122215',
'info_dict': {
'id': '9781787122215',
'title': 'Learn Nodejs by building 12 projects [Video]',
},
'playlist_count': 90,
- }
+ }, {
+ 'url': 'https://subscription.packtpub.com/video/web_development/9781787122215',
+ 'only_matching': True,
+ }]
@classmethod
def suitable(cls, url):
# coding: utf-8
from __future__ import unicode_literals
+import re
import time
from .common import InfoExtractor
class PicartoIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www.)?picarto\.tv/(?P<id>[a-zA-Z0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?picarto\.tv/(?P<id>[a-zA-Z0-9]+)(?:/(?P<token>[a-zA-Z0-9]+))?'
_TEST = {
'url': 'https://picarto.tv/Setz',
'info_dict': {
return False if PicartoVodIE.suitable(url) else super(PicartoIE, cls).suitable(url)
def _real_extract(self, url):
- channel_id = self._match_id(url)
- stream_page = self._download_webpage(url, channel_id)
+ mobj = re.match(self._VALID_URL, url)
+ channel_id = mobj.group('id')
- if '>This channel does not exist' in stream_page:
- raise ExtractorError(
- 'Channel %s does not exist' % channel_id, expected=True)
+ metadata = self._download_json(
+ 'https://api.picarto.tv/v1/channel/name/' + channel_id,
+ channel_id)
- player = self._parse_json(
- self._search_regex(
- r'(?s)playerSettings\[\d+\]\s*=\s*(\{.+?\}\s*\n)', stream_page,
- 'player settings'),
- channel_id, transform_source=js_to_json)
-
- if player.get('online') is False:
+ if metadata.get('online') is False:
raise ExtractorError('Stream is offline', expected=True)
cdn_data = self._download_json(
data=urlencode_postdata({'loadbalancinginfo': channel_id}),
note='Downloading load balancing info')
- def get_event(key):
- return try_get(player, lambda x: x['event'][key], compat_str) or ''
-
+ token = mobj.group('token') or 'public'
params = {
- 'token': player.get('token') or '',
- 'ticket': get_event('ticket'),
'con': int(time.time() * 1000),
- 'type': get_event('ticket'),
- 'scope': get_event('scope'),
+ 'token': token,
}
prefered_edge = cdn_data.get('preferedEdge')
- default_tech = player.get('defaultTech')
-
formats = []
for edge in cdn_data['edges']:
preference = 0
if edge_id == prefered_edge:
preference += 1
- if tech_type == default_tech:
- preference += 1
format_id = []
if edge_id:
format_id.append(edge_id)
continue
self._sort_formats(formats)
- mature = player.get('mature')
+ mature = metadata.get('adult')
if mature is None:
age_limit = None
else:
return {
'id': channel_id,
- 'title': self._live_title(channel_id),
+ 'title': self._live_title(metadata.get('title') or channel_id),
'is_live': True,
- 'thumbnail': player.get('vodThumb'),
+ 'thumbnail': try_get(metadata, lambda x: x['thumbnails']['web']),
+ 'channel': channel_id,
+ 'channel_url': 'https://picarto.tv/%s' % channel_id,
'age_limit': age_limit,
'formats': formats,
}
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_HTTPError
+from ..utils import (
+ clean_html,
+ ExtractorError,
+ int_or_none,
+ PUTRequest,
+)
+
+
+class PlayPlusTVIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?playplus\.(?:com|tv)/VOD/(?P<project_id>[0-9]+)/(?P<id>[0-9a-f]{32})'
+ _TEST = {
+ 'url': 'https://www.playplus.tv/VOD/7572/db8d274a5163424e967f35a30ddafb8e',
+ 'md5': 'd078cb89d7ab6b9df37ce23c647aef72',
+ 'info_dict': {
+ 'id': 'db8d274a5163424e967f35a30ddafb8e',
+ 'ext': 'mp4',
+ 'title': 'Capítulo 179 - Final',
+ 'description': 'md5:01085d62d8033a1e34121d3c3cabc838',
+ 'timestamp': 1529992740,
+ 'upload_date': '20180626',
+ },
+ 'skip': 'Requires account credential',
+ }
+ _NETRC_MACHINE = 'playplustv'
+ _GEO_COUNTRIES = ['BR']
+ _token = None
+ _profile_id = None
+
+ def _call_api(self, resource, video_id=None, query=None):
+ return self._download_json('https://api.playplus.tv/api/media/v2/get' + resource, video_id, headers={
+ 'Authorization': 'Bearer ' + self._token,
+ }, query=query)
+
+ def _real_initialize(self):
+ email, password = self._get_login_info()
+ if email is None:
+ self.raise_login_required()
+
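+ # The login endpoint expects the JSON credentials via HTTP PUT, hence PUTRequest rather than a plain POST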
+ req = PUTRequest(
+ 'https://api.playplus.tv/api/web/login', json.dumps({
+ 'email': email,
+ 'password': password,
+ }).encode(), {
+ 'Content-Type': 'application/json; charset=utf-8',
+ })
+
+ try:
+ self._token = self._download_json(req, None)['token']
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+ raise ExtractorError(self._parse_json(
+ e.cause.read(), None)['errorMessage'], expected=True)
+ raise
+
+ self._profile_id = self._call_api('Profiles')['list'][0]['_id']
+
+ def _real_extract(self, url):
+ project_id, media_id = re.match(self._VALID_URL, url).groups()
+ media = self._call_api(
+ 'Media', media_id, {
+ 'profileId': self._profile_id,
+ 'projectId': project_id,
+ 'mediaId': media_id,
+ })['obj']
+ title = media['title']
+
+ formats = []
+ for f in media.get('files', []):
+ f_url = f.get('url')
+ if not f_url:
+ continue
+ file_info = f.get('fileInfo') or {}
+ formats.append({
+ 'url': f_url,
+ 'width': int_or_none(file_info.get('width')),
+ 'height': int_or_none(file_info.get('height')),
+ })
+ self._sort_formats(formats)
+
+ thumbnails = []
+ for thumb in media.get('thumbs', []):
+ thumb_url = thumb.get('url')
+ if not thumb_url:
+ continue
+ thumbnails.append({
+ 'url': thumb_url,
+ 'width': int_or_none(thumb.get('width')),
+ 'height': int_or_none(thumb.get('height')),
+ })
+
+ return {
+ 'id': media_id,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnails': thumbnails,
+ 'description': clean_html(media.get('description')) or media.get('shortDescription'),
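+ # publishDate is in milliseconds; the scale argument of int_or_none converts it to seconds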
+ 'timestamp': int_or_none(media.get('publishDate'), 1000),
+ 'view_count': int_or_none(media.get('numberOfViews')),
+ 'comment_count': int_or_none(media.get('numberOfComments')),
+ 'tags': media.get('tags'),
+ }
_VALID_URL = r'''(?x)
https?://
(?:
- (?:[^/]+\.)?pornhub\.com/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
+ (?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
(?:www\.)?thumbzilla\.com/video/
)
(?P<id>[\da-z]+)
}, {
'url': 'http://www.pornhub.com/video/show?viewkey=648719015',
'only_matching': True,
+ }, {
+ 'url': 'https://www.pornhub.net/view_video.php?viewkey=203640933',
+ 'only_matching': True,
}]
@staticmethod
def _extract_urls(webpage):
return re.findall(
- r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub\.com/embed/[\da-z]+)',
+ r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub\.(?:com|net)/embed/[\da-z]+)',
webpage)
def _extract_count(self, pattern, webpage, name):
pattern, webpage, '%s count' % name, fatal=False))
def _real_extract(self, url):
- video_id = self._match_id(url)
+ mobj = re.match(self._VALID_URL, url)
+ host = mobj.group('host') or 'pornhub.com'
+ video_id = mobj.group('id')
- self._set_cookie('pornhub.com', 'age_verified', '1')
+ self._set_cookie(host, 'age_verified', '1')
def dl_webpage(platform):
- self._set_cookie('pornhub.com', 'platform', platform)
+ self._set_cookie(host, 'platform', platform)
return self._download_webpage(
- 'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id,
+ 'http://www.%s/view_video.php?viewkey=%s' % (host, video_id),
video_id, 'Downloading %s webpage' % platform)
webpage = dl_webpage('pc')
class PornHubPlaylistBaseIE(InfoExtractor):
- def _extract_entries(self, webpage):
+ def _extract_entries(self, webpage, host):
# Only process container div with main playlist content skipping
# drop-down menu that uses similar pattern for videos (see
# https://github.com/rg3/youtube-dl/issues/11594).
return [
self.url_result(
- 'http://www.pornhub.com/%s' % video_url,
+ 'http://www.%s/%s' % (host, video_url),
PornHubIE.ie_key(), video_title=title)
for video_url, title in orderedSet(re.findall(
r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"',
]
def _real_extract(self, url):
- playlist_id = self._match_id(url)
+ mobj = re.match(self._VALID_URL, url)
+ host = mobj.group('host')
+ playlist_id = mobj.group('id')
webpage = self._download_webpage(url, playlist_id)
- entries = self._extract_entries(webpage)
+ entries = self._extract_entries(webpage, host)
playlist = self._parse_json(
self._search_regex(
class PornHubPlaylistIE(PornHubPlaylistBaseIE):
- _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.com/playlist/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/playlist/(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.pornhub.com/playlist/4667351',
'info_dict': {
class PornHubUserVideosIE(PornHubPlaylistBaseIE):
- _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.com/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos'
+ _VALID_URL = r'https?://(?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos'
_TESTS = [{
'url': 'http://www.pornhub.com/users/zoe_ph/videos/public',
'info_dict': {
}]
def _real_extract(self, url):
- user_id = self._match_id(url)
+ mobj = re.match(self._VALID_URL, url)
+ host = mobj.group('host')
+ user_id = mobj.group('id')
entries = []
for page_num in itertools.count(1):
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
break
raise
- page_entries = self._extract_entries(webpage)
+ page_entries = self._extract_entries(webpage, host)
if not page_entries:
break
entries.extend(page_entries)
# coding: utf-8
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from .brightcove import BrightcoveLegacyIE
from ..compat import (
compat_parse_qs,
compat_urlparse,
)
+from ..utils import smuggle_url
class RMCDecouverteIE(InfoExtractor):
- _VALID_URL = r'https?://rmcdecouverte\.bfmtv\.com/mediaplayer-replay.*?\bid=(?P<id>\d+)'
+ _VALID_URL = r'https?://rmcdecouverte\.bfmtv\.com/(?:(?:[^/]+/)*program_(?P<id>\d+)|(?P<live_id>mediaplayer-direct))'
- _TEST = {
- 'url': 'http://rmcdecouverte.bfmtv.com/mediaplayer-replay/?id=13502&title=AQUAMEN:LES%20ROIS%20DES%20AQUARIUMS%20:UN%20DELICIEUX%20PROJET',
+ _TESTS = [{
+ 'url': 'https://rmcdecouverte.bfmtv.com/wheeler-dealers-occasions-a-saisir/program_2566/',
'info_dict': {
- 'id': '5419055995001',
+ 'id': '5983675500001',
'ext': 'mp4',
- 'title': 'UN DELICIEUX PROJET',
- 'description': 'md5:63610df7c8b1fc1698acd4d0d90ba8b5',
+ 'title': 'CORVETTE',
+ 'description': 'md5:c1e8295521e45ffebf635d6a7658f506',
'uploader_id': '1969646226001',
- 'upload_date': '20170502',
- 'timestamp': 1493745308,
+ 'upload_date': '20181226',
+ 'timestamp': 1545861635,
},
'params': {
'skip_download': True,
},
'skip': 'only available for a week',
- }
+ }, {
+ # live, geo restricted, bypassable
+ 'url': 'https://rmcdecouverte.bfmtv.com/mediaplayer-direct/',
+ 'only_matching': True,
+ }]
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1969646226001/default_default/index.html?videoId=%s'
def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
+ mobj = re.match(self._VALID_URL, url)
+ display_id = mobj.group('id') or mobj.group('live_id')
+ webpage = self._download_webpage(url, display_id)
brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage)
if brightcove_legacy_url:
brightcove_id = compat_parse_qs(compat_urlparse.urlparse(
brightcove_id = self._search_regex(
r'data-video-id=["\'](\d+)', webpage, 'brightcove id')
return self.url_result(
- self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew',
- brightcove_id)
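+ # Smuggle the geo_countries hint so the Brightcove extractor can apply geo bypass for France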
+ smuggle_url(
+ self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id,
+ {'geo_countries': ['FR']}),
+ 'BrightcoveNew', brightcove_id)
from ..utils import (
float_or_none,
parse_iso8601,
+ str_or_none,
+ try_get,
unescapeHTML,
+ url_or_none,
ExtractorError,
)
def _real_extract(self, url):
item_id = self._match_id(url)
- try:
- json_string = self._download_json(
- 'http://www.rte.ie/rteavgen/getplaylist/?type=web&format=json&id=' + item_id,
- item_id)
- except ExtractorError as ee:
- if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404:
- error_info = self._parse_json(ee.cause.read().decode(), item_id, fatal=False)
- if error_info:
- raise ExtractorError(
- '%s said: %s' % (self.IE_NAME, error_info['message']),
- expected=True)
- raise
-
- # NB the string values in the JSON are stored using XML escaping(!)
- show = json_string['shows'][0]
- title = unescapeHTML(show['title'])
- description = unescapeHTML(show.get('description'))
- thumbnail = show.get('thumbnail')
- duration = float_or_none(show.get('duration'), 1000)
- timestamp = parse_iso8601(show.get('published'))
-
- mg = show['media:group'][0]
-
+ info_dict = {}
formats = []
- if mg.get('url'):
- m = re.match(r'(?P<url>rtmpe?://[^/]+)/(?P<app>.+)/(?P<playpath>mp4:.*)', mg['url'])
- if m:
- m = m.groupdict()
- formats.append({
- 'url': m['url'] + '/' + m['app'],
- 'app': m['app'],
- 'play_path': m['playpath'],
- 'player_url': url,
- 'ext': 'flv',
- 'format_id': 'rtmp',
- })
-
- if mg.get('hls_server') and mg.get('hls_url'):
- formats.extend(self._extract_m3u8_formats(
- mg['hls_server'] + mg['hls_url'], item_id, 'mp4',
- entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
-
- if mg.get('hds_server') and mg.get('hds_url'):
- formats.extend(self._extract_f4m_formats(
- mg['hds_server'] + mg['hds_url'], item_id,
- f4m_id='hds', fatal=False))
+ ENDPOINTS = (
+ 'https://feeds.rasset.ie/rteavgen/player/playlist?type=iptv&format=json&showId=',
+ 'http://www.rte.ie/rteavgen/getplaylist/?type=web&format=json&id=',
+ )
+
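+ # Try each endpoint in turn; a failure is only fatal on the last endpoint when nothing has been extracted yet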
+ for num, ep_url in enumerate(ENDPOINTS, start=1):
+ try:
+ data = self._download_json(ep_url + item_id, item_id)
+ except ExtractorError as ee:
+ if num < len(ENDPOINTS) or formats:
+ continue
+ if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404:
+ error_info = self._parse_json(ee.cause.read().decode(), item_id, fatal=False)
+ if error_info:
+ raise ExtractorError(
+ '%s said: %s' % (self.IE_NAME, error_info['message']),
+ expected=True)
+ raise
+
+ # NB the string values in the JSON are stored using XML escaping(!)
+ show = try_get(data, lambda x: x['shows'][0], dict)
+ if not show:
+ continue
+
+ if not info_dict:
+ title = unescapeHTML(show['title'])
+ description = unescapeHTML(show.get('description'))
+ thumbnail = show.get('thumbnail')
+ duration = float_or_none(show.get('duration'), 1000)
+ timestamp = parse_iso8601(show.get('published'))
+ info_dict = {
+ 'id': item_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'timestamp': timestamp,
+ 'duration': duration,
+ }
+
+ mg = try_get(show, lambda x: x['media:group'][0], dict)
+ if not mg:
+ continue
+
+ if mg.get('url'):
+ m = re.match(r'(?P<url>rtmpe?://[^/]+)/(?P<app>.+)/(?P<playpath>mp4:.*)', mg['url'])
+ if m:
+ m = m.groupdict()
+ formats.append({
+ 'url': m['url'] + '/' + m['app'],
+ 'app': m['app'],
+ 'play_path': m['playpath'],
+ 'player_url': url,
+ 'ext': 'flv',
+ 'format_id': 'rtmp',
+ })
+
+ if mg.get('hls_server') and mg.get('hls_url'):
+ formats.extend(self._extract_m3u8_formats(
+ mg['hls_server'] + mg['hls_url'], item_id, 'mp4',
+ entry_protocol='m3u8_native', m3u8_id='hls', fatal=False))
+
+ if mg.get('hds_server') and mg.get('hds_url'):
+ formats.extend(self._extract_f4m_formats(
+ mg['hds_server'] + mg['hds_url'], item_id,
+ f4m_id='hds', fatal=False))
+
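+ # Some responses only expose an HDS manifest split across rte:server and url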
+ mg_rte_server = str_or_none(mg.get('rte:server'))
+ mg_url = str_or_none(mg.get('url'))
+ if mg_rte_server and mg_url:
+ hds_url = url_or_none(mg_rte_server + mg_url)
+ if hds_url:
+ formats.extend(self._extract_f4m_formats(
+ hds_url, item_id, f4m_id='hds', fatal=False))
self._sort_formats(formats)
- return {
- 'id': item_id,
- 'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
- 'timestamp': timestamp,
- 'duration': duration,
- 'formats': formats,
- }
+ info_dict['formats'] = formats
+ return info_dict
class RteIE(RteBaseIE):
video_id = self._match_id(url)
video_xml = self._download_xml(
- 'http://gatling.ruutu.fi/media-xml-cache?id=%s' % video_id, video_id)
+ 'https://gatling.nelonenmedia.fi/media-xml-cache', video_id,
+ query={'id': video_id})
formats = []
processed_urls = []
class SafariBaseIE(InfoExtractor):
- _LOGIN_URL = 'https://www.safaribooksonline.com/accounts/login/'
+ _LOGIN_URL = 'https://learning.oreilly.com/accounts/login/'
_NETRC_MACHINE = 'safari'
- _API_BASE = 'https://www.safaribooksonline.com/api/v1'
+ _API_BASE = 'https://learning.oreilly.com/api/v1'
_API_FORMAT = 'json'
LOGGED_IN = False
IE_DESC = 'safaribooksonline.com online video'
_VALID_URL = r'''(?x)
https?://
- (?:www\.)?safaribooksonline\.com/
+ (?:www\.)?(?:safaribooksonline|learning\.oreilly)\.com/
(?:
library/view/[^/]+/(?P<course_id>[^/]+)/(?P<part>[^/?\#&]+)\.html|
videos/[^/]+/[^/]+/(?P<reference_id>[^-]+-[^/?\#&]+)
}, {
'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314/9780134217314-PYMC_13_00',
'only_matching': True,
+ }, {
+ 'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838/9780133392838-00_SeriesIntro',
+ 'only_matching': True,
}]
_PARTNER_ID = '1926081'
class SafariApiIE(SafariBaseIE):
IE_NAME = 'safari:api'
- _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/api/v1/book/(?P<course_id>[^/]+)/chapter(?:-content)?/(?P<part>[^/?#&]+)\.html'
+ _VALID_URL = r'https?://(?:www\.)?(?:safaribooksonline|learning\.oreilly)\.com/api/v1/book/(?P<course_id>[^/]+)/chapter(?:-content)?/(?P<part>[^/?#&]+)\.html'
_TESTS = [{
'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html',
_VALID_URL = r'''(?x)
https?://
(?:
- (?:www\.)?safaribooksonline\.com/
+ (?:www\.)?(?:safaribooksonline|learning\.oreilly)\.com/
(?:
library/view/[^/]+|
api/v1/book|
}, {
'url': 'https://www.safaribooksonline.com/videos/python-programming-language/9780134217314',
'only_matching': True,
+ }, {
+ 'url': 'https://learning.oreilly.com/videos/hadoop-fundamentals-livelessons/9780133392838',
+ 'only_matching': True,
}]
@classmethod
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = os.path.splitext(url.split('/')[-1])[0]
- return {
- '_type': 'url',
- 'id': video_id,
- 'url': mobj.group('url'),
- }
+
+ return self.url_result(mobj.group('url'), video_id=video_id)
_VALID_URL = r'''(?x)
https?://
watch\.
- (?P<site>hgtv|foodnetwork|travelchannel|diynetwork|cookingchanneltv|geniuskitchen)\.com/
+ (?P<site>geniuskitchen)\.com/
(?:
player\.[A-Z0-9]+\.html\#|
show/(?:[^/]+/){2}|
(?P<id>\d+)
'''
_TESTS = [{
- 'url': 'http://watch.hgtv.com/show/HGTVE/Best-Ever-Treehouses/2241515/Best-Ever-Treehouses/',
- 'md5': '26545fd676d939954c6808274bdb905a',
+ 'url': 'http://watch.geniuskitchen.com/player/3787617/Ample-Hills-Ice-Cream-Bike/',
'info_dict': {
- 'id': '4173834',
+ 'id': '4194875',
'ext': 'mp4',
- 'title': 'Best Ever Treehouses',
- 'description': "We're searching for the most over the top treehouses.",
+ 'title': 'Ample Hills Ice Cream Bike',
+ 'description': 'Courtney Rada churns up a signature GK Now ice cream with The Scoopmaster.',
'uploader': 'ANV',
- 'upload_date': '20170922',
- 'timestamp': 1506056400,
+ 'upload_date': '20171011',
+ 'timestamp': 1507698000,
},
'params': {
'skip_download': True,
},
'add_ie': [AnvatoIE.ie_key()],
- }, {
- 'url': 'http://watch.diynetwork.com/show/DSAL/Salvage-Dawgs/2656646/Covington-Church/',
- 'only_matching': True,
- }, {
- 'url': 'http://watch.diynetwork.com/player.HNT.html#2656646',
- 'only_matching': True,
- }, {
- 'url': 'http://watch.geniuskitchen.com/player/3787617/Ample-Hills-Ice-Cream-Bike/',
- 'only_matching': True,
}]
_SNI_TABLE = {
- 'hgtv': 'hgtv',
- 'diynetwork': 'diy',
- 'foodnetwork': 'food',
- 'cookingchanneltv': 'cook',
- 'travelchannel': 'trav',
'geniuskitchen': 'genius',
}
from ..utils import (
ExtractorError,
int_or_none,
+ url_or_none,
urlencode_postdata,
)
}
def _extract_video_url(self, webpage, video_id, *args):
+ def decode_url(encoded_url):
+ return compat_b64decode(encoded_url).decode('utf-8')
+
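+ # Newer pages expose the base64-encoded stream URL in a data-stream attribute; fall back to the legacy InitializeStream payload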
+ stream_url = url_or_none(decode_url(self._search_regex(
+ r'data-stream\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
+ 'stream url', default=None, group='url')))
+ if stream_url:
+ return stream_url
return self._parse_json(
self._search_regex(
r'InitializeStream\s*\(\s*(["\'])(?P<url>(?:(?!\1).)+)\1',
webpage, 'stream', group='url'),
- video_id,
- transform_source=lambda x: compat_b64decode(x).decode('utf-8'))[0]
+ video_id, transform_source=decode_url)[0]
for asset in clip_data['assets']:
asset_url = asset.get('full_physical_path')
protocol = asset.get('protocol')
- if not asset_url or protocol == 'primetime' or asset_url in urls:
+ if not asset_url or protocol == 'primetime' or asset.get('type') == 'usp_hlsfp_h264' or asset_url in urls:
continue
urls.append(asset_url)
container = asset.get('video_container')
if not urlh:
continue
asset_url = urlh.geturl()
- asset_url = re.sub(r'/([^/]+)\.ism/[^/]*\.m3u8', r'/\1.ism/\1.m3u8', asset_url)
- formats.extend(self._extract_m3u8_formats(
- asset_url, video_id, 'mp4', 'm3u8_native',
- m3u8_id='hls', fatal=False))
- formats.extend(self._extract_f4m_formats(
- asset_url.replace('.m3u8', '.f4m'),
- video_id, f4m_id='hds', fatal=False))
- formats.extend(self._extract_mpd_formats(
- asset_url.replace('.m3u8', '.mpd'),
- video_id, mpd_id='dash', fatal=False))
- formats.extend(self._extract_ism_formats(
- re.sub(r'/[^/]+\.m3u8', '/Manifest', asset_url),
- video_id, ism_id='mss', fatal=False))
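+ # Try to upgrade the default _sd1 rendition to a higher-numbered one, keeping the first variant that actually yields HLS formats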
+ for i in range(3, 0, -1):
+ asset_url = asset_url.replace('_sd1/', '_sd%d/' % i)
+ m3u8_formats = self._extract_m3u8_formats(
+ asset_url, video_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False)
+ formats.extend(m3u8_formats)
+ formats.extend(self._extract_mpd_formats(
+ asset_url.replace('.m3u8', '.mpd'),
+ video_id, mpd_id='dash', fatal=False))
+ if m3u8_formats:
+ break
else:
formats.extend(self._extract_m3u8_formats(
asset_url, video_id, 'mp4', 'm3u8_native',
webpage = self._download_webpage(url, video_id)
stream_url = self._search_regex(
- r'url\s*:\s*(["\'])(?P<url>(?:https?:)?//.+?\.m3u8.*?)\1', webpage,
+ r'(?:url|source)\s*:\s*(["\'])(?P<url>(?:https?:)?//.+?\.m3u8.*?)\1', webpage,
'stream url', group='url')
title = self._og_search_title(webpage)
class TBSIE(TurnerBaseIE):
- _VALID_URL = r'https?://(?:www\.)?(?P<site>tbs|tntdrama)\.com/(?:movies|shows/[^/]+/(?:clips|season-\d+/episode-\d+))/(?P<id>[^/?#]+)'
+ _VALID_URL = r'https?://(?:www\.)?(?P<site>tbs|tntdrama)\.com(?P<path>/(?:movies|shows/[^/]+/(?:clips|season-\d+/episode-\d+))/(?P<id>[^/?#]+))'
_TESTS = [{
'url': 'http://www.tntdrama.com/shows/the-alienist/clips/monster',
'info_dict': {
}]
def _real_extract(self, url):
- site, display_id = re.match(self._VALID_URL, url).groups()
+ site, path, display_id = re.match(self._VALID_URL, url).groups()
webpage = self._download_webpage(url, display_id)
drupal_settings = self._parse_json(self._search_regex(
r'<script[^>]+?data-drupal-selector="drupal-settings-json"[^>]*?>({.+?})</script>',
webpage, 'drupal setting'), display_id)
- video_data = drupal_settings['turner_playlist'][0]
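+ # The drupal settings list every video of the show; pick the entry whose url matches the requested path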
+ video_data = next(v for v in drupal_settings['turner_playlist'] if v.get('url') == path)
media_id = video_data['mediaID']
title = video_data['title']
)
-class UpskillBaseIE(InfoExtractor):
- _LOGIN_URL = 'http://upskillcourses.com/sign_in'
- _NETRC_MACHINE = 'upskill'
+class TeachableBaseIE(InfoExtractor):
+ _NETRC_MACHINE = 'teachable'
+ _URL_PREFIX = 'teachable:'
+
+ _SITES = {
+ # Only notable ones here
+ 'upskillcourses.com': 'upskill',
+ 'academy.gns3.com': 'gns3',
+ 'academyhacker.com': 'academyhacker',
+ 'stackskills.com': 'stackskills',
+ 'market.saleshacker.com': 'saleshacker',
+ 'learnability.org': 'learnability',
+ 'edurila.com': 'edurila',
+ }
+
+ _VALID_URL_SUB_TUPLE = (_URL_PREFIX, '|'.join(re.escape(site) for site in _SITES.keys()))
def _real_initialize(self):
- self._login()
+ self._logged_in = False
- def _login(self):
- username, password = self._get_login_info()
+ def _login(self, site):
+ if self._logged_in:
+ return
+
+ username, password = self._get_login_info(
+ netrc_machine=self._SITES.get(site, site))
if username is None:
return
login_page, urlh = self._download_webpage_handle(
- self._LOGIN_URL, None, 'Downloading login page')
+ 'https://%s/sign_in' % site, None,
+ 'Downloading %s login page' % site)
login_url = compat_str(urlh.geturl())
post_url = urljoin(login_url, post_url)
response = self._download_webpage(
- post_url, None, 'Logging in',
+ post_url, None, 'Logging in to %s' % site,
data=urlencode_postdata(login_form),
headers={
'Content-Type': 'application/x-www-form-urlencoded',
'Referer': login_url,
})
+ if '>I accept the new Privacy Policy<' in response:
+ raise ExtractorError(
+ 'Unable to login: %s asks you to accept new Privacy Policy. '
+ 'Go to https://%s/ and accept.' % (site, site), expected=True)
+
# Successful login
if any(re.search(p, response) for p in (
r'class=["\']user-signout',
r'<a[^>]+\bhref=["\']/sign_out',
r'>\s*Log out\s*<')):
+ self._logged_in = True
return
message = get_element_by_class('alert', response)
raise ExtractorError('Unable to log in')
-class UpskillIE(UpskillBaseIE):
- _VALID_URL = r'https?://(?:www\.)?upskillcourses\.com/courses/[^/]+/lectures/(?P<id>\d+)'
+class TeachableIE(TeachableBaseIE):
+ _VALID_URL = r'''(?x)
+ (?:
+ %shttps?://(?P<site_t>[^/]+)|
+ https?://(?:www\.)?(?P<site>%s)
+ )
+ /courses/[^/]+/lectures/(?P<id>\d+)
+ ''' % TeachableBaseIE._VALID_URL_SUB_TUPLE
_TESTS = [{
'url': 'http://upskillcourses.com/courses/essential-web-developer-course/lectures/1747100',
'id': 'uzw6zw58or',
'ext': 'mp4',
'title': 'Welcome to the Course!',
- 'description': 'md5:8d66c13403783370af62ca97a7357bdd',
+ 'description': 'md5:65edb0affa582974de4625b9cdea1107',
'duration': 138.763,
'timestamp': 1479846621,
'upload_date': '20161122',
}, {
'url': 'http://upskillcourses.com/courses/119763/lectures/1747100',
'only_matching': True,
+ }, {
+ 'url': 'https://academy.gns3.com/courses/423415/lectures/6885939',
+ 'only_matching': True,
+ }, {
+ 'url': 'teachable:https://upskillcourses.com/courses/essential-web-developer-course/lectures/1747100',
+ 'only_matching': True,
}]
+ @staticmethod
+ def _is_teachable(webpage):
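+ # Heuristic: Teachable-hosted pages embed the teachableTracker linker and pull assets from process.fs.teachablecdn.com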
+ return 'teachableTracker.linker:autoLink' in webpage and re.search(
+ r'<link[^>]+href=["\']https?://process\.fs\.teachablecdn\.com',
+ webpage)
+
+ @staticmethod
+ def _extract_url(webpage, source_url):
+ if not TeachableIE._is_teachable(webpage):
+ return
+ if re.match(r'https?://[^/]+/(?:courses|p)', source_url):
+ return '%s%s' % (TeachableBaseIE._URL_PREFIX, source_url)
+
def _real_extract(self, url):
- video_id = self._match_id(url)
+ mobj = re.match(self._VALID_URL, url)
+ site = mobj.group('site') or mobj.group('site_t')
+ video_id = mobj.group('id')
+
+ self._login(site)
+
+ prefixed = url.startswith(self._URL_PREFIX)
+ if prefixed:
+ url = url[len(self._URL_PREFIX):]
webpage = self._download_webpage(url, video_id)
}
-class UpskillCourseIE(UpskillBaseIE):
- _VALID_URL = r'https?://(?:www\.)?upskillcourses\.com/courses/(?:enrolled/)?(?P<id>[^/?#&]+)'
+class TeachableCourseIE(TeachableBaseIE):
+ _VALID_URL = r'''(?x)
+ (?:
+ %shttps?://(?P<site_t>[^/]+)|
+ https?://(?:www\.)?(?P<site>%s)
+ )
+ /(?:courses|p)/(?:enrolled/)?(?P<id>[^/?#&]+)
+ ''' % TeachableBaseIE._VALID_URL_SUB_TUPLE
_TESTS = [{
'url': 'http://upskillcourses.com/courses/essential-web-developer-course/',
'info_dict': {
- 'id': '119763',
+ 'id': 'essential-web-developer-course',
'title': 'The Essential Web Developer Course (Free)',
},
'playlist_count': 192,
}, {
'url': 'http://upskillcourses.com/courses/enrolled/119763',
'only_matching': True,
+ }, {
+ 'url': 'https://academy.gns3.com/courses/enrolled/423415',
+ 'only_matching': True,
+ }, {
+ 'url': 'teachable:https://learn.vrdev.school/p/gear-vr-developer-mini',
+ 'only_matching': True,
+ }, {
+ 'url': 'teachable:https://filmsimplified.com/p/davinci-resolve-15-crash-course',
+ 'only_matching': True,
}]
@classmethod
def suitable(cls, url):
- return False if UpskillIE.suitable(url) else super(
- UpskillCourseIE, cls).suitable(url)
+ return False if TeachableIE.suitable(url) else super(
+ TeachableCourseIE, cls).suitable(url)
def _real_extract(self, url):
- course_id = self._match_id(url)
+ mobj = re.match(self._VALID_URL, url)
+ site = mobj.group('site') or mobj.group('site_t')
+ course_id = mobj.group('id')
+
+ self._login(site)
+
+ prefixed = url.startswith(self._URL_PREFIX)
+ if prefixed:
+ url = url[len(self._URL_PREFIX):]
webpage = self._download_webpage(url, course_id)
- course_id = self._search_regex(
- r'data-course-id=["\'](\d+)', webpage, 'course id',
- default=course_id)
+ url_base = 'https://%s/' % site
entries = []
title = self._html_search_regex(
r'<span[^>]+class=["\']lecture-name[^>]+>([^<]+)', li,
'title', default=None)
+ entry_url = urljoin(url_base, lecture_url)
+ if prefixed:
+ entry_url = self._URL_PREFIX + entry_url
entries.append(
self.url_result(
- urljoin('http://upskillcourses.com/', lecture_url),
- ie=UpskillIE.ie_key(), video_id=lecture_id,
+ entry_url,
+ ie=TeachableIE.ie_key(), video_id=lecture_id,
video_title=clean_html(title)))
course_title = self._html_search_regex(
ext_url = None
if service.lower() == 'youtube':
ext_url = external.get('code')
- return {
- '_type': 'url',
- 'url': ext_url or external['uri'],
- }
+
+ return self.url_result(ext_url or external['uri'])
resources_ = player_talk.get('resources') or talk_info.get('resources')
self.to_screen('Test URL: %s' % tc['url'])
- return {
- '_type': 'url',
- 'url': tc['url'],
- 'id': video_id,
- }
+ return self.url_result(tc['url'], video_id=video_id)
def _extract_feed_info(self, provider_id, feed_id, filter_query, video_id, custom_fields=None, asset_types_query={}, account_id=None):
real_url = self._URL_TEMPLATE % (self.http_scheme(), provider_id, feed_id, filter_query)
entry = self._download_json(real_url, video_id)['entries'][0]
- main_smil_url = 'http://link.theplatform.com/s/%s/media/guid/%d/%s' % (provider_id, account_id, entry['guid']) if account_id else None
+ main_smil_url = 'http://link.theplatform.com/s/%s/media/guid/%d/%s' % (provider_id, account_id, entry['guid']) if account_id else entry.get('plmedia$publicUrl')
formats = []
subtitles = {}
if first_video_id is None:
first_video_id = cur_video_id
duration = float_or_none(item.get('plfile$duration'))
- for asset_type in item['plfile$assetTypes']:
+ file_asset_types = item.get('plfile$assetTypes') or compat_parse_qs(compat_urllib_parse_urlparse(smil_url).query)['assetTypes']
+ for asset_type in file_asset_types:
if asset_type in asset_types:
continue
asset_types.append(asset_type)
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ compat_str,
+ ExtractorError,
+ int_or_none,
+ str_or_none,
+ try_get,
+ url_or_none,
+)
+
+
+class TikTokBaseIE(InfoExtractor):
+ def _extract_aweme(self, data):
+ video = data['video']
+ description = str_or_none(try_get(data, lambda x: x['desc']))
+ width = int_or_none(try_get(video, lambda x: x['width']))
+ height = int_or_none(try_get(video, lambda x: x['height']))
+
+ format_urls = set()
+ formats = []
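+ # Walk the known address lists and de-duplicate URLs that appear in more than one of them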
+ for format_id in (
+ 'play_addr_lowbr', 'play_addr', 'play_addr_h264',
+ 'download_addr'):
+ for format in try_get(
+ video, lambda x: x[format_id]['url_list'], list) or []:
+ format_url = url_or_none(format)
+ if not format_url:
+ continue
+ if format_url in format_urls:
+ continue
+ format_urls.add(format_url)
+ formats.append({
+ 'url': format_url,
+ 'ext': 'mp4',
+ 'height': height,
+ 'width': width,
+ })
+ self._sort_formats(formats)
+
+ thumbnail = url_or_none(try_get(
+ video, lambda x: x['cover']['url_list'][0], compat_str))
+ uploader = try_get(data, lambda x: x['author']['nickname'], compat_str)
+ timestamp = int_or_none(data.get('create_time'))
+ comment_count = int_or_none(data.get('comment_count')) or int_or_none(
+ try_get(data, lambda x: x['statistics']['comment_count']))
+ repost_count = int_or_none(try_get(
+ data, lambda x: x['statistics']['share_count']))
+
+ aweme_id = data['aweme_id']
+
+ return {
+ 'id': aweme_id,
+ 'title': uploader or aweme_id,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'uploader': uploader,
+ 'timestamp': timestamp,
+ 'comment_count': comment_count,
+ 'repost_count': repost_count,
+ 'formats': formats,
+ }
+
+
+class TikTokIE(TikTokBaseIE):
+ _VALID_URL = r'https?://(?:m\.)?tiktok\.com/v/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'https://m.tiktok.com/v/6606727368545406213.html',
+ 'md5': 'd584b572e92fcd48888051f238022420',
+ 'info_dict': {
+ 'id': '6606727368545406213',
+ 'ext': 'mp4',
+ 'title': 'Zureeal',
+ 'description': '#bowsette#mario#cosplay#uk#lgbt#gaming#asian#bowsettecosplay',
+ 'thumbnail': r're:^https?://.*~noop.image',
+ 'uploader': 'Zureeal',
+ 'timestamp': 1538248586,
+ 'upload_date': '20180929',
+ 'comment_count': int,
+ 'repost_count': int,
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ data = self._parse_json(self._search_regex(
+ r'\bdata\s*=\s*({.+?})\s*;', webpage, 'data'), video_id)
+ return self._extract_aweme(data)
+
+
+class TikTokUserIE(TikTokBaseIE):
+ _VALID_URL = r'https?://(?:m\.)?tiktok\.com/h5/share/usr/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'https://m.tiktok.com/h5/share/usr/188294915489964032.html',
+ 'info_dict': {
+ 'id': '188294915489964032',
+ },
+ 'playlist_mincount': 24,
+ }
+
+ def _real_extract(self, url):
+ user_id = self._match_id(url)
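+ # The list endpoint insists on a _signature parameter but appears to accept a placeholder value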
+ data = self._download_json(
+ 'https://m.tiktok.com/h5/share/usr/list/%s/' % user_id, user_id,
+ query={'_signature': '_'})
+ entries = []
+ for aweme in data['aweme_list']:
+ try:
+ entry = self._extract_aweme(aweme)
+ except ExtractorError:
+ continue
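+ # Route each entry through TikTokIE so single-video handling stays consistent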
+ entry['extractor_key'] = TikTokIE.ie_key()
+ entries.append(entry)
+ return self.playlist_result(entries, user_id)
class TNAFlixNetworkBaseIE(InfoExtractor):
# May be overridden in descendants if necessary
_CONFIG_REGEX = [
- r'flashvars\.config\s*=\s*escape\("([^"]+)"',
- r'<input[^>]+name="config\d?" value="([^"]+)"',
+ r'flashvars\.config\s*=\s*escape\("(?P<url>[^"]+)"',
+ r'<input[^>]+name="config\d?" value="(?P<url>[^"]+)"',
+ r'config\s*=\s*(["\'])(?P<url>(?:https?:)?//(?:(?!\1).)+)\1',
]
_HOST = 'tna'
_VKEY_SUFFIX = ''
webpage = self._download_webpage(url, display_id)
cfg_url = self._proto_relative_url(self._html_search_regex(
- self._CONFIG_REGEX, webpage, 'flashvars.config', default=None), 'http:')
+ self._CONFIG_REGEX, webpage, 'flashvars.config', default=None,
+ group='url'), 'http:')
if not cfg_url:
inputs = self._hidden_inputs(webpage)
int_or_none,
parse_iso8601,
parse_duration,
- try_get,
+ str_or_none,
update_url_query,
+ urljoin,
)
def _call_api(self, path, video_id, query):
return self._download_json(
- 'https://api.tvnow.de/v3/' + path,
- video_id, query=query)
+ 'https://api.tvnow.de/v3/' + path, video_id, query=query)
def _extract_video(self, info, display_id):
video_id = compat_str(info['id'])
(?!(?:list|jahr)(?:/|$))(?P<id>[^/?\#&]+)
'''
+ @classmethod
+ def suitable(cls, url):
+ return (False if TVNowNewIE.suitable(url) or TVNowSeasonIE.suitable(url) or TVNowAnnualIE.suitable(url) or TVNowShowIE.suitable(url)
+ else super(TVNowIE, cls).suitable(url))
+
_TESTS = [{
'url': 'https://www.tvnow.de/rtl2/grip-das-motormagazin/der-neue-porsche-911-gt-3/player',
'info_dict': {
'ext': 'mp4',
'title': 'Der neue Porsche 911 GT 3',
'description': 'md5:6143220c661f9b0aae73b245e5d898bb',
- 'thumbnail': r're:^https?://.*\.jpg$',
'timestamp': 1495994400,
'upload_date': '20170528',
'duration': 5283,
info = self._call_api(
'movies/' + display_id, display_id, query={
'fields': ','.join(self._VIDEO_FIELDS),
- 'station': mobj.group(1),
})
return self._extract_video(info, display_id)
-class TVNowListBaseIE(TVNowBaseIE):
- _SHOW_VALID_URL = r'''(?x)
- (?P<base_url>
- https?://
- (?:www\.)?tvnow\.(?:de|at|ch)/[^/]+/
- (?P<show_id>[^/]+)
- )
+class TVNowNewIE(InfoExtractor):
+ _VALID_URL = r'''(?x)
+ (?P<base_url>https?://
+ (?:www\.)?tvnow\.(?:de|at|ch)/
+ (?:shows|serien))/
+ (?P<show>[^/]+)-\d+/
+ [^/]+/
+ episode-\d+-(?P<episode>[^/?$&]+)-(?P<id>\d+)
'''
- def _extract_list_info(self, display_id, show_id):
- fields = list(self._SHOW_FIELDS)
- fields.extend('formatTabs.%s' % field for field in self._SEASON_FIELDS)
- fields.extend(
- 'formatTabs.formatTabPages.container.movies.%s' % field
- for field in self._VIDEO_FIELDS)
- return self._call_api(
- 'formats/seo', display_id, query={
- 'fields': ','.join(fields),
- 'name': show_id + '.php'
- })
-
-
-class TVNowListIE(TVNowListBaseIE):
- _VALID_URL = r'%s/(?:list|jahr)/(?P<id>[^?\#&]+)' % TVNowListBaseIE._SHOW_VALID_URL
+ _TESTS = [{
+ 'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05/episode-405-der-neue-porsche-911-gt-3-331082',
+ 'only_matching': True,
+ }]
- _SHOW_FIELDS = ('title', )
- _SEASON_FIELDS = ('id', 'headline', 'seoheadline', )
- _VIDEO_FIELDS = ('id', 'headline', 'seoUrl', )
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ base_url = re.sub(r'(?:shows|serien)', '_', mobj.group('base_url'))
+ show, episode = mobj.group('show', 'episode')
+ return self.url_result(
+ # Rewrite new URLs to the old format and use extraction via old API
+ # at api.tvnow.de as a loophole for bypassing premium content checks
+ '%s/%s/%s' % (base_url, show, episode),
+ ie=TVNowIE.ie_key(), video_id=mobj.group('id'))
+
+
+class TVNowNewBaseIE(InfoExtractor):
+ def _call_api(self, path, video_id, query=None):
+ result = self._download_json(
+ 'https://apigw.tvnow.de/module/' + path, video_id, query=query)
+ error = result.get('error')
+ if error:
+ raise ExtractorError(
+ '%s said: %s' % (self.IE_NAME, error), expected=True)
+ return result
+
+
+"""
+TODO: new apigw.tvnow.de based version of TVNowIE. Replace old TVNowIE with it
+when api.tvnow.de is shut down. This version can't bypass premium checks though.
+class TVNowIE(TVNowNewBaseIE):
+ _VALID_URL = r'''(?x)
+ https?://
+ (?:www\.)?tvnow\.(?:de|at|ch)/
+ (?:shows|serien)/[^/]+/
+ (?:[^/]+/)+
+ (?P<display_id>[^/?$&]+)-(?P<id>\d+)
+ '''
_TESTS = [{
- 'url': 'https://www.tvnow.de/rtl/30-minuten-deutschland/list/aktuell',
+ # episode with annual navigation
+ 'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05/episode-405-der-neue-porsche-911-gt-3-331082',
'info_dict': {
- 'id': '28296',
- 'title': '30 Minuten Deutschland - Aktuell',
+ 'id': '331082',
+ 'display_id': 'grip-das-motormagazin/der-neue-porsche-911-gt-3',
+ 'ext': 'mp4',
+ 'title': 'Der neue Porsche 911 GT 3',
+ 'description': 'md5:6143220c661f9b0aae73b245e5d898bb',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'timestamp': 1495994400,
+ 'upload_date': '20170528',
+ 'duration': 5283,
+ 'series': 'GRIP - Das Motormagazin',
+ 'season_number': 14,
+ 'episode_number': 405,
+ 'episode': 'Der neue Porsche 911 GT 3',
},
- 'playlist_mincount': 1,
}, {
- 'url': 'https://www.tvnow.de/vox/ab-ins-beet/list/staffel-14',
+ # rtl2, episode with season navigation
+ 'url': 'https://www.tvnow.de/shows/armes-deutschland-11471/staffel-3/episode-14-bernd-steht-seit-der-trennung-von-seiner-frau-allein-da-526124',
'only_matching': True,
}, {
- 'url': 'https://www.tvnow.de/rtl2/grip-das-motormagazin/jahr/2018/3',
+ # rtlnitro
+ 'url': 'https://www.tvnow.de/serien/alarm-fuer-cobra-11-die-autobahnpolizei-1815/staffel-13/episode-5-auf-eigene-faust-pilot-366822',
+ 'only_matching': True,
+ }, {
+ # superrtl
+ 'url': 'https://www.tvnow.de/shows/die-lustigsten-schlamassel-der-welt-1221/staffel-2/episode-14-u-a-ketchup-effekt-364120',
+ 'only_matching': True,
+ }, {
+ # ntv
+ 'url': 'https://www.tvnow.de/shows/startup-news-10674/staffel-2/episode-39-goetter-in-weiss-387630',
+ 'only_matching': True,
+ }, {
+ # vox
+ 'url': 'https://www.tvnow.de/shows/auto-mobil-174/2017-11/episode-46-neues-vom-automobilmarkt-2017-11-19-17-00-00-380072',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05/episode-405-der-neue-porsche-911-gt-3-331082',
'only_matching': True,
}]
- @classmethod
- def suitable(cls, url):
- return (False if TVNowIE.suitable(url)
- else super(TVNowListIE, cls).suitable(url))
+ def _extract_video(self, info, url, display_id):
+ config = info['config']
+ source = config['source']
- def _real_extract(self, url):
- base_url, show_id, season_id = re.match(self._VALID_URL, url).groups()
+ video_id = compat_str(info.get('id') or source['videoId'])
+ title = source['title'].strip()
- list_info = self._extract_list_info(season_id, show_id)
+ paths = []
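+ # The different manifest flavours share one .ism path; handle each unique path only once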
+ for manifest_url in (info.get('manifest') or {}).values():
+ if not manifest_url:
+ continue
+ manifest_url = update_url_query(manifest_url, {'filter': ''})
+ path = self._search_regex(r'https?://[^/]+/(.+?)\.ism/', manifest_url, 'path')
+ if path in paths:
+ continue
+ paths.append(path)
- season = next(
- season for season in list_info['formatTabs']['items']
- if season.get('seoheadline') == season_id)
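+ # Derive DASH/HSS/HLS manifest URLs from one another by swapping the protocol marker and the manifest suffix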
+ def url_repl(proto, suffix):
+ return re.sub(
+ r'(?:hls|dash|hss)([.-])', proto + r'\1', re.sub(
+ r'\.ism/(?:[^.]*\.(?:m3u8|mpd)|[Mm]anifest)',
+ '.ism/' + suffix, manifest_url))
- title = list_info.get('title')
- headline = season.get('headline')
- if title and headline:
- title = '%s - %s' % (title, headline)
+ formats = self._extract_mpd_formats(
+ url_repl('dash', '.mpd'), video_id,
+ mpd_id='dash', fatal=False)
+ formats.extend(self._extract_ism_formats(
+ url_repl('hss', 'Manifest'),
+ video_id, ism_id='mss', fatal=False))
+ formats.extend(self._extract_m3u8_formats(
+ url_repl('hls', '.m3u8'), video_id, 'mp4',
+ 'm3u8_native', m3u8_id='hls', fatal=False))
+ if formats:
+ break
else:
- title = headline or title
+ if try_get(info, lambda x: x['rights']['isDrm']):
+ raise ExtractorError(
+ 'Video %s is DRM protected' % video_id, expected=True)
+ if try_get(config, lambda x: x['boards']['geoBlocking']['block']):
+ self.raise_geo_restricted()
+ if not info.get('free', True):
+ raise ExtractorError(
+ 'Video %s is not available for free' % video_id, expected=True)
+ self._sort_formats(formats)
+
+ description = source.get('description')
+ thumbnail = url_or_none(source.get('poster'))
+ timestamp = unified_timestamp(source.get('previewStart'))
+ duration = parse_duration(source.get('length'))
+
+ series = source.get('format')
+ season_number = int_or_none(self._search_regex(
+ r'staffel-(\d+)', url, 'season number', default=None))
+ episode_number = int_or_none(self._search_regex(
+ r'episode-(\d+)', url, 'episode number', default=None))
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'timestamp': timestamp,
+ 'duration': duration,
+ 'series': series,
+ 'season_number': season_number,
+ 'episode_number': episode_number,
+ 'episode': title,
+ 'formats': formats,
+ }
+
+ def _real_extract(self, url):
+ display_id, video_id = re.match(self._VALID_URL, url).groups()
+ info = self._call_api('player/' + video_id, video_id)
+ return self._extract_video(info, url, display_id)
+"""
+
+
+class TVNowListBaseIE(TVNowNewBaseIE):
+ _SHOW_VALID_URL = r'''(?x)
+ (?P<base_url>
+ https?://
+ (?:www\.)?tvnow\.(?:de|at|ch)/(?:shows|serien)/
+ [^/?#&]+-(?P<show_id>\d+)
+ )
+ '''
+
+ @classmethod
+ def suitable(cls, url):
+ return (False if TVNowNewIE.suitable(url)
+ else super(TVNowListBaseIE, cls).suitable(url))
+
+ def _extract_items(self, url, show_id, list_id, query):
+ items = self._call_api(
+ 'teaserrow/format/episode/' + show_id, list_id,
+ query=query)['items']
entries = []
- for container in season['formatTabPages']['items']:
- items = try_get(
- container, lambda x: x['container']['movies']['items'],
- list) or []
- for info in items:
- seo_url = info.get('seoUrl')
- if not seo_url:
- continue
- video_id = info.get('id')
- entries.append(self.url_result(
- '%s/%s/player' % (base_url, seo_url), TVNowIE.ie_key(),
- compat_str(video_id) if video_id else None))
+ for item in items:
+ if not isinstance(item, dict):
+ continue
+ item_url = urljoin(url, item.get('url'))
+ if not item_url:
+ continue
+ video_id = str_or_none(item.get('id') or item.get('videoId'))
+ item_title = item.get('subheadline') or item.get('text')
+ entries.append(self.url_result(
+ item_url, ie=TVNowNewIE.ie_key(), video_id=video_id,
+ video_title=item_title))
- return self.playlist_result(
- entries, compat_str(season.get('id') or season_id), title)
+ return self.playlist_result(entries, '%s/%s' % (show_id, list_id))
-class TVNowShowIE(TVNowListBaseIE):
- _VALID_URL = TVNowListBaseIE._SHOW_VALID_URL
+class TVNowSeasonIE(TVNowListBaseIE):
+ _VALID_URL = r'%s/staffel-(?P<id>\d+)' % TVNowListBaseIE._SHOW_VALID_URL
+ _TESTS = [{
+ 'url': 'https://www.tvnow.de/serien/alarm-fuer-cobra-11-die-autobahnpolizei-1815/staffel-13',
+ 'info_dict': {
+ 'id': '1815/13',
+ },
+ 'playlist_mincount': 22,
+ }]
+
+ def _real_extract(self, url):
+ _, show_id, season_id = re.match(self._VALID_URL, url).groups()
+ return self._extract_items(
+ url, show_id, season_id, {'season': season_id})
- _SHOW_FIELDS = ('id', 'title', )
- _SEASON_FIELDS = ('id', 'headline', 'seoheadline', )
- _VIDEO_FIELDS = ()
+class TVNowAnnualIE(TVNowListBaseIE):
+ _VALID_URL = r'%s/(?P<year>\d{4})-(?P<month>\d{2})' % TVNowListBaseIE._SHOW_VALID_URL
_TESTS = [{
- 'url': 'https://www.tvnow.at/vox/ab-ins-beet',
+ 'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669/2017-05',
'info_dict': {
- 'id': 'ab-ins-beet',
- 'title': 'Ab ins Beet!',
+ 'id': '1669/2017-05',
},
- 'playlist_mincount': 7,
- }, {
- 'url': 'https://www.tvnow.at/vox/ab-ins-beet/list',
- 'only_matching': True,
+ 'playlist_mincount': 2,
+ }]
+
+ def _real_extract(self, url):
+ _, show_id, year, month = re.match(self._VALID_URL, url).groups()
+ return self._extract_items(
+ url, show_id, '%s-%s' % (year, month), {
+ 'year': int(year),
+ 'month': int(month),
+ })
+
+
+class TVNowShowIE(TVNowListBaseIE):
+ _VALID_URL = TVNowListBaseIE._SHOW_VALID_URL
+ _TESTS = [{
+ # annual navigationType
+ 'url': 'https://www.tvnow.de/shows/grip-das-motormagazin-1669',
+ 'info_dict': {
+ 'id': '1669',
+ },
+ 'playlist_mincount': 73,
}, {
- 'url': 'https://www.tvnow.de/rtl2/grip-das-motormagazin/jahr/',
- 'only_matching': True,
+ # season navigationType
+ 'url': 'https://www.tvnow.de/shows/armes-deutschland-11471',
+ 'info_dict': {
+ 'id': '11471',
+ },
+ 'playlist_mincount': 3,
}]
@classmethod
def suitable(cls, url):
- return (False if TVNowIE.suitable(url) or TVNowListIE.suitable(url)
+ return (False if TVNowNewIE.suitable(url) or TVNowSeasonIE.suitable(url) or TVNowAnnualIE.suitable(url)
else super(TVNowShowIE, cls).suitable(url))
def _real_extract(self, url):
base_url, show_id = re.match(self._VALID_URL, url).groups()
- list_info = self._extract_list_info(show_id, show_id)
+ result = self._call_api(
+ 'teaserrow/format/navigation/' + show_id, show_id)
+
+ items = result['items']
entries = []
- for season_info in list_info['formatTabs']['items']:
- season_url = season_info.get('seoheadline')
- if not season_url:
- continue
- season_id = season_info.get('id')
- entries.append(self.url_result(
- '%s/list/%s' % (base_url, season_url), TVNowListIE.ie_key(),
- compat_str(season_id) if season_id else None,
- season_info.get('headline')))
+ navigation = result.get('navigationType')
+ if navigation == 'annual':
+ for item in items:
+ if not isinstance(item, dict):
+ continue
+ year = int_or_none(item.get('year'))
+ if year is None:
+ continue
+ months = item.get('months')
+ if not isinstance(months, list):
+ continue
+ for month_dict in months:
+ if not isinstance(month_dict, dict) or not month_dict:
+ continue
+ month_number = int_or_none(list(month_dict.keys())[0])
+ if month_number is None:
+ continue
+ entries.append(self.url_result(
+ '%s/%04d-%02d' % (base_url, year, month_number),
+ ie=TVNowAnnualIE.ie_key()))
+ elif navigation == 'season':
+ for item in items:
+ if not isinstance(item, dict):
+ continue
+ season_number = int_or_none(item.get('season'))
+ if season_number is None:
+ continue
+ entries.append(self.url_result(
+ '%s/staffel-%d' % (base_url, season_number),
+ ie=TVNowSeasonIE.ie_key()))
+ else:
+ raise ExtractorError('Unknown navigationType')
- return self.playlist_result(entries, show_id, list_info.get('title'))
+ return self.playlist_result(entries, show_id)
urls.append('https://twitter.com/i/videos/' + video_id)
for u in urls:
- webpage = self._download_webpage(u, video_id)
+ webpage = self._download_webpage(
+ u, video_id, headers={'Referer': 'https://twitter.com/'})
iframe_url = self._html_search_regex(
r'<iframe[^>]+src="((?:https?:)?//(?:www\.youtube\.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"',
'height': 360,
},
'5': {
- 'width': 1080,
+ 'width': 1280,
'height': 720,
},
'6': {
'width': 568,
'height': 320,
},
+ '11': {
+ 'width': 640,
+ 'height': 360,
+ },
}
def _real_extract(self, url):
'ver': video_data.get('numRevision', 2),
'r': 'http://mais.uol.com.br',
}
+ for k in ('token', 'sign'):
+ v = video_data.get(k)
+ if v:
+ query[k] = v
+
formats = []
for f in video_data.get('formats', []):
f_url = f.get('url') or f.get('secureUrl')
if not f_url:
continue
+ f_url = update_url_query(f_url, query)
format_id = str_or_none(f.get('id'))
+ if format_id == '10':
+ formats.extend(self._extract_m3u8_formats(
+ f_url, video_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False))
+ continue
fmt = {
'format_id': format_id,
- 'url': update_url_query(f_url, query),
+ 'url': f_url,
+ 'source_preference': 1,
}
fmt.update(self._FORMATS.get(format_id, {}))
formats.append(fmt)
- self._sort_formats(formats)
+ self._sort_formats(formats, ('height', 'width', 'source_preference', 'tbr', 'ext'))
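Passing an explicit field preference here ranks the HTTP formats by resolution first and lets source_preference break ties against the HLS variants. Conceptually this behaves like an ordinary tuple sort key; a rough sketch (not the actual _sort_formats implementation, and 'ext' is omitted since it is compared non-numerically):

    # rough conceptual equivalent, ascending with the best format last
    formats.sort(key=lambda f: tuple(
        f.get(k) if f.get(k) is not None else -1
        for k in ('height', 'width', 'source_preference', 'tbr')))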
tags = []
for tag in video_data.get('tags', []):
from ..utils import (
determine_ext,
ExtractorError,
+ js_to_json,
InAdvancePagedList,
int_or_none,
merge_dicts,
NO_DEFAULT,
+ parse_filesize,
+ qualities,
RegexNotFoundError,
sanitized_Request,
smuggle_url,
unsmuggle_url,
urlencode_postdata,
unescapeHTML,
- parse_filesize,
)
'description': description,
'entries': pl,
}
+
+
+class VHXEmbedIE(InfoExtractor):
+ IE_NAME = 'vhx:embed'
+ _VALID_URL = r'https?://embed\.vhx\.tv/videos/(?P<id>\d+)'
+
+ def _call_api(self, video_id, access_token, path='', query=None):
+ return self._download_json(
+ 'https://api.vhx.tv/videos/' + video_id + path, video_id, headers={
+ 'Authorization': 'Bearer ' + access_token,
+ }, query=query)
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ credentials = self._parse_json(self._search_regex(
+ r'(?s)credentials\s*:\s*({.+?}),', webpage,
+ 'config'), video_id, js_to_json)
+ access_token = credentials['access_token']
+
+ query = {}
+ for k, v in credentials.items():
+ if k in ('authorization', 'authUserToken', 'ticket') and v and v != 'undefined':
+ if k == 'authUserToken':
+ query['auth_user_token'] = v
+ else:
+ query[k] = v
+ files = self._call_api(video_id, access_token, '/files', query)
+
+ formats = []
+ for f in files:
+ href = try_get(f, lambda x: x['_links']['source']['href'])
+ if not href:
+ continue
+ method = f.get('method')
+ if method == 'hls':
+ formats.extend(self._extract_m3u8_formats(
+ href, video_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False))
+ elif method == 'dash':
+ formats.extend(self._extract_mpd_formats(
+ href, video_id, mpd_id='dash', fatal=False))
+ else:
+ fmt = {
+ 'filesize': int_or_none(try_get(f, lambda x: x['size']['bytes'])),
+ 'format_id': 'http',
+ 'preference': 1,
+ 'url': href,
+ 'vcodec': f.get('codec'),
+ }
+ quality = f.get('quality')
+ if quality:
+ fmt.update({
+ 'format_id': 'http-' + quality,
+ 'height': int_or_none(self._search_regex(r'(\d+)p', quality, 'height', default=None)),
+ })
+ formats.append(fmt)
+ self._sort_formats(formats)
+
+ video_data = self._call_api(video_id, access_token)
+ title = video_data.get('title') or video_data['name']
+
+ subtitles = {}
+ for subtitle in try_get(video_data, lambda x: x['tracks']['subtitles'], list) or []:
+ lang = subtitle.get('srclang') or subtitle.get('label')
+ for _link in subtitle.get('_links', {}).values():
+ href = _link.get('href')
+ if not href:
+ continue
+ subtitles.setdefault(lang, []).append({
+ 'url': href,
+ })
+
+ q = qualities(['small', 'medium', 'large', 'source'])
+ thumbnails = []
+ for thumbnail_id, thumbnail_url in video_data.get('thumbnail', {}).items():
+ thumbnails.append({
+ 'id': thumbnail_id,
+ 'url': thumbnail_url,
+ 'preference': q(thumbnail_id),
+ })
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': video_data.get('description'),
+ 'duration': int_or_none(try_get(video_data, lambda x: x['duration']['seconds'])),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'thumbnails': thumbnails,
+ 'timestamp': unified_timestamp(video_data.get('created_at')),
+ 'view_count': int_or_none(video_data.get('plays_count')),
+ }
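The thumbnail preference above is driven by the qualities() helper from youtube_dl.utils, which maps an ordered list of quality ids onto their list index:

    from youtube_dl.utils import qualities

    q = qualities(['small', 'medium', 'large', 'source'])
    q('small')    # 0 (least preferred)
    q('source')   # 3 (most preferred)
    q('unknown')  # -1 for ids not in the list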
# This video is no longer available, because its author has been blocked.
'url': 'https://vk.com/video-10639516_456240611',
'only_matching': True,
- }
- ]
+ },
+ {
+ # The video is not available in your region.
+ 'url': 'https://vk.com/video-51812607_171445436',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
r'<!>This video is no longer available, because it has been deleted.':
'Video %s is no longer available, because it has been deleted.',
+
+ r'<!>The video .+? is not available in your region.':
+ 'Video %s is not available in your region.',
}
for error_re, error_msg in ERRORS.items():
url, video_id,
headers=self.geo_verification_headers())
media_resource = self._parse_json(self._search_regex(
- r'window\.__INITIAL_STATE__\s*=\s*({.+?})</script>',
- webpage, 'inital state'), video_id).get('watch', {}).get('mediaResource') or {}
+ [
+ r'window\.__INITIAL_STATE__\s*=\s*({.+?})(?:</script>|;)',
+ r'window\.__INITIAL_STATE__\s*=\s*({.+})'
+ ], webpage, 'initial state'), video_id).get('watch', {}).get('mediaResource') or {}
video_data = media_resource.get('json')
if not video_data:
r'data-id=["\']([0-9A-Za-z_-]{11})'),
webpage, 'video URL', default=None)
if youtube_id:
- return {
- '_type': 'url',
- 'url': youtube_id,
- 'ie_key': YoutubeIE.ie_key(),
- }
+ return self.url_result(youtube_id, YoutubeIE.ie_key())
info_dict = self._extract_jwplayer_data(
webpage, video_id, require_title=False)
class WistiaIE(InfoExtractor):
- _VALID_URL = r'(?:wistia:|https?://(?:fast\.)?wistia\.net/embed/iframe/)(?P<id>[a-z0-9]+)'
+ _VALID_URL = r'(?:wistia:|https?://(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/)(?P<id>[a-z0-9]+)'
_API_URL = 'http://fast.wistia.com/embed/medias/%s.json'
_IFRAME_URL = 'http://fast.wistia.net/embed/iframe/%s'
# with hls video
'url': 'wistia:807fafadvk',
'only_matching': True,
+ }, {
+ 'url': 'http://fast.wistia.com/embed/iframe/sh7fpupwlt',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://fast.wistia.net/embed/medias/sh7fpupwlt.json',
+ 'only_matching': True,
}]
@staticmethod
def _extract_url(webpage):
match = re.search(
- r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
+ r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/iframe/.+?)\1', webpage)
if match:
return unescapeHTML(match.group('url'))
--- /dev/null
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ try_get,
+ unescapeHTML,
+ url_or_none,
+ urljoin,
+)
+
+
+class WWEBaseIE(InfoExtractor):
+ _SUBTITLE_LANGS = {
+ 'English': 'en',
+ 'Deutsch': 'de',
+ }
+
+ def _extract_entry(self, data, url, video_id=None):
+ video_id = compat_str(video_id or data['nid'])
+ title = data['title']
+
+ formats = self._extract_m3u8_formats(
+ data['file'], video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls')
+
+ description = data.get('description')
+ thumbnail = urljoin(url, data.get('image'))
+ series = data.get('show_name')
+ episode = data.get('episode_name')
+
+ subtitles = {}
+ tracks = data.get('tracks')
+ if isinstance(tracks, list):
+ for track in tracks:
+ if not isinstance(track, dict):
+ continue
+ if track.get('kind') != 'captions':
+ continue
+ track_file = url_or_none(track.get('file'))
+ if not track_file:
+ continue
+ label = track.get('label')
+ lang = self._SUBTITLE_LANGS.get(label, label) or 'en'
+ subtitles.setdefault(lang, []).append({
+ 'url': track_file,
+ })
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'series': series,
+ 'episode': episode,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
+
+
+class WWEIE(WWEBaseIE):
+ _VALID_URL = r'https?://(?:[^/]+\.)?wwe\.com/(?:[^/]+/)*videos/(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'https://www.wwe.com/videos/daniel-bryan-vs-andrade-cien-almas-smackdown-live-sept-4-2018',
+ 'md5': '92811c6a14bfc206f7a6a9c5d9140184',
+ 'info_dict': {
+ 'id': '40048199',
+ 'ext': 'mp4',
+ 'title': 'Daniel Bryan vs. Andrade "Cien" Almas: SmackDown LIVE, Sept. 4, 2018',
+ 'description': 'md5:2d7424dbc6755c61a0e649d2a8677f67',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ }
+ }, {
+ 'url': 'https://de.wwe.com/videos/gran-metalik-vs-tony-nese-wwe-205-live-sept-4-2018',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+
+ landing = self._parse_json(
+ self._html_search_regex(
+ r'(?s)Drupal\.settings\s*,\s*({.+?})\s*\)\s*;',
+ webpage, 'drupal settings'),
+ display_id)['WWEVideoLanding']
+
+ data = landing['initialVideo']['playlist'][0]
+ video_id = landing.get('initialVideoId')
+
+ info = self._extract_entry(data, url, video_id)
+ info['display_id'] = display_id
+ return info
+
+
+class WWEPlaylistIE(WWEBaseIE):
+ _VALID_URL = r'https?://(?:[^/]+\.)?wwe\.com/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'https://www.wwe.com/shows/raw/2018-11-12',
+ 'info_dict': {
+ 'id': '2018-11-12',
+ },
+ 'playlist_mincount': 11,
+ }, {
+ 'url': 'http://www.wwe.com/article/walk-the-prank-wwe-edition',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.wwe.com/shows/wwenxt/article/matt-riddle-interview',
+ 'only_matching': True,
+ }]
+
+ @classmethod
+ def suitable(cls, url):
+ return False if WWEIE.suitable(url) else super(WWEPlaylistIE, cls).suitable(url)
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+
+ entries = []
+ for mobj in re.finditer(
+ r'data-video\s*=\s*(["\'])(?P<data>{.+?})\1', webpage):
+ video = self._parse_json(
+ mobj.group('data'), display_id, transform_source=unescapeHTML,
+ fatal=False)
+ if not video:
+ continue
+ data = try_get(video, lambda x: x['playlist'][0], dict)
+ if not data:
+ continue
+ try:
+ entry = self._extract_entry(data, url)
+ except Exception:
+ continue
+ entry['extractor_key'] = WWEIE.ie_key()
+ entries.append(entry)
+
+ return self.playlist_result(entries, display_id)
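The playlist extractor depends on the JSON payload being HTML-escaped inside the data-video attribute, which is why the parse goes through transform_source=unescapeHTML. A small illustration with made-up markup (the nid value is borrowed from the test above):

    import re

    from youtube_dl.utils import unescapeHTML

    html = '<div data-video="{&quot;playlist&quot;:[{&quot;nid&quot;:40048199}]}"></div>'
    mobj = re.search(r'data-video\s*=\s*(["\'])(?P<data>{.+?})\1', html)
    print(unescapeHTML(mobj.group('data')))
    # {"playlist":[{"nid":40048199}]}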
video_id = self._match_id(url)
webpage = self._download_webpage(
- 'http://www.xvideos.com/video%s/' % video_id, video_id)
+ 'https://www.xvideos.com/video%s/' % video_id, video_id)
mobj = re.search(r'<h1 class="inlineError">(.+?)</h1>', webpage)
if mobj:
request.add_header('Cookie', 'age_verified=1')
webpage = self._download_webpage(request, display_id)
- title = self._search_regex(
- [r'(?:video_titles|videoTitle)\s*[:=]\s*(["\'])(?P<title>(?:(?!\1).)+)\1',
- r'<h1[^>]+class=["\']heading\d?["\'][^>]*>(?P<title>[^<]+)<'],
- webpage, 'title', group='title',
- default=None) or self._og_search_title(
+ title = self._html_search_regex(
+ r'(?s)<div[^>]+class=["\']watchVideoTitle[^>]+>(.+?)</div>',
+ webpage, 'title', default=None) or self._og_search_title(
webpage, default=None) or self._html_search_meta(
'title', webpage, fatal=True)
formats.append(f)
self._sort_formats(formats)
- description = self._og_search_description(webpage, default=None)
+ description = self._html_search_regex(
+ r'(?s)<div[^>]+\bid=["\']description["\'][^>]*>(.+?)</div>',
+ webpage, 'description',
+ default=None) or self._og_search_description(
+ webpage, default=None)
thumbnail = self._search_regex(
r'(?:imageurl\s*=|poster\s*:)\s*(["\'])(?P<thumbnail>.+?)\1',
webpage, 'thumbnail', fatal=False, group='thumbnail')
'ext': 'mp4',
'title': 'md5:c9f43630bd968267672651ba905a7d35',
'thumbnail': r're:^https?://.*\.jpg$',
+ 'age_limit': 18,
},
}
self._search_regex(
r'data-vnfo=(["\'])(?P<data>{.+?})\1', webpage, 'data info',
group='data'),
- video_id)[video_id])
+ video_id)[video_id]).replace('/cdn/', '/cdn3/')
title = (self._search_regex(
r'<[^>]+\bclass=["\']PostEditTA[^>]+>([^<]+)', webpage, 'title',
'url': video_url,
'title': title,
'thumbnail': thumbnail,
+ 'age_limit': 18,
}
unified_strdate,
unsmuggle_url,
uppercase_escape,
+ url_or_none,
urlencode_postdata,
)
'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q',
'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q',
'upload_date': '20121002',
- 'license': 'Standard YouTube License',
'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
'categories': ['Science & Technology'],
'tags': ['youtube-dl'],
'uploader': 'Icona Pop',
'uploader_id': 'IconaPop',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop',
- 'license': 'Standard YouTube License',
'creator': 'Icona Pop',
'track': 'I Love It (feat. Charli XCX)',
'artist': 'Icona Pop',
'id': '07FYdnEawAQ',
'ext': 'mp4',
'upload_date': '20130703',
- 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
+ 'title': 'Justin Timberlake - Tunnel Vision (Official Music Video) (Explicit)',
'alt_title': 'Tunnel Vision',
- 'description': 'md5:64249768eec3bc4276236606ea996373',
+ 'description': 'md5:07dab3356cde4199048e4c7cd93471e1',
'duration': 419,
'uploader': 'justintimberlakeVEVO',
'uploader_id': 'justintimberlakeVEVO',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO',
- 'license': 'Standard YouTube License',
'creator': 'Justin Timberlake',
'track': 'Tunnel Vision',
'artist': 'Justin Timberlake',
'uploader': 'SET India',
'uploader_id': 'setindia',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/setindia',
- 'license': 'Standard YouTube License',
'age_limit': 18,
}
},
'uploader_id': 'phihag',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag',
'upload_date': '20121002',
- 'license': 'Standard YouTube License',
'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
'categories': ['Science & Technology'],
'tags': ['youtube-dl'],
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/8KVIDEO',
'description': '',
'uploader': '8KVIDEO',
- 'license': 'Standard YouTube License',
'title': 'UHDTV TEST 8K VIDEO.mp4'
},
'params': {
'info_dict': {
'id': 'IB3lcPjvWLA',
'ext': 'm4a',
- 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson',
- 'description': 'md5:1900ed86ee514927b9e00fbead6969a5',
+ 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
+ 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
'duration': 244,
'uploader': 'AfrojackVEVO',
'uploader_id': 'AfrojackVEVO',
'upload_date': '20131011',
- 'license': 'Standard YouTube License',
},
'params': {
'youtube_include_dash_manifest': True,
'id': 'nfWlot6h_JM',
'ext': 'm4a',
'title': 'Taylor Swift - Shake It Off',
- 'alt_title': 'Shake It Off',
- 'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3',
+ 'description': 'md5:bec2185232c05479482cb5a9b82719bf',
'duration': 242,
'uploader': 'TaylorSwiftVEVO',
'uploader_id': 'TaylorSwiftVEVO',
'upload_date': '20140818',
- 'license': 'Standard YouTube License',
'creator': 'Taylor Swift',
},
'params': {
'ext': 'mp4',
'duration': 219,
'upload_date': '20100909',
- 'uploader': 'TJ Kirk',
+ 'uploader': 'Amazing Atheist',
'uploader_id': 'TheAmazingAtheist',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist',
- 'license': 'Standard YouTube License',
'title': 'Burning Everyone\'s Koran',
'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
}
'uploader_id': 'WitcherGame',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame',
'upload_date': '20140605',
- 'license': 'Standard YouTube License',
'age_limit': 18,
},
},
'url': 'https://www.youtube.com/watch?v=6kLq3WMV1nU',
'info_dict': {
'id': '6kLq3WMV1nU',
- 'ext': 'webm',
+ 'ext': 'mp4',
'title': 'Dedication To My Ex (Miss That) (Lyric Video)',
'description': 'md5:33765bb339e1b47e7e72b5490139bb41',
'duration': 246,
'uploader_id': 'LloydVEVO',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO',
'upload_date': '20110629',
- 'license': 'Standard YouTube License',
'age_limit': 18,
},
},
'creator': 'deadmau5',
'description': 'md5:12c56784b8032162bb936a5f76d55360',
'uploader': 'deadmau5',
- 'license': 'Standard YouTube License',
'title': 'Deadmau5 - Some Chords (HD)',
'alt_title': 'Some Chords',
},
'upload_date': '20150827',
'uploader_id': 'olympic',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic',
- 'license': 'Standard YouTube License',
'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games',
'uploader': 'Olympic',
'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow',
'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯',
'uploader': '孫ᄋᄅ',
- 'license': 'Standard YouTube License',
'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
},
},
'uploader_id': 'dorappi2000',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000',
'uploader': 'dorappi2000',
- 'license': 'Standard YouTube License',
'formats': 'mincount:31',
},
'skip': 'not actual anymore',
'uploader': 'Airtek',
'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.',
'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ',
- 'license': 'Standard YouTube License',
'title': 'Retransmisión XVIII Media maratón Zaragoza 2015',
},
'params': {
'params': {
'skip_download': True,
},
+ 'skip': 'This video is not available.',
},
{
# Multifeed video with comma in title (see https://github.com/rg3/youtube-dl/issues/8536)
'uploader_id': 'IronSoulElf',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf',
'uploader': 'IronSoulElf',
- 'license': 'Standard YouTube License',
'creator': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
'track': 'Dark Walk - Position Music',
'artist': 'Todd Haberman, Daniel Law Heath and Aaron Kaplan',
'id': 'iqKdEhx-dD4',
'ext': 'mp4',
'title': 'Isolation - Mind Field (Ep 1)',
- 'description': 'md5:25b78d2f64ae81719f5c96319889b736',
+ 'description': 'md5:46a29be4ceffa65b92d277b93f463c0f',
'duration': 2085,
'upload_date': '20170118',
'uploader': 'Vsauce',
'uploader_id': 'Vsauce',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/Vsauce',
- 'license': 'Standard YouTube License',
'series': 'Mind Field',
'season_number': 1,
'episode_number': 1,
'uploader': 'New Century Foundation',
'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
- 'license': 'Standard YouTube License',
},
'params': {
'skip_download': True,
'url': 'https://invidio.us/watch?v=BaW_jenozKc',
'only_matching': True,
},
+ {
+ # DRM protected
+ 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc',
+ 'only_matching': True,
+ },
+ {
+ # Video with unsupported adaptive stream type formats
+ 'url': 'https://www.youtube.com/watch?v=Z4Vy8R84T1U',
+ 'info_dict': {
+ 'id': 'Z4Vy8R84T1U',
+ 'ext': 'mp4',
+ 'title': 'saman SMAN 53 Jakarta(Sancety) opening COFFEE4th at SMAN 53 Jakarta',
+ 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+ 'duration': 433,
+ 'upload_date': '20130923',
+ 'uploader': 'Amelia Putri Harwita',
+ 'uploader_id': 'UCpOxM49HJxmC1qCalXyB3_Q',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCpOxM49HJxmC1qCalXyB3_Q',
+ 'formats': 'maxcount:10',
+ },
+ 'params': {
+ 'skip_download': True,
+ 'youtube_include_dash_manifest': False,
+ },
+ }
]
def __init__(self, *args, **kwargs):
def _extract_signature_function(self, video_id, player_url, example_sig):
id_m = re.match(
- r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$',
+ r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2,3}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$',
player_url)
if not id_m:
raise ExtractorError('Cannot identify player %r' % player_url)
self._downloader.report_warning(err_msg)
return {}
- def _mark_watched(self, video_id, video_info):
- playback_url = video_info.get('videostats_playback_base_url', [None])[0]
+ def _mark_watched(self, video_id, video_info, player_response):
+ playback_url = url_or_none(try_get(
+ player_response,
+ lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get(
+ video_info, lambda x: x['videostats_playback_base_url'][0]))
if not playback_url:
return
parsed_playback_url = compat_urlparse.urlparse(playback_url)
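try_get returns None when any step of the accessor raises, so chaining two calls with `or` yields a clean "player_response first, legacy video_info second" fallback. For instance, with hypothetical payloads:

    from youtube_dl.utils import try_get

    player_response = {'playbackTracking': {
        'videostatsPlaybackUrl': {'baseUrl': 'https://s.youtube.com/api/stats/playback'}}}
    video_info = {}
    playback_url = try_get(
        player_response,
        lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get(
        video_info, lambda x: x['videostats_playback_base_url'][0])
    # -> 'https://s.youtube.com/api/stats/playback'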
if dash_mpd and dash_mpd[0] not in dash_mpds:
dash_mpds.append(dash_mpd[0])
+ def add_dash_mpd_pr(pl_response):
+ dash_mpd = url_or_none(try_get(
+ pl_response, lambda x: x['streamingData']['dashManifestUrl'],
+ compat_str))
+ if dash_mpd and dash_mpd not in dash_mpds:
+ dash_mpds.append(dash_mpd)
+
is_live = None
view_count = None
if isinstance(pl_response, dict):
player_response = pl_response
if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
+ add_dash_mpd_pr(player_response)
# We also try looking in get_video_info since it may contain different dashmpd
# URL that points to a DASH manifest with possibly different itag set (some itags
# are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
pl_response = get_video_info.get('player_response', [None])[0]
if isinstance(pl_response, dict):
player_response = pl_response
+ add_dash_mpd_pr(player_response)
add_dash_mpd(get_video_info)
if view_count is None:
view_count = extract_view_count(get_video_info)
'"token" parameter not in video info for unknown reason',
video_id=video_id)
+ if video_info.get('license_info'):
+ raise ExtractorError('This video is DRM protected.', expected=True)
+
video_details = try_get(
player_response, lambda x: x['videoDetails'], dict) or {}
else:
video_description = ''
- if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False):
+ if not smuggled_data.get('force_singlefeed', False):
if not self._downloader.params.get('noplaylist'):
- entries = []
- feed_ids = []
- multifeed_metadata_list = video_info['multifeed_metadata_list'][0]
- for feed in multifeed_metadata_list.split(','):
- # Unquote should take place before split on comma (,) since textual
- # fields may contain comma as well (see
- # https://github.com/rg3/youtube-dl/issues/8536)
- feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
- entries.append({
- '_type': 'url_transparent',
- 'ie_key': 'Youtube',
- 'url': smuggle_url(
- '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
- {'force_singlefeed': True}),
- 'title': '%s (%s)' % (video_title, feed_data['title'][0]),
- })
- feed_ids.append(feed_data['id'][0])
- self.to_screen(
- 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
- % (', '.join(feed_ids), video_id))
- return self.playlist_result(entries, video_id, video_title, video_description)
- self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
+ multifeed_metadata_list = try_get(
+ player_response,
+ lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'],
+ compat_str) or try_get(
+ video_info, lambda x: x['multifeed_metadata_list'][0], compat_str)
+ if multifeed_metadata_list:
+ entries = []
+ feed_ids = []
+ for feed in multifeed_metadata_list.split(','):
+ # Unquote should take place before split on comma (,) since textual
+ # fields may contain comma as well (see
+ # https://github.com/rg3/youtube-dl/issues/8536)
+ feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
+ entries.append({
+ '_type': 'url_transparent',
+ 'ie_key': 'Youtube',
+ 'url': smuggle_url(
+ '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]),
+ {'force_singlefeed': True}),
+ 'title': '%s (%s)' % (video_title, feed_data['title'][0]),
+ })
+ feed_ids.append(feed_data['id'][0])
+ self.to_screen(
+ 'Downloading multifeed video (%s) - add --no-playlist to just download video %s'
+ % (', '.join(feed_ids), video_id))
+ return self.playlist_result(entries, video_id, video_title, video_description)
+ else:
+ self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
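Each entry in the metadata list is itself a percent-encoded query string, so commas inside textual fields arrive double-encoded and only the literal separators survive at the top level; that is why the split on ',' happens first and each feed is unquoted afterwards. A worked example with synthetic data:

    from youtube_dl.compat import (
        compat_parse_qs,
        compat_urllib_parse_unquote_plus,
    )

    metadata_list = 'id%3Dabc%26title%3DCam%252C%2520left,id%3Ddef%26title%3DCam%252C%2520right'
    for feed in metadata_list.split(','):
        feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed))
        print(feed_data['id'][0], feed_data['title'][0])
    # abc Cam, left
    # def Cam, right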
if view_count is None:
view_count = extract_view_count(video_info)
'height': int_or_none(width_height[1]),
}
q = qualities(['small', 'medium', 'hd720'])
+ streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list)
+ if streaming_formats:
+ for fmt in streaming_formats:
+ itag = str_or_none(fmt.get('itag'))
+ if not itag:
+ continue
+ quality = fmt.get('quality')
+ quality_label = fmt.get('qualityLabel') or quality
+ formats_spec[itag] = {
+ 'asr': int_or_none(fmt.get('audioSampleRate')),
+ 'filesize': int_or_none(fmt.get('contentLength')),
+ 'format_note': quality_label,
+ 'fps': int_or_none(fmt.get('fps')),
+ 'height': int_or_none(fmt.get('height')),
+ 'quality': q(quality),
+ # bitrate for itag 43 is always 2147483647
+ 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None,
+ 'width': int_or_none(fmt.get('width')),
+ }
formats = []
for url_data_str in encoded_url_map.split(','):
url_data = compat_parse_qs(url_data_str)
if 'itag' not in url_data or 'url' not in url_data:
continue
+ stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0]))
+ # Unsupported FORMAT_STREAM_TYPE_OTF
+ if stream_type == 3:
+ continue
format_id = url_data['itag'][0]
url = url_data['url'][0]
else:
player_version = self._search_regex(
[r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',
- r'(?:www|player)-([^/]+)(?:/[a-z]{2}_[A-Z]{2})?/base\.js'],
+ r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'],
player_url,
'html5 player', fatal=False)
player_desc = 'html5 player %s' % player_version
filesize = int_or_none(url_data.get(
'clen', [None])[0]) or _extract_filesize(url)
- quality = url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0]
+ quality = url_data.get('quality', [None])[0]
more_fields = {
'filesize': filesize,
'width': width,
'height': height,
'fps': int_or_none(url_data.get('fps', [None])[0]),
- 'format_note': quality,
+ 'format_note': url_data.get('quality_label', [None])[0] or quality,
'quality': q(quality),
}
for key, value in more_fields.items():
'http_chunk_size': 10485760,
}
formats.append(dct)
- elif video_info.get('hlsvp'):
- manifest_url = video_info['hlsvp'][0]
- formats = []
- m3u8_formats = self._extract_m3u8_formats(
- manifest_url, video_id, 'mp4', fatal=False)
- for a_format in m3u8_formats:
- itag = self._search_regex(
- r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
- if itag:
- a_format['format_id'] = itag
- if itag in self._formats:
- dct = self._formats[itag].copy()
- dct.update(a_format)
- a_format = dct
- a_format['player_url'] = player_url
- # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
- a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
- formats.append(a_format)
else:
- error_message = clean_html(video_info.get('reason', [None])[0])
- if not error_message:
- error_message = extract_unavailable_message()
- if error_message:
- raise ExtractorError(error_message, expected=True)
- raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
+ manifest_url = (
+ url_or_none(try_get(
+ player_response,
+ lambda x: x['streamingData']['hlsManifestUrl'],
+ compat_str)) or
+ url_or_none(try_get(
+ video_info, lambda x: x['hlsvp'][0], compat_str)))
+ if manifest_url:
+ formats = []
+ m3u8_formats = self._extract_m3u8_formats(
+ manifest_url, video_id, 'mp4', fatal=False)
+ for a_format in m3u8_formats:
+ itag = self._search_regex(
+ r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
+ if itag:
+ a_format['format_id'] = itag
+ if itag in self._formats:
+ dct = self._formats[itag].copy()
+ dct.update(a_format)
+ a_format = dct
+ a_format['player_url'] = player_url
+ # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
+ a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
+ formats.append(a_format)
+ else:
+ error_message = clean_html(video_info.get('reason', [None])[0])
+ if not error_message:
+ error_message = extract_unavailable_message()
+ if error_message:
+ raise ExtractorError(error_message, expected=True)
+ raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
# uploader
video_uploader = try_get(
r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
video_webpage)
if m_episode:
- series = m_episode.group('series')
+ series = unescapeHTML(m_episode.group('series'))
season_number = int(m_episode.group('season'))
episode_number = int(m_episode.group('episode'))
else:
self._sort_formats(formats)
- self.mark_watched(video_id, video_info)
+ self.mark_watched(video_id, video_info, player_response)
return {
'id': video_id,
--- /dev/null
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+
+
+class ZypeIE(InfoExtractor):
+ _VALID_URL = r'https?://player\.zype\.com/embed/(?P<id>[\da-fA-F]+)\.js\?.*?api_key=[^&]+'
+ _TEST = {
+ 'url': 'https://player.zype.com/embed/5b400b834b32992a310622b9.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ&autoplay=false&controls=true&da=false',
+ 'md5': 'eaee31d474c76a955bdaba02a505c595',
+ 'info_dict': {
+ 'id': '5b400b834b32992a310622b9',
+ 'ext': 'mp4',
+ 'title': 'Smoky Barbecue Favorites',
+ 'thumbnail': r're:^https?://.*\.jpe?g',
+ },
+ }
+
+ @staticmethod
+ def _extract_urls(webpage):
+ return [
+ mobj.group('url')
+ for mobj in re.finditer(
+ r'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//player\.zype\.com/embed/[\da-fA-F]+\.js\?.*?api_key=.+?)\1',
+ webpage)]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._search_regex(
+ r'video_title\s*[:=]\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage,
+ 'title', group='value')
+
+ m3u8_url = self._search_regex(
+ r'(["\'])(?P<url>(?:(?!\1).)+\.m3u8(?:(?!\1).)*)\1', webpage,
+ 'm3u8 url', group='url')
+
+ formats = self._extract_m3u8_formats(
+ m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls')
+ self._sort_formats(formats)
+
+ thumbnail = self._search_regex(
+ r'poster\s*[:=]\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, 'thumbnail',
+ default=False, group='url')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'formats': formats,
+ }
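A quick check of the embed regex in _extract_urls against a synthetic page (the API key is made up):

    html = ('<script src="https://player.zype.com/embed/'
            '5b400b834b32992a310622b9.js?api_key=FAKEKEY"></script>')
    print(ZypeIE._extract_urls(html))
    # ['https://player.zype.com/embed/5b400b834b32992a310622b9.js?api_key=FAKEKEY']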
programs = ['avprobe', 'avconv', 'ffmpeg', 'ffprobe']
prefer_ffmpeg = True
+ def get_ffmpeg_version(path):
+ ver = get_exe_version(path, args=['-version'])
+ if ver:
+ regexs = [
+ r'(?:\d+:)?([0-9.]+)-[0-9]+ubuntu[0-9.]+$', # Ubuntu, see [1]
+ r'n([0-9.]+)$', # Arch Linux
+ # 1. http://www.ducea.com/2006/06/17/ubuntu-package-version-naming-explanation/
+ ]
+ for regex in regexs:
+ mobj = re.match(regex, ver)
+ if mobj:
+ ver = mobj.group(1)
+ return ver
+
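Distro builds of ffmpeg often report the packaged version rather than the upstream one; the two regular expressions strip the Debian/Ubuntu epoch and revision and the Arch Linux `n` prefix, and leave anything else untouched:

    import re

    regexs = [
        r'(?:\d+:)?([0-9.]+)-[0-9]+ubuntu[0-9.]+$',  # Ubuntu
        r'n([0-9.]+)$',  # Arch Linux
    ]
    for raw in ('1:4.0.2-2ubuntu1.18.04', 'n4.1', '4.1'):
        ver = raw
        for regex in regexs:
            mobj = re.match(regex, ver)
            if mobj:
                ver = mobj.group(1)
        print(ver)
    # 4.0.2
    # 4.1
    # 4.1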
self.basename = None
self.probe_basename = None
self._paths = dict(
(p, os.path.join(location, p)) for p in programs)
self._versions = dict(
- (p, get_exe_version(self._paths[p], args=['-version']))
- for p in programs)
+ (p, get_ffmpeg_version(self._paths[p])) for p in programs)
if self._versions is None:
self._versions = dict(
- (p, get_exe_version(p, args=['-version'])) for p in programs)
+ (p, get_ffmpeg_version(p)) for p in programs)
self._paths = dict((p, p) for p in programs)
if prefer_ffmpeg is False:
opts += ['-c:s', 'mov_text']
for (i, lang) in enumerate(sub_langs):
opts.extend(['-map', '%d:0' % (i + 1)])
- lang_code = ISO639Utils.short2long(lang)
- if lang_code is not None:
- opts.extend(['-metadata:s:s:%d' % i, 'language=%s' % lang_code])
+ lang_code = ISO639Utils.short2long(lang) or lang
+ opts.extend(['-metadata:s:s:%d' % i, 'language=%s' % lang_code])
temp_filename = prepend_extension(filename, 'temp')
self._downloader.to_screen('[ffmpeg] Embedding subtitles in \'%s\'' % filename)
compat_HTMLParser,
compat_basestring,
compat_chr,
+ compat_cookiejar,
compat_ctypes_WINFUNCTYPE,
compat_etree_fromstring,
compat_expanduser,
req, **kwargs)
+class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
+ def save(self, filename=None, ignore_discard=False, ignore_expires=False):
+ # Store session cookies with `expires` set to 0 instead of an empty
+ # string
+ for cookie in self:
+ if cookie.expires is None:
+ cookie.expires = 0
+ compat_cookiejar.MozillaCookieJar.save(self, filename, ignore_discard, ignore_expires)
+
+ def load(self, filename=None, ignore_discard=False, ignore_expires=False):
+ compat_cookiejar.MozillaCookieJar.load(self, filename, ignore_discard, ignore_expires)
+ # Session cookies are denoted by an `expires` field set to either
+ # an empty string or 0. MozillaCookieJar only recognizes the former
+ # (see [1]), so we need to force the latter to be recognized as
+ # session cookies on our own.
+ # Session cookies may be important for cookies-based authentication,
+ # e.g. usually, when a user does not tick the 'Remember me' check box
+ # while logging in on a site, some important cookies are stored as
+ # session cookies, and failing to recognize them results in a failed login.
+ # 1. https://bugs.python.org/issue17164
+ for cookie in self:
+ # Treat `expires=0` cookies as session cookies
+ if cookie.expires == 0:
+ cookie.expires = None
+ cookie.discard = True
+
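Note that MozillaCookieJar skips session cookies on save and treats them as discardable on load unless told otherwise, so a jar that should round-trip session cookies is typically driven like this (a usage sketch):

    jar = YoutubeDLCookieJar('cookies.txt')
    jar.load(ignore_discard=True, ignore_expires=True)   # expires=0 rows become session cookies again
    # ... perform requests that may set or refresh cookies ...
    jar.save(ignore_discard=True, ignore_expires=True)   # session cookies are written with expires=0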
+
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
def __init__(self, cookiejar=None):
compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
'gv': 'glv',
'ha': 'hau',
'he': 'heb',
+ 'iw': 'heb', # Replaced by he in 1989 revision
'hi': 'hin',
'ho': 'hmo',
'hr': 'hrv',
'hz': 'her',
'ia': 'ina',
'id': 'ind',
+ 'in': 'ind', # Replaced by id in 1989 revision
'ie': 'ile',
'ig': 'ibo',
'ii': 'iii',
'wo': 'wol',
'xh': 'xho',
'yi': 'yid',
+ 'ji': 'yid', # Replaced by yi in 1989 revision
'yo': 'yor',
'za': 'zha',
'zh': 'zho',
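With the pre-1989 codes in the map, both the old and the current two-letter code resolve to the same ISO 639-2 code:

    from youtube_dl.utils import ISO639Utils

    ISO639Utils.short2long('he')  # 'heb'
    ISO639Utils.short2long('iw')  # 'heb' (replaced by 'he' in 1989)
    ISO639Utils.short2long('ji')  # 'yid' (replaced by 'yi' in 1989)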
def random_birthday(year_field, month_field, day_field):
+ start_date = datetime.date(1950, 1, 1)
+ end_date = datetime.date(1995, 12, 31)
+ offset = random.randint(0, (end_date - start_date).days)
+ random_date = start_date + datetime.timedelta(offset)
return {
- year_field: str(random.randint(1950, 1995)),
- month_field: str(random.randint(1, 12)),
- day_field: str(random.randint(1, 31)),
+ year_field: str(random_date.year),
+ month_field: str(random_date.month),
+ day_field: str(random_date.day),
}
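Drawing year, month and day independently could produce impossible dates such as February 30; sampling a single day offset across the whole range guarantees a valid calendar date and a uniform distribution:

    import datetime
    import random

    start_date = datetime.date(1950, 1, 1)
    end_date = datetime.date(1995, 12, 31)
    offset = random.randint(0, (end_date - start_date).days)
    print(start_date + datetime.timedelta(offset))  # e.g. 1972-11-08, always valid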
from __future__ import unicode_literals
-__version__ = '2018.11.07'
+__version__ = '2019.01.16'