Raphaël G. Git Repositories - youtubedl/commitdiff
New upstream version 2020.03.24
author Rogério Brito <rbrito@ime.usp.br>
Fri, 10 Apr 2020 13:28:52 +0000 (10:28 -0300)
committer Rogério Brito <rbrito@ime.usp.br>
Fri, 10 Apr 2020 13:28:52 +0000 (10:28 -0300)
66 files changed:
ChangeLog
README.md
README.txt
docs/supportedsites.md
test/test_YoutubeDL.py
test/test_subtitles.py
youtube-dl
youtube-dl.1
youtube_dl/YoutubeDL.py
youtube_dl/compat.py
youtube_dl/extractor/abc.py
youtube_dl/extractor/bilibili.py
youtube_dl/extractor/cbc.py
youtube_dl/extractor/eporner.py
youtube_dl/extractor/extractors.py
youtube_dl/extractor/franceculture.py
youtube_dl/extractor/generic.py
youtube_dl/extractor/hellporno.py
youtube_dl/extractor/imdb.py
youtube_dl/extractor/jpopsukitv.py [deleted file]
youtube_dl/extractor/lecturio.py
youtube_dl/extractor/limelight.py
youtube_dl/extractor/linuxacademy.py
youtube_dl/extractor/mediaset.py
youtube_dl/extractor/mediasite.py
youtube_dl/extractor/ndr.py
youtube_dl/extractor/nhk.py
youtube_dl/extractor/nova.py
youtube_dl/extractor/npr.py
youtube_dl/extractor/nytimes.py
youtube_dl/extractor/peertube.py
youtube_dl/extractor/platzi.py
youtube_dl/extractor/pokemon.py
youtube_dl/extractor/popcorntimes.py [new file with mode: 0644]
youtube_dl/extractor/pornhd.py
youtube_dl/extractor/pornhub.py
youtube_dl/extractor/safari.py
youtube_dl/extractor/servus.py
youtube_dl/extractor/soundcloud.py
youtube_dl/extractor/sportdeutschland.py
youtube_dl/extractor/svt.py
youtube_dl/extractor/teachable.py
youtube_dl/extractor/telecinco.py
youtube_dl/extractor/telequebec.py
youtube_dl/extractor/tfo.py
youtube_dl/extractor/thisoldhouse.py
youtube_dl/extractor/toggle.py
youtube_dl/extractor/tumblr.py
youtube_dl/extractor/tv2dk.py
youtube_dl/extractor/tv5mondeplus.py
youtube_dl/extractor/tva.py
youtube_dl/extractor/twentyfourvideo.py
youtube_dl/extractor/twitch.py
youtube_dl/extractor/viewlift.py
youtube_dl/extractor/vimeo.py
youtube_dl/extractor/wistia.py
youtube_dl/extractor/xhamster.py
youtube_dl/extractor/xtube.py
youtube_dl/extractor/youjizz.py
youtube_dl/extractor/youtube.py
youtube_dl/extractor/zapiks.py
youtube_dl/extractor/zdf.py
youtube_dl/options.py
youtube_dl/update.py
youtube_dl/utils.py
youtube_dl/version.py

index 94aa9f327c73a5c8ee3e792b8435a445de1432b1..f753972c46ae0df3951139cab26dafa155c05507 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,104 @@
+version 2020.03.24
+
+Core
+- [utils] Revert support for cookie files with spaces used instead of tabs
+
+Extractors
+* [teachable] Update upskillcourses and gns3 domains
+* [generic] Look for teachable embeds before wistia
++ [teachable] Extract chapter metadata (#24421)
++ [bilibili] Add support for player.bilibili.com (#24402)
++ [bilibili] Add support for new URL schema with BV ids (#24439, #24442)
+* [limelight] Remove disabled API requests (#24255)
+* [soundcloud] Fix download URL extraction (#24394)
++ [cbc:watch] Add support for authentication (#19160)
+* [hellporno] Fix extraction (#24399)
+* [xtube] Fix formats extraction (#24348)
+* [ndr] Fix extraction (#24326)
+* [nhk] Update m3u8 URL and use native HLS downloader (#24329)
+- [nhk] Remove obsolete rtmp formats (#24329)
+* [nhk] Relax URL regular expression (#24329)
+- [vimeo] Revert fix showcase password protected video extraction (#24224)
+
+
+version 2020.03.08
+
+Core
++ [utils] Add support for cookie files with spaces used instead of tabs
+
+Extractors
++ [pornhub] Add support for pornhubpremium.com (#24288)
+- [youtube] Remove outdated code and unnecessary requests
+* [youtube] Improve extraction in 429 HTTP error conditions (#24283)
+* [nhk] Update API version (#24270)
+
+
+version 2020.03.06
+
+Extractors
+* [youtube] Fix age-gated videos support without login (#24248)
+* [vimeo] Fix showcase password protected video extraction (#24224)
+* [pornhub] Improve title extraction (#24184)
+* [peertube] Improve extraction (#23657)
++ [servus] Add support for new URL schema (#23475, #23583, #24142)
+* [vimeo] Fix subtitles URLs (#24209)
+
+
+version 2020.03.01
+
+Core
+* [YoutubeDL] Force redirect URL to unicode on python 2
+- [options] Remove duplicate short option -v for --version (#24162)
+
+Extractors
+* [xhamster] Fix extraction (#24205)
+* [franceculture] Fix extraction (#24204)
++ [telecinco] Add support for article opening videos
+* [telecinco] Fix extraction (#24195)
+* [xtube] Fix metadata extraction (#21073, #22455)
+* [youjizz] Fix extraction (#24181)
+- Remove no longer needed compat_str around geturl
+* [pornhd] Fix extraction (#24128)
++ [teachable] Add support for multiple videos per lecture (#24101)
++ [wistia] Add support for multiple generic embeds (#8347, #11385)
+* [imdb] Fix extraction (#23443)
+* [tv2dk:bornholm:play] Fix extraction (#24076)
+
+
+version 2020.02.16
+
+Core
+* [YoutubeDL] Fix playlist entry indexing with --playlist-items (#10591,
+  #10622)
+* [update] Fix updating via symlinks (#23991)
++ [compat] Introduce compat_realpath (#23991)
+
+Extractors
++ [npr] Add support for streams (#24042)
++ [24video] Add support for porn.24video.net (#23779, #23784)
+- [jpopsuki] Remove extractor (#23858)
+* [nova] Improve extraction (#23690)
+* [nova:embed] Improve (#23690)
+* [nova:embed] Fix extraction (#23672)
++ [abc:iview] Add support for 720p (#22907, #22921)
+* [nytimes] Improve format sorting (#24010)
++ [toggle] Add support for mewatch.sg (#23895, #23930)
+* [thisoldhouse] Fix extraction (#23951)
++ [popcorntimes] Add support for popcorntimes.tv (#23949)
+* [sportdeutschland] Update to new API
+* [twitch:stream] Lowercase channel id for stream request (#23917)
+* [tv5mondeplus] Fix extraction (#23907, #23911)
+* [tva] Relax URL regular expression (#23903)
+* [vimeo] Fix album extraction (#23864)
+* [viewlift] Improve extraction
+    * Fix extraction (#23851)
+    + Add support for authentication
+    + Add support for more domains
+* [svt] Fix series extraction (#22297)
+* [svt] Fix article extraction (#22897, #22919)
+* [soundcloud] Improve private playlist/set tracks extraction (#3707)
+
+
 version 2020.01.24
 
 Extractors
index 01f975958c8370016a39c9f3fb872241c977c62b..4f54a52401b4b6083147701a31a35217aa1000e6 100644 (file)
--- a/README.md
+++ b/README.md
@@ -835,7 +835,9 @@ In February 2015, the new YouTube player contained a character sequence in a str
 
 ### HTTP Error 429: Too Many Requests or 402: Payment Required
 
-These two error codes indicate that the service is blocking your IP address because of overuse. Contact the service and ask them to unblock your IP address, or - if you have acquired a whitelisted IP address already - use the [`--proxy` or `--source-address` options](#network-options) to select another IP address.
+These two error codes indicate that the service is blocking your IP address because of overuse. Usually this is a soft block, meaning that you can regain access after solving a CAPTCHA. Just open a browser, solve the CAPTCHA the service presents, and then [pass cookies](#how-do-i-pass-cookies-to-youtube-dl) to youtube-dl. Note that if your machine has multiple external IPs, you should also pass exactly the same IP you used for solving the CAPTCHA with [`--source-address`](#network-options). You may also need to pass your browser's `User-Agent` HTTP header with [`--user-agent`](#workarounds).
+
+If this is not the case (the service does not offer a CAPTCHA to solve), you can contact the service and ask them to unblock your IP address, or - if you have already acquired a whitelisted IP address - use the [`--proxy` or `--source-address` options](#network-options) to select another IP address.
 
 ### SyntaxError: Non-ASCII character
 
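The workflow described above can also be driven through youtube-dl's Python API instead of the CLI. A minimal sketch, assuming a Netscape-format cookies.txt exported from the browser session that solved the CAPTCHA and a placeholder source IP ('cookiefile' and 'source_address' are real YoutubeDL options; the concrete values here are examples):

    import youtube_dl

    ydl_opts = {
        # cookies exported from the browser that solved the CAPTCHA
        'cookiefile': 'cookies.txt',
        # bind to the same external IP the browser used (placeholder)
        'source_address': '203.0.113.5',
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc'])
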
index cc86a1be504693d38cc15f08a567a4c8be01b872..168b1315cc72c3e4143346d34638840aeec09676 100644 (file)
@@ -1101,10 +1101,19 @@ above for how to update youtube-dl.
 HTTP Error 429: Too Many Requests or 402: Payment Required
 
 These two error codes indicate that the service is blocking your IP
-address because of overuse. Contact the service and ask them to unblock
-your IP address, or - if you have acquired a whitelisted IP address
-already - use the --proxy or --source-address options to select another
-IP address.
+address because of overuse. Usually this is a soft block, meaning that
+you can regain access after solving a CAPTCHA. Just open a browser,
+solve the CAPTCHA the service presents, and then pass cookies to
+youtube-dl. Note that if your machine has multiple external IPs, you
+should also pass exactly the same IP you used for solving the CAPTCHA
+with --source-address. You may also need to pass your browser's
+User-Agent HTTP header with --user-agent.
+
+If this is not the case (the service does not offer a CAPTCHA to
+solve), you can contact the service and ask them to unblock your IP
+address, or - if you have already acquired a whitelisted IP address -
+use the --proxy or --source-address options to select another IP
+address.
 
 SyntaxError: Non-ASCII character
 
index e9a8cc27adfc8a3da64f5a41ad8319e1563d5059..174b83bf3b6e1e75e2f8198b271cf6426431dc75 100644 (file)
@@ -98,6 +98,7 @@
  - **BiliBili**
  - **BilibiliAudio**
  - **BilibiliAudioAlbum**
+ - **BiliBiliPlayer**
  - **BioBioChileTV**
  - **BIQLE**
  - **BitChute**
  - **JeuxVideo**
  - **Joj**
  - **Jove**
- - **jpopsuki.tv**
  - **JWPlatform**
  - **Kakao**
  - **Kaltura**
  - **Pokemon**
  - **PolskieRadio**
  - **PolskieRadioCategory**
+ - **Popcorntimes**
  - **PopcornTV**
  - **PornCom**
  - **PornerBros**
  - **Vidzi**
  - **vier**: vier.be and vijf.be
  - **vier:videos**
- - **ViewLift**
- - **ViewLiftEmbed**
+ - **viewlift**
+ - **viewlift:embed**
  - **Viidea**
  - **viki**
  - **viki:channel**
index ce96661716c42ae0bf9c6a8ccb9ddf48c715e0a2..1e204e551b499edead22ce65e371787ca22ffc35 100644 (file)
@@ -816,11 +816,15 @@ class TestYoutubeDL(unittest.TestCase):
             'webpage_url': 'http://example.com',
         }
 
-        def get_ids(params):
+        def get_downloaded_info_dicts(params):
             ydl = YDL(params)
-            # make a copy because the dictionary can be modified
-            ydl.process_ie_result(playlist.copy())
-            return [int(v['id']) for v in ydl.downloaded_info_dicts]
+            # make a deep copy because the dictionary and nested entries
+            # can be modified
+            ydl.process_ie_result(copy.deepcopy(playlist))
+            return ydl.downloaded_info_dicts
+
+        def get_ids(params):
+            return [int(v['id']) for v in get_downloaded_info_dicts(params)]
 
         result = get_ids({})
         self.assertEqual(result, [1, 2, 3, 4])
@@ -852,6 +856,22 @@ class TestYoutubeDL(unittest.TestCase):
         result = get_ids({'playlist_items': '2-4,3-4,3'})
         self.assertEqual(result, [2, 3, 4])
 
+        # Tests for https://github.com/ytdl-org/youtube-dl/issues/10591
+        # @{
+        result = get_downloaded_info_dicts({'playlist_items': '2-4,3-4,3'})
+        self.assertEqual(result[0]['playlist_index'], 2)
+        self.assertEqual(result[1]['playlist_index'], 3)
+
+        result = get_downloaded_info_dicts({'playlist_items': '2-4,3-4,3'})
+        self.assertEqual(result[0]['playlist_index'], 2)
+        self.assertEqual(result[1]['playlist_index'], 3)
+        self.assertEqual(result[2]['playlist_index'], 4)
+
+        result = get_downloaded_info_dicts({'playlist_items': '4,2'})
+        self.assertEqual(result[0]['playlist_index'], 4)
+        self.assertEqual(result[1]['playlist_index'], 2)
+        # @}
+
     def test_urlopen_no_file_protocol(self):
         # see https://github.com/ytdl-org/youtube-dl/issues/8227
         ydl = YDL()
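
The new assertions pin down the fix applied in youtube_dl/YoutubeDL.py below: with --playlist-items, playlist_index now reports each entry's position in the original playlist instead of a sequential download counter. A runnable sketch of the fixed rule for '4,2' (names follow the diff; i is the 1-based download counter):

    playlistitems = [4, 2]  # parsed from --playlist-items '4,2'
    # After the fix, entry i reports its original playlist position:
    fixed = [playlistitems[i - 1] for i in range(1, len(playlistitems) + 1)]
    assert fixed == [4, 2]  # matches the assertions above
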
index 7d57a628e5ef79c5e12d13ccd0a2b515548ffa60..17aaaf20d9a002336af43d1afa2f7d49a186ac9a 100644 (file)
@@ -26,7 +26,6 @@ from youtube_dl.extractor import (
     ThePlatformIE,
     ThePlatformFeedIE,
     RTVEALaCartaIE,
-    FunnyOrDieIE,
     DemocracynowIE,
 )
 
@@ -322,18 +321,6 @@ class TestRtveSubtitles(BaseTestSubtitles):
         self.assertEqual(md5(subtitles['es']), '69e70cae2d40574fb7316f31d6eb7fca')
 
 
-class TestFunnyOrDieSubtitles(BaseTestSubtitles):
-    url = 'http://www.funnyordie.com/videos/224829ff6d/judd-apatow-will-direct-your-vine'
-    IE = FunnyOrDieIE
-
-    def test_allsubtitles(self):
-        self.DL.params['writesubtitles'] = True
-        self.DL.params['allsubtitles'] = True
-        subtitles = self.getSubtitles()
-        self.assertEqual(set(subtitles.keys()), set(['en']))
-        self.assertEqual(md5(subtitles['en']), 'c5593c193eacd353596c11c2d4f9ecc4')
-
-
 class TestDemocracynowSubtitles(BaseTestSubtitles):
     url = 'http://www.democracynow.org/shows/2015/7/3'
     IE = DemocracynowIE
index 748d6a0deb8f348b9d682f478fb36db4c0515ebc..fc830df0c89108cde3ec3a54c1355c09c1def66d 100755 (executable)
Binary files a/youtube-dl and b/youtube-dl differ
index 251810f21fd7545613c8ce9ef0ade45afdbff652..84ee44c659744e4f888a04742c1c3f890a5595fe 100644 (file)
@@ -1711,10 +1711,21 @@ See above for how to update youtube\-dl.
 .PP
 These two error codes indicate that the service is blocking your IP
 address because of overuse.
-Contact the service and ask them to unblock your IP address, or \- if
-you have acquired a whitelisted IP address already \- use the
-\f[C]\-\-proxy\f[] or \f[C]\-\-source\-address\f[] options to select
-another IP address.
+Usually this is a soft block, meaning that you can regain access after
+solving a CAPTCHA.
+Just open a browser, solve the CAPTCHA the service presents, and then
+pass cookies to youtube\-dl.
+Note that if your machine has multiple external IPs, you should also
+pass exactly the same IP you used for solving the CAPTCHA with
+\f[C]\-\-source\-address\f[].
+You may also need to pass your browser\[aq]s \f[C]User\-Agent\f[] HTTP
+header with \f[C]\-\-user\-agent\f[].
+.PP
+If this is not the case (the service does not offer a CAPTCHA to
+solve), you can contact the service and ask them to unblock your IP
+address, or \- if you have already acquired a whitelisted IP address \-
+use the \f[C]\-\-proxy\f[] or \f[C]\-\-source\-address\f[] options to
+select another IP address.
 .SS SyntaxError: Non\-ASCII character
 .PP
 The error
index f5cb46308198e4c65316fba10ccf30d2f3e14b6a..19370f62b0d3ddb91c74ae4b6cf6c569341fbdc5 100755 (executable)
@@ -92,6 +92,7 @@ from .utils import (
     YoutubeDLCookieJar,
     YoutubeDLCookieProcessor,
     YoutubeDLHandler,
+    YoutubeDLRedirectHandler,
 )
 from .cache import Cache
 from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER
@@ -990,7 +991,7 @@ class YoutubeDL(object):
                     'playlist_title': ie_result.get('title'),
                     'playlist_uploader': ie_result.get('uploader'),
                     'playlist_uploader_id': ie_result.get('uploader_id'),
-                    'playlist_index': i + playliststart,
+                    'playlist_index': playlistitems[i - 1] if playlistitems else i + playliststart,
                     'extractor': ie_result['extractor'],
                     'webpage_url': ie_result['webpage_url'],
                     'webpage_url_basename': url_basename(ie_result['webpage_url']),
@@ -2343,6 +2344,7 @@ class YoutubeDL(object):
         debuglevel = 1 if self.params.get('debug_printtraffic') else 0
         https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
         ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
+        redirect_handler = YoutubeDLRedirectHandler()
         data_handler = compat_urllib_request_DataHandler()
 
         # When passing our own FileHandler instance, build_opener won't add the
@@ -2356,7 +2358,7 @@ class YoutubeDL(object):
         file_handler.file_open = file_open
 
         opener = compat_urllib_request.build_opener(
-            proxy_handler, https_handler, cookie_processor, ydlh, data_handler, file_handler)
+            proxy_handler, https_handler, cookie_processor, ydlh, redirect_handler, data_handler, file_handler)
 
         # Delete the default user-agent header, which would otherwise apply in
         # cases where our custom HTTP handler doesn't come into play
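
YoutubeDLRedirectHandler (defined in youtube_dl/utils.py and imported above) slots into the standard urllib handler chain built here. A reduced, hypothetical sketch of how such a handler intercepts redirects; the class name and the bytes-to-text normalization are illustrative stand-ins motivated by the ChangeLog entry 'Force redirect URL to unicode on python 2', not the library's actual implementation:

    import urllib.request

    class RedirectUrlNormalizer(urllib.request.HTTPRedirectHandler):
        # Hypothetical stand-in: normalize the redirect target before
        # the default machinery follows it.
        def redirect_request(self, req, fp, code, msg, headers, newurl):
            if isinstance(newurl, bytes):
                newurl = newurl.decode('utf-8')
            return urllib.request.HTTPRedirectHandler.redirect_request(
                self, req, fp, code, msg, headers, newurl)

    opener = urllib.request.build_opener(RedirectUrlNormalizer())
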
index c75ab131b9955cec1367ec42aa41d8dadde423da..d1b86bd13df40676f76879e8e9ea314e1fb3c9be 100644 (file)
@@ -2754,6 +2754,17 @@ else:
         compat_expanduser = os.path.expanduser
 
 
+if compat_os_name == 'nt' and sys.version_info < (3, 8):
+    # os.path.realpath on Windows does not follow symbolic links
+    # prior to Python 3.8 (see https://bugs.python.org/issue9949)
+    def compat_realpath(path):
+        while os.path.islink(path):
+            path = os.path.abspath(os.readlink(path))
+        return path
+else:
+    compat_realpath = os.path.realpath
+
+
 if sys.version_info < (3, 0):
     def compat_print(s):
         from .utils import preferredencoding
@@ -2998,6 +3009,7 @@ __all__ = [
     'compat_os_name',
     'compat_parse_qs',
     'compat_print',
+    'compat_realpath',
     'compat_setenv',
     'compat_shlex_quote',
     'compat_shlex_split',
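
A short sketch of where the new shim matters: the updater (youtube_dl/update.py, also touched in this commit) must resolve a symlinked binary to its real location before rewriting it, which is what os.path.realpath failed to do on Windows before Python 3.8. The path below is a placeholder:

    from youtube_dl.compat import compat_realpath

    # e.g. /usr/local/bin/youtube-dl -> /opt/youtube-dl/youtube-dl;
    # the updater must overwrite the real file, not the symlink
    filename = compat_realpath('/usr/local/bin/youtube-dl')  # placeholder
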
index 4ac323bf6de6d17016c2425c133aad460072cadd..6637f4f3537591b46870cb7ec3f35d1167cb3cb7 100644 (file)
@@ -110,17 +110,17 @@ class ABCIViewIE(InfoExtractor):
 
     # ABC iview programs are normally available for 14 days only.
     _TESTS = [{
-        'url': 'https://iview.abc.net.au/show/ben-and-hollys-little-kingdom/series/0/video/ZX9371A050S00',
-        'md5': 'cde42d728b3b7c2b32b1b94b4a548afc',
+        'url': 'https://iview.abc.net.au/show/gruen/series/11/video/LE1927H001S00',
+        'md5': '67715ce3c78426b11ba167d875ac6abf',
         'info_dict': {
-            'id': 'ZX9371A050S00',
+            'id': 'LE1927H001S00',
             'ext': 'mp4',
-            'title': "Gaston's Birthday",
-            'series': "Ben And Holly's Little Kingdom",
-            'description': 'md5:f9de914d02f226968f598ac76f105bcf',
-            'upload_date': '20180604',
-            'uploader_id': 'abc4kids',
-            'timestamp': 1528140219,
+            'title': "Series 11 Ep 1",
+            'series': "Gruen",
+            'description': 'md5:52cc744ad35045baf6aded2ce7287f67',
+            'upload_date': '20190925',
+            'uploader_id': 'abc1',
+            'timestamp': 1569445289,
         },
         'params': {
             'skip_download': True,
@@ -148,7 +148,7 @@ class ABCIViewIE(InfoExtractor):
                 'hdnea': token,
             })
 
-        for sd in ('sd', 'sd-low'):
+        for sd in ('720', 'sd', 'sd-low'):
             sd_url = try_get(
                 stream, lambda x: x['streams']['hls'][sd], compat_str)
             if not sd_url:
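
The widened tuple simply probes one more key in the stream map before falling back to the SD variants. A sketch with an illustrative payload shaped like the one this loop reads (the dict below is an assumption, not actual API output):

    from youtube_dl.compat import compat_str
    from youtube_dl.utils import try_get

    stream = {'streams': {'hls': {
        '720': 'https://example.com/720.m3u8',  # now tried first
        'sd': 'https://example.com/sd.m3u8',
    }}}
    for sd in ('720', 'sd', 'sd-low'):
        sd_url = try_get(stream, lambda x: x['streams']['hls'][sd], compat_str)
        if sd_url:
            break  # -> the 720p variant
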
index 80bd696e21f3a4af3c996e9899ce439116e13d19..4dc597e160bcea4f0821719dc82e2908f20d1972 100644 (file)
@@ -24,7 +24,18 @@ from ..utils import (
 
 
 class BiliBiliIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.|bangumi\.|)bilibili\.(?:tv|com)/(?:video/av|anime/(?P<anime_id>\d+)/play#)(?P<id>\d+)'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:(?:www|bangumi)\.)?
+                        bilibili\.(?:tv|com)/
+                        (?:
+                            (?:
+                                video/[aA][vV]|
+                                anime/(?P<anime_id>\d+)/play\#
+                            )(?P<id>\d+)|
+                            video/[bB][vV](?P<id_bv>[^/?#&]+)
+                        )
+                    '''
 
     _TESTS = [{
         'url': 'http://www.bilibili.tv/video/av1074402/',
@@ -92,6 +103,10 @@ class BiliBiliIE(InfoExtractor):
                 'skip_download': True,  # Test metadata only
             },
         }]
+    }, {
+        # new BV video id format
+        'url': 'https://www.bilibili.com/video/BV1JE411F741',
+        'only_matching': True,
     }]
 
     _APP_KEY = 'iVGUTjsxvpLeuDCf'
@@ -109,7 +124,7 @@ class BiliBiliIE(InfoExtractor):
         url, smuggled_data = unsmuggle_url(url, {})
 
         mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = mobj.group('id') or mobj.group('id_bv')
         anime_id = mobj.group('anime_id')
         webpage = self._download_webpage(url, video_id)
 
@@ -419,3 +434,17 @@ class BilibiliAudioAlbumIE(BilibiliAudioBaseIE):
                     entries, am_id, album_title, album_data.get('intro'))
 
         return self.playlist_result(entries, am_id)
+
+
+class BiliBiliPlayerIE(InfoExtractor):
+    _VALID_URL = r'https?://player\.bilibili\.com/player\.html\?.*?\baid=(?P<id>\d+)'
+    _TEST = {
+        'url': 'http://player.bilibili.com/player.html?aid=92494333&cid=157926707&page=1',
+        'only_matching': True,
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        return self.url_result(
+            'http://www.bilibili.tv/video/av%s/' % video_id,
+            ie=BiliBiliIE.ie_key(), video_id=video_id)
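
A quick check that the reworked pattern accepts both id schemes; the regex is the one from the diff above and both URLs come from its test cases:

    import re

    _VALID_URL = r'''(?x)
                    https?://
                        (?:(?:www|bangumi)\.)?
                        bilibili\.(?:tv|com)/
                        (?:
                            (?:
                                video/[aA][vV]|
                                anime/(?P<anime_id>\d+)/play\#
                            )(?P<id>\d+)|
                            video/[bB][vV](?P<id_bv>[^/?#&]+)
                        )
                    '''

    for url in ('http://www.bilibili.tv/video/av1074402/',
                'https://www.bilibili.com/video/BV1JE411F741'):
        mobj = re.match(_VALID_URL, url)
        print(mobj.group('id') or mobj.group('id_bv'))  # 1074402, 1JE411F741
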
index 751a3a8f26c94ecb19c130503593515312bac6c7..fd5ec6033b80513012cf2615fc56e80c7e82cadc 100644 (file)
@@ -1,8 +1,10 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import hashlib
 import json
 import re
+from xml.sax.saxutils import escape
 
 from .common import InfoExtractor
 from ..compat import (
@@ -216,6 +218,29 @@ class CBCWatchBaseIE(InfoExtractor):
         'clearleap': 'http://www.clearleap.com/namespace/clearleap/1.0/',
     }
     _GEO_COUNTRIES = ['CA']
+    _LOGIN_URL = 'https://api.loginradius.com/identity/v2/auth/login'
+    _TOKEN_URL = 'https://cloud-api.loginradius.com/sso/jwt/api/token'
+    _API_KEY = '3f4beddd-2061-49b0-ae80-6f1f2ed65b37'
+    _NETRC_MACHINE = 'cbcwatch'
+
+    def _signature(self, email, password):
+        data = json.dumps({
+            'email': email,
+            'password': password,
+        }).encode()
+        headers = {'content-type': 'application/json'}
+        query = {'apikey': self._API_KEY}
+        resp = self._download_json(self._LOGIN_URL, None, data=data, headers=headers, query=query)
+        access_token = resp['access_token']
+
+        # token
+        query = {
+            'access_token': access_token,
+            'apikey': self._API_KEY,
+            'jwtapp': 'jwt',
+        }
+        resp = self._download_json(self._TOKEN_URL, None, headers=headers, query=query)
+        return resp['signature']
 
     def _call_api(self, path, video_id):
         url = path if path.startswith('http') else self._API_BASE_URL + path
@@ -239,7 +264,8 @@ class CBCWatchBaseIE(InfoExtractor):
     def _real_initialize(self):
         if self._valid_device_token():
             return
-        device = self._downloader.cache.load('cbcwatch', 'device') or {}
+        device = self._downloader.cache.load(
+            'cbcwatch', self._cache_device_key()) or {}
         self._device_id, self._device_token = device.get('id'), device.get('token')
         if self._valid_device_token():
             return
@@ -248,16 +274,30 @@ class CBCWatchBaseIE(InfoExtractor):
     def _valid_device_token(self):
         return self._device_id and self._device_token
 
+    def _cache_device_key(self):
+        email, _ = self._get_login_info()
+        return '%s_device' % hashlib.sha256(email.encode()).hexdigest() if email else 'device'
+
     def _register_device(self):
-        self._device_id = self._device_token = None
         result = self._download_xml(
             self._API_BASE_URL + 'device/register',
             None, 'Acquiring device token',
             data=b'<device><type>web</type></device>')
         self._device_id = xpath_text(result, 'deviceId', fatal=True)
-        self._device_token = xpath_text(result, 'deviceToken', fatal=True)
+        email, password = self._get_login_info()
+        if email and password:
+            signature = self._signature(email, password)
+            data = '<login><token>{0}</token><device><deviceId>{1}</deviceId><type>web</type></device></login>'.format(
+                escape(signature), escape(self._device_id)).encode()
+            url = self._API_BASE_URL + 'device/login'
+            result = self._download_xml(
+                url, None, data=data,
+                headers={'content-type': 'application/xml'})
+            self._device_token = xpath_text(result, 'token', fatal=True)
+        else:
+            self._device_token = xpath_text(result, 'deviceToken', fatal=True)
         self._downloader.cache.store(
-            'cbcwatch', 'device', {
+            'cbcwatch', self._cache_device_key(), {
                 'id': self._device_id,
                 'token': self._device_token,
             })
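
The authentication added above is a plain two-step token exchange against LoginRadius: an email/password login yields an access_token, which is then traded for a JWT signature that gets XML-escaped into the device-login payload. A standalone sketch of the same flow using requests (endpoints and apikey copied from the diff; credentials are placeholders):

    import requests

    API_KEY = '3f4beddd-2061-49b0-ae80-6f1f2ed65b37'

    # Step 1: email/password login yields an access token
    login = requests.post(
        'https://api.loginradius.com/identity/v2/auth/login',
        params={'apikey': API_KEY},
        json={'email': 'user@example.com', 'password': 'secret'},  # placeholders
    ).json()

    # Step 2: trade the access token for a JWT signature
    token = requests.get(
        'https://cloud-api.loginradius.com/sso/jwt/api/token',
        params={
            'access_token': login['access_token'],
            'apikey': API_KEY,
            'jwtapp': 'jwt',
        },
    ).json()
    signature = token['signature']
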
index c050bf9df3fb7ececed5b3e03a70aea9b2c37417..fe42821c731c711e8f0974fd4ce48f5c9aee8e8f 100644 (file)
@@ -4,7 +4,6 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_str
 from ..utils import (
     encode_base_n,
     ExtractorError,
@@ -55,7 +54,7 @@ class EpornerIE(InfoExtractor):
 
         webpage, urlh = self._download_webpage_handle(url, display_id)
 
-        video_id = self._match_id(compat_str(urlh.geturl()))
+        video_id = self._match_id(urlh.geturl())
 
         hash = self._search_regex(
             r'hash\s*:\s*["\']([\da-f]{32})', webpage, 'hash')
index 1cab440f46e9733e6182563b36a464d77b3610c7..ef803b8a78d00544f4286f21d2c43acde751940b 100644 (file)
@@ -105,6 +105,7 @@ from .bilibili import (
     BiliBiliBangumiIE,
     BilibiliAudioIE,
     BilibiliAudioAlbumIE,
+    BiliBiliPlayerIE,
 )
 from .biobiochiletv import BioBioChileTVIE
 from .bitchute import (
@@ -497,7 +498,6 @@ from .jeuxvideo import JeuxVideoIE
 from .jove import JoveIE
 from .joj import JojIE
 from .jwplatform import JWPlatformIE
-from .jpopsukitv import JpopsukiIE
 from .kakao import KakaoIE
 from .kaltura import KalturaIE
 from .kanalplay import KanalPlayIE
@@ -850,6 +850,7 @@ from .polskieradio import (
     PolskieRadioIE,
     PolskieRadioCategoryIE,
 )
+from .popcorntimes import PopcorntimesIE
 from .popcorntv import PopcornTVIE
 from .porn91 import Porn91IE
 from .porncom import PornComIE
index b8fa175880f47d6050e4fa3908994bd9777131ac..306b45fc99a4c3495a233d8fb3c649032641d87a 100644 (file)
@@ -31,7 +31,13 @@ class FranceCultureIE(InfoExtractor):
         webpage = self._download_webpage(url, display_id)
 
         video_data = extract_attributes(self._search_regex(
-            r'(?s)<div[^>]+class="[^"]*?(?:title-zone-diffusion|heading-zone-(?:wrapper|player-button))[^"]*?"[^>]*>.*?(<button[^>]+data-asset-source="[^"]+"[^>]+>)',
+            r'''(?sx)
+                (?:
+                    </h1>|
+                    <div[^>]+class="[^"]*?(?:title-zone-diffusion|heading-zone-(?:wrapper|player-button))[^"]*?"[^>]*>
+                ).*?
+                (<button[^>]+data-asset-source="[^"]+"[^>]+>)
+            ''',
             webpage, 'video data'))
 
         video_url = video_data['data-asset-source']
index 3c002472f795b3600c8f721e76c446f85f3f9bf5..a495ee15aaedc2a64fad86bde03e0be4cab64e85 100644 (file)
@@ -2287,7 +2287,7 @@ class GenericIE(InfoExtractor):
 
         if head_response is not False:
             # Check for redirect
-            new_url = compat_str(head_response.geturl())
+            new_url = head_response.geturl()
             if url != new_url:
                 self.report_following_redirect(new_url)
                 if force_videoid:
@@ -2387,12 +2387,12 @@ class GenericIE(InfoExtractor):
                 return self.playlist_result(
                     self._parse_xspf(
                         doc, video_id, xspf_url=url,
-                        xspf_base_url=compat_str(full_response.geturl())),
+                        xspf_base_url=full_response.geturl()),
                     video_id)
             elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
                 info_dict['formats'] = self._parse_mpd_formats(
                     doc,
-                    mpd_base_url=compat_str(full_response.geturl()).rpartition('/')[0],
+                    mpd_base_url=full_response.geturl().rpartition('/')[0],
                     mpd_url=url)
                 self._sort_formats(info_dict['formats'])
                 return info_dict
@@ -2536,15 +2536,21 @@ class GenericIE(InfoExtractor):
             return self.playlist_from_matches(
                 dailymail_urls, video_id, video_title, ie=DailyMailIE.ie_key())
 
+        # Look for Teachable embeds, must be before Wistia
+        teachable_url = TeachableIE._extract_url(webpage, url)
+        if teachable_url:
+            return self.url_result(teachable_url)
+
         # Look for embedded Wistia player
-        wistia_url = WistiaIE._extract_url(webpage)
-        if wistia_url:
-            return {
-                '_type': 'url_transparent',
-                'url': self._proto_relative_url(wistia_url),
-                'ie_key': WistiaIE.ie_key(),
-                'uploader': video_uploader,
-            }
+        wistia_urls = WistiaIE._extract_urls(webpage)
+        if wistia_urls:
+            playlist = self.playlist_from_matches(wistia_urls, video_id, video_title, ie=WistiaIE.ie_key())
+            for entry in playlist['entries']:
+                entry.update({
+                    '_type': 'url_transparent',
+                    'uploader': video_uploader,
+                })
+            return playlist
 
         # Look for SVT player
         svt_url = SVTIE._extract_url(webpage)
@@ -3140,10 +3146,6 @@ class GenericIE(InfoExtractor):
             return self.playlist_from_matches(
                 peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key())
 
-        teachable_url = TeachableIE._extract_url(webpage, url)
-        if teachable_url:
-            return self.url_result(teachable_url)
-
         indavideo_urls = IndavideoEmbedIE._extract_urls(webpage)
         if indavideo_urls:
             return self.playlist_from_matches(
index 0ee8ea712c72e618a4d7544f26c376e94fcaf70d..fae4251034da29dd252557f86528531b8fcb9235 100644 (file)
@@ -1,12 +1,11 @@
 from __future__ import unicode_literals
 
-import re
-
 from .common import InfoExtractor
 from ..utils import (
-    js_to_json,
+    int_or_none,
+    merge_dicts,
     remove_end,
-    determine_ext,
+    unified_timestamp,
 )
 
 
@@ -14,15 +13,21 @@ class HellPornoIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?hellporno\.(?:com/videos|net/v)/(?P<id>[^/]+)'
     _TESTS = [{
         'url': 'http://hellporno.com/videos/dixie-is-posing-with-naked-ass-very-erotic/',
-        'md5': '1fee339c610d2049699ef2aa699439f1',
+        'md5': 'f0a46ebc0bed0c72ae8fe4629f7de5f3',
         'info_dict': {
             'id': '149116',
             'display_id': 'dixie-is-posing-with-naked-ass-very-erotic',
             'ext': 'mp4',
             'title': 'Dixie is posing with naked ass very erotic',
+            'description': 'md5:9a72922749354edb1c4b6e540ad3d215',
+            'categories': list,
             'thumbnail': r're:https?://.*\.jpg$',
+            'duration': 240,
+            'timestamp': 1398762720,
+            'upload_date': '20140429',
+            'view_count': int,
             'age_limit': 18,
-        }
+        },
     }, {
         'url': 'http://hellporno.net/v/186271/',
         'only_matching': True,
@@ -36,40 +41,36 @@ class HellPornoIE(InfoExtractor):
         title = remove_end(self._html_search_regex(
             r'<title>([^<]+)</title>', webpage, 'title'), ' - Hell Porno')
 
-        flashvars = self._parse_json(self._search_regex(
-            r'var\s+flashvars\s*=\s*({.+?});', webpage, 'flashvars'),
-            display_id, transform_source=js_to_json)
-
-        video_id = flashvars.get('video_id')
-        thumbnail = flashvars.get('preview_url')
-        ext = determine_ext(flashvars.get('postfix'), 'mp4')
-
-        formats = []
-        for video_url_key in ['video_url', 'video_alt_url']:
-            video_url = flashvars.get(video_url_key)
-            if not video_url:
-                continue
-            video_text = flashvars.get('%s_text' % video_url_key)
-            fmt = {
-                'url': video_url,
-                'ext': ext,
-                'format_id': video_text,
-            }
-            m = re.search(r'^(?P<height>\d+)[pP]', video_text)
-            if m:
-                fmt['height'] = int(m.group('height'))
-            formats.append(fmt)
-        self._sort_formats(formats)
+        info = self._parse_html5_media_entries(url, webpage, display_id)[0]
+        self._sort_formats(info['formats'])
 
-        categories = self._html_search_meta(
-            'keywords', webpage, 'categories', default='').split(',')
+        video_id = self._search_regex(
+            (r'chs_object\s*=\s*["\'](\d+)',
+             r'params\[["\']video_id["\']\]\s*=\s*(\d+)'), webpage, 'video id',
+            default=display_id)
+        description = self._search_regex(
+            r'class=["\']desc_video_view_v2[^>]+>([^<]+)', webpage,
+            'description', fatal=False)
+        categories = [
+            c.strip()
+            for c in self._html_search_meta(
+                'keywords', webpage, 'categories', default='').split(',')
+            if c.strip()]
+        duration = int_or_none(self._og_search_property(
+            'video:duration', webpage, fatal=False))
+        timestamp = unified_timestamp(self._og_search_property(
+            'video:release_date', webpage, fatal=False))
+        view_count = int_or_none(self._search_regex(
+            r'>Views\s+(\d+)', webpage, 'view count', fatal=False))
 
-        return {
+        return merge_dicts(info, {
             'id': video_id,
             'display_id': display_id,
             'title': title,
-            'thumbnail': thumbnail,
+            'description': description,
             'categories': categories,
+            'duration': duration,
+            'timestamp': timestamp,
+            'view_count': view_count,
             'age_limit': 18,
-            'formats': formats,
-        }
+        })
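
The rewrite leans on merge_dicts from youtube_dl.utils: earlier dictionaries take precedence and None values never overwrite anything, so the HTML5 info returned by _parse_html5_media_entries keeps priority over the scraped fields. A tiny sketch of that precedence rule (values are illustrative):

    from youtube_dl.utils import merge_dicts

    info = {'id': '149116', 'title': 'from html5 entry'}
    scraped = {'id': None, 'title': 'from regex', 'age_limit': 18}
    print(merge_dicts(info, scraped))
    # -> {'id': '149116', 'title': 'from html5 entry', 'age_limit': 18}
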
index 436759da5480da347a578aba6cb388cb01e97448..a31301985b0c7d212886a2e6e495c7d705714041 100644 (file)
@@ -1,5 +1,7 @@
 from __future__ import unicode_literals
 
+import base64
+import json
 import re
 
 from .common import InfoExtractor
@@ -8,6 +10,7 @@ from ..utils import (
     mimetype2ext,
     parse_duration,
     qualities,
+    try_get,
     url_or_none,
 )
 
@@ -15,15 +18,16 @@ from ..utils import (
 class ImdbIE(InfoExtractor):
     IE_NAME = 'imdb'
     IE_DESC = 'Internet Movie Database trailers'
-    _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video|title|list).+?[/-]vi(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video|title|list).*?[/-]vi(?P<id>\d+)'
 
     _TESTS = [{
         'url': 'http://www.imdb.com/video/imdb/vi2524815897',
         'info_dict': {
             'id': '2524815897',
             'ext': 'mp4',
-            'title': 'No. 2 from Ice Age: Continental Drift (2012)',
+            'title': 'No. 2',
             'description': 'md5:87bd0bdc61e351f21f20d2d7441cb4e7',
+            'duration': 152,
         }
     }, {
         'url': 'http://www.imdb.com/video/_/vi2524815897',
@@ -47,21 +51,23 @@ class ImdbIE(InfoExtractor):
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        webpage = self._download_webpage(
-            'https://www.imdb.com/videoplayer/vi' + video_id, video_id)
-        video_metadata = self._parse_json(self._search_regex(
-            r'window\.IMDbReactInitialState\.push\(({.+?})\);', webpage,
-            'video metadata'), video_id)['videos']['videoMetadata']['vi' + video_id]
-        title = self._html_search_meta(
-            ['og:title', 'twitter:title'], webpage) or self._html_search_regex(
-            r'<title>(.+?)</title>', webpage, 'title', fatal=False) or video_metadata['title']
+
+        data = self._download_json(
+            'https://www.imdb.com/ve/data/VIDEO_PLAYBACK_DATA', video_id,
+            query={
+                'key': base64.b64encode(json.dumps({
+                    'type': 'VIDEO_PLAYER',
+                    'subType': 'FORCE_LEGACY',
+                    'id': 'vi%s' % video_id,
+                }).encode()).decode(),
+            })[0]
 
         quality = qualities(('SD', '480p', '720p', '1080p'))
         formats = []
-        for encoding in video_metadata.get('encodings', []):
+        for encoding in data['videoLegacyEncodings']:
             if not encoding or not isinstance(encoding, dict):
                 continue
-            video_url = url_or_none(encoding.get('videoUrl'))
+            video_url = url_or_none(encoding.get('url'))
             if not video_url:
                 continue
             ext = mimetype2ext(encoding.get(
@@ -69,7 +75,7 @@ class ImdbIE(InfoExtractor):
             if ext == 'm3u8':
                 formats.extend(self._extract_m3u8_formats(
                     video_url, video_id, 'mp4', entry_protocol='m3u8_native',
-                    m3u8_id='hls', fatal=False))
+                    preference=1, m3u8_id='hls', fatal=False))
                 continue
             format_id = encoding.get('definition')
             formats.append({
@@ -80,13 +86,33 @@ class ImdbIE(InfoExtractor):
             })
         self._sort_formats(formats)
 
+        webpage = self._download_webpage(
+            'https://www.imdb.com/video/vi' + video_id, video_id)
+        video_metadata = self._parse_json(self._search_regex(
+            r'args\.push\(\s*({.+?})\s*\)\s*;', webpage,
+            'video metadata'), video_id)
+
+        video_info = video_metadata.get('VIDEO_INFO')
+        if video_info and isinstance(video_info, dict):
+            info = try_get(
+                video_info, lambda x: x[list(video_info.keys())[0]][0], dict)
+        else:
+            info = {}
+
+        title = self._html_search_meta(
+            ['og:title', 'twitter:title'], webpage) or self._html_search_regex(
+            r'<title>(.+?)</title>', webpage, 'title',
+            default=None) or info['videoTitle']
+
         return {
             'id': video_id,
             'title': title,
+            'alt_title': info.get('videoSubTitle'),
             'formats': formats,
-            'description': video_metadata.get('description'),
-            'thumbnail': video_metadata.get('slate', {}).get('url'),
-            'duration': parse_duration(video_metadata.get('duration')),
+            'description': info.get('videoDescription'),
+            'thumbnail': url_or_none(try_get(
+                video_metadata, lambda x: x['videoSlate']['source'])),
+            'duration': parse_duration(info.get('videoRuntime')),
         }
 
 
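The new endpoint takes a single key query parameter: a base64-encoded JSON descriptor of the requested video. It can be reconstructed outside the extractor (video id from the test case above; the extractor passes the value via query=, which URL-encodes it):

    import base64
    import json

    video_id = '2524815897'
    key = base64.b64encode(json.dumps({
        'type': 'VIDEO_PLAYER',
        'subType': 'FORCE_LEGACY',
        'id': 'vi%s' % video_id,
    }).encode()).decode()
    url = 'https://www.imdb.com/ve/data/VIDEO_PLAYBACK_DATA?key=' + key
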
diff --git a/youtube_dl/extractor/jpopsukitv.py b/youtube_dl/extractor/jpopsukitv.py
deleted file mode 100644 (file)
index 4b5f346..0000000
+++ /dev/null
@@ -1,68 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from ..utils import (
-    int_or_none,
-    unified_strdate,
-)
-
-
-class JpopsukiIE(InfoExtractor):
-    IE_NAME = 'jpopsuki.tv'
-    _VALID_URL = r'https?://(?:www\.)?jpopsuki\.tv/(?:category/)?video/[^/]+/(?P<id>\S+)'
-
-    _TEST = {
-        'url': 'http://www.jpopsuki.tv/video/ayumi-hamasaki---evolution/00be659d23b0b40508169cdee4545771',
-        'md5': '88018c0c1a9b1387940e90ec9e7e198e',
-        'info_dict': {
-            'id': '00be659d23b0b40508169cdee4545771',
-            'ext': 'mp4',
-            'title': 'ayumi hamasaki - evolution',
-            'description': 'Release date: 2001.01.31\r\n浜崎あゆみ - evolution',
-            'thumbnail': 'http://www.jpopsuki.tv/cache/89722c74d2a2ebe58bcac65321c115b2.jpg',
-            'uploader': 'plama_chan',
-            'uploader_id': '404',
-            'upload_date': '20121101'
-        }
-    }
-
-    def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, video_id)
-
-        video_url = 'http://www.jpopsuki.tv' + self._html_search_regex(
-            r'<source src="(.*?)" type', webpage, 'video url')
-
-        video_title = self._og_search_title(webpage)
-        description = self._og_search_description(webpage)
-        thumbnail = self._og_search_thumbnail(webpage)
-        uploader = self._html_search_regex(
-            r'<li>from: <a href="/user/view/user/(.*?)/uid/',
-            webpage, 'video uploader', fatal=False)
-        uploader_id = self._html_search_regex(
-            r'<li>from: <a href="/user/view/user/\S*?/uid/(\d*)',
-            webpage, 'video uploader_id', fatal=False)
-        upload_date = unified_strdate(self._html_search_regex(
-            r'<li>uploaded: (.*?)</li>', webpage, 'video upload_date',
-            fatal=False))
-        view_count_str = self._html_search_regex(
-            r'<li>Hits: ([0-9]+?)</li>', webpage, 'video view_count',
-            fatal=False)
-        comment_count_str = self._html_search_regex(
-            r'<h2>([0-9]+?) comments</h2>', webpage, 'video comment_count',
-            fatal=False)
-
-        return {
-            'id': video_id,
-            'url': video_url,
-            'title': video_title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'uploader': uploader,
-            'uploader_id': uploader_id,
-            'upload_date': upload_date,
-            'view_count': int_or_none(view_count_str),
-            'comment_count': int_or_none(comment_count_str),
-        }
index 6ed7da4abaa7a2a45f924b4bf9f919261a40bec9..1b2dcef46621237fd7c7ce376165a6bc5c674606 100644 (file)
@@ -4,7 +4,6 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_str
 from ..utils import (
     clean_html,
     determine_ext,
@@ -36,7 +35,7 @@ class LecturioBaseIE(InfoExtractor):
             self._LOGIN_URL, None, 'Downloading login popup')
 
         def is_logged(url_handle):
-            return self._LOGIN_URL not in compat_str(url_handle.geturl())
+            return self._LOGIN_URL not in url_handle.geturl()
 
         # Already logged in
         if is_logged(urlh):
index 729d8de50fab70cd69bab41fae9db0cba4d7da9b..39f74d2822bc7296df8a5c16e5edfce3298e82ab 100644 (file)
@@ -18,7 +18,6 @@ from ..utils import (
 
 class LimelightBaseIE(InfoExtractor):
     _PLAYLIST_SERVICE_URL = 'http://production-ps.lvp.llnw.net/r/PlaylistService/%s/%s/%s'
-    _API_URL = 'http://api.video.limelight.com/rest/organizations/%s/%s/%s/%s.json'
 
     @classmethod
     def _extract_urls(cls, webpage, source_url):
@@ -70,7 +69,8 @@ class LimelightBaseIE(InfoExtractor):
         try:
             return self._download_json(
                 self._PLAYLIST_SERVICE_URL % (self._PLAYLIST_SERVICE_PATH, item_id, method),
-                item_id, 'Downloading PlaylistService %s JSON' % method, fatal=fatal, headers=headers)
+                item_id, 'Downloading PlaylistService %s JSON' % method,
+                fatal=fatal, headers=headers)
         except ExtractorError as e:
             if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
                 error = self._parse_json(e.cause.read().decode(), item_id)['detail']['contentAccessPermission']
@@ -79,22 +79,22 @@ class LimelightBaseIE(InfoExtractor):
                 raise ExtractorError(error, expected=True)
             raise
 
-    def _call_api(self, organization_id, item_id, method):
-        return self._download_json(
-            self._API_URL % (organization_id, self._API_PATH, item_id, method),
-            item_id, 'Downloading API %s JSON' % method)
-
-    def _extract(self, item_id, pc_method, mobile_method, meta_method, referer=None):
+    def _extract(self, item_id, pc_method, mobile_method, referer=None):
         pc = self._call_playlist_service(item_id, pc_method, referer=referer)
-        metadata = self._call_api(pc['orgId'], item_id, meta_method)
-        mobile = self._call_playlist_service(item_id, mobile_method, fatal=False, referer=referer)
-        return pc, mobile, metadata
+        mobile = self._call_playlist_service(
+            item_id, mobile_method, fatal=False, referer=referer)
+        return pc, mobile
+
+    def _extract_info(self, pc, mobile, i, referer):
+        get_item = lambda x, y: try_get(x, lambda x: x[y][i], dict) or {}
+        pc_item = get_item(pc, 'playlistItems')
+        mobile_item = get_item(mobile, 'mediaList')
+        video_id = pc_item.get('mediaId') or mobile_item['mediaId']
+        title = pc_item.get('title') or mobile_item['title']
 
-    def _extract_info(self, streams, mobile_urls, properties):
-        video_id = properties['media_id']
         formats = []
         urls = []
-        for stream in streams:
+        for stream in pc_item.get('streams', []):
             stream_url = stream.get('url')
             if not stream_url or stream.get('drmProtected') or stream_url in urls:
                 continue
@@ -155,7 +155,7 @@ class LimelightBaseIE(InfoExtractor):
                     })
                 formats.append(fmt)
 
-        for mobile_url in mobile_urls:
+        for mobile_url in mobile_item.get('mobileUrls', []):
             media_url = mobile_url.get('mobileUrl')
             format_id = mobile_url.get('targetMediaPlatform')
             if not media_url or format_id in ('Widevine', 'SmoothStreaming') or media_url in urls:
@@ -179,54 +179,34 @@ class LimelightBaseIE(InfoExtractor):
 
         self._sort_formats(formats)
 
-        title = properties['title']
-        description = properties.get('description')
-        timestamp = int_or_none(properties.get('publish_date') or properties.get('create_date'))
-        duration = float_or_none(properties.get('duration_in_milliseconds'), 1000)
-        filesize = int_or_none(properties.get('total_storage_in_bytes'))
-        categories = [properties.get('category')]
-        tags = properties.get('tags', [])
-        thumbnails = [{
-            'url': thumbnail['url'],
-            'width': int_or_none(thumbnail.get('width')),
-            'height': int_or_none(thumbnail.get('height')),
-        } for thumbnail in properties.get('thumbnails', []) if thumbnail.get('url')]
-
         subtitles = {}
-        for caption in properties.get('captions', []):
-            lang = caption.get('language_code')
-            subtitles_url = caption.get('url')
-            if lang and subtitles_url:
-                subtitles.setdefault(lang, []).append({
-                    'url': subtitles_url,
-                })
-        closed_captions_url = properties.get('closed_captions_url')
-        if closed_captions_url:
-            subtitles.setdefault('en', []).append({
-                'url': closed_captions_url,
-                'ext': 'ttml',
-            })
+        for flag in mobile_item.get('flags') or []:
+            if flag == 'ClosedCaptions':
+                closed_captions = self._call_playlist_service(
+                    video_id, 'getClosedCaptionsDetailsByMediaId',
+                    False, referer) or []
+                for cc in closed_captions:
+                    cc_url = cc.get('webvttFileUrl')
+                    if not cc_url:
+                        continue
+                    lang = cc.get('languageCode') or self._search_regex(r'/([a-z]{2})\.vtt', cc_url, 'lang', default='en')
+                    subtitles.setdefault(lang, []).append({
+                        'url': cc_url,
+                    })
+                break
+
+        get_meta = lambda x: pc_item.get(x) or mobile_item.get(x)
 
         return {
             'id': video_id,
             'title': title,
-            'description': description,
+            'description': get_meta('description'),
             'formats': formats,
-            'timestamp': timestamp,
-            'duration': duration,
-            'filesize': filesize,
-            'categories': categories,
-            'tags': tags,
-            'thumbnails': thumbnails,
+            'duration': float_or_none(get_meta('durationInMilliseconds'), 1000),
+            'thumbnail': get_meta('previewImageUrl') or get_meta('thumbnailImageUrl'),
             'subtitles': subtitles,
         }
 
-    def _extract_info_helper(self, pc, mobile, i, metadata):
-        return self._extract_info(
-            try_get(pc, lambda x: x['playlistItems'][i]['streams'], list) or [],
-            try_get(mobile, lambda x: x['mediaList'][i]['mobileUrls'], list) or [],
-            metadata)
-
 
 class LimelightMediaIE(LimelightBaseIE):
     IE_NAME = 'limelight'
@@ -251,8 +231,6 @@ class LimelightMediaIE(LimelightBaseIE):
             'description': 'md5:8005b944181778e313d95c1237ddb640',
             'thumbnail': r're:^https?://.*\.jpeg$',
             'duration': 144.23,
-            'timestamp': 1244136834,
-            'upload_date': '20090604',
         },
         'params': {
             # m3u8 download
@@ -268,30 +246,29 @@ class LimelightMediaIE(LimelightBaseIE):
             'title': '3Play Media Overview Video',
             'thumbnail': r're:^https?://.*\.jpeg$',
             'duration': 78.101,
-            'timestamp': 1338929955,
-            'upload_date': '20120605',
-            'subtitles': 'mincount:9',
+            # TODO: extract all languages that were accessible via API
+            # 'subtitles': 'mincount:9',
+            'subtitles': 'mincount:1',
         },
     }, {
         'url': 'https://assets.delvenetworks.com/player/loader.swf?mediaId=8018a574f08d416e95ceaccae4ba0452',
         'only_matching': True,
     }]
     _PLAYLIST_SERVICE_PATH = 'media'
-    _API_PATH = 'media'
 
     def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
         video_id = self._match_id(url)
+        source_url = smuggled_data.get('source_url')
         self._initialize_geo_bypass({
             'countries': smuggled_data.get('geo_countries'),
         })
 
-        pc, mobile, metadata = self._extract(
+        pc, mobile = self._extract(
             video_id, 'getPlaylistByMediaId',
-            'getMobilePlaylistByMediaId', 'properties',
-            smuggled_data.get('source_url'))
+            'getMobilePlaylistByMediaId', source_url)
 
-        return self._extract_info_helper(pc, mobile, 0, metadata)
+        return self._extract_info(pc, mobile, 0, source_url)
 
 
 class LimelightChannelIE(LimelightBaseIE):
@@ -313,6 +290,7 @@ class LimelightChannelIE(LimelightBaseIE):
         'info_dict': {
             'id': 'ab6a524c379342f9b23642917020c082',
             'title': 'Javascript Sample Code',
+            'description': 'Javascript Sample Code - http://www.delvenetworks.com/sample-code/playerCode-demo.html',
         },
         'playlist_mincount': 3,
     }, {
@@ -320,22 +298,23 @@ class LimelightChannelIE(LimelightBaseIE):
         'only_matching': True,
     }]
     _PLAYLIST_SERVICE_PATH = 'channel'
-    _API_PATH = 'channels'
 
     def _real_extract(self, url):
         url, smuggled_data = unsmuggle_url(url, {})
         channel_id = self._match_id(url)
+        source_url = smuggled_data.get('source_url')
 
-        pc, mobile, medias = self._extract(
+        pc, mobile = self._extract(
             channel_id, 'getPlaylistByChannelId',
             'getMobilePlaylistWithNItemsByChannelId?begin=0&count=-1',
-            'media', smuggled_data.get('source_url'))
+            source_url)
 
         entries = [
-            self._extract_info_helper(pc, mobile, i, medias['media_list'][i])
-            for i in range(len(medias['media_list']))]
+            self._extract_info(pc, mobile, i, source_url)
+            for i in range(len(pc['playlistItems']))]
 
-        return self.playlist_result(entries, channel_id, pc['title'])
+        return self.playlist_result(
+            entries, channel_id, pc.get('title'), mobile.get('description'))
 
 
 class LimelightChannelListIE(LimelightBaseIE):
@@ -368,10 +347,12 @@ class LimelightChannelListIE(LimelightBaseIE):
     def _real_extract(self, url):
         channel_list_id = self._match_id(url)
 
-        channel_list = self._call_playlist_service(channel_list_id, 'getMobileChannelListById')
+        channel_list = self._call_playlist_service(
+            channel_list_id, 'getMobileChannelListById')
 
         entries = [
             self.url_result('limelight:channel:%s' % channel['id'], 'LimelightChannel')
             for channel in channel_list['channelList']]
 
-        return self.playlist_result(entries, channel_list_id, channel_list['title'])
+        return self.playlist_result(
+            entries, channel_list_id, channel_list['title'])
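
One detail of the rework worth a gloss: get_item closes over the playlist index i and falls back to an empty dict, so a missing PC or mobile playlist (the mobile request is made with fatal=False) degrades gracefully instead of raising. A sketch of the same pattern with illustrative data:

    from youtube_dl.utils import try_get

    pc = {'playlistItems': [{'mediaId': 'abc123', 'title': 'First item'}]}
    mobile = None  # the mobile playlist request may have failed
    i = 0
    get_item = lambda x, y: try_get(x, lambda x: x[y][i], dict) or {}
    pc_item = get_item(pc, 'playlistItems')
    mobile_item = get_item(mobile, 'mediaList')
    print(pc_item.get('mediaId') or mobile_item['mediaId'])  # -> 'abc123'
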
index a78c6556e105220a09b66dac94d8fc27780e3dc5..23ca965d977b1ec682101f048684f20f1b70834c 100644 (file)
@@ -8,7 +8,6 @@ from .common import InfoExtractor
 from ..compat import (
     compat_b64decode,
     compat_HTTPError,
-    compat_str,
 )
 from ..utils import (
     ExtractorError,
@@ -99,7 +98,7 @@ class LinuxAcademyIE(InfoExtractor):
             'sso': 'true',
         })
 
-        login_state_url = compat_str(urlh.geturl())
+        login_state_url = urlh.geturl()
 
         try:
             login_page = self._download_webpage(
@@ -129,7 +128,7 @@ class LinuxAcademyIE(InfoExtractor):
             })
 
         access_token = self._search_regex(
-            r'access_token=([^=&]+)', compat_str(urlh.geturl()),
+            r'access_token=([^=&]+)', urlh.geturl(),
             'access token')
 
         self._download_webpage(
index 027a790b8b182541dd8b592b183b0dbaff505322..933df14952d5cc16857485e306be07f2d32384d3 100644 (file)
@@ -6,7 +6,6 @@ import re
 from .theplatform import ThePlatformBaseIE
 from ..compat import (
     compat_parse_qs,
-    compat_str,
     compat_urllib_parse_urlparse,
 )
 from ..utils import (
@@ -114,7 +113,7 @@ class MediasetIE(ThePlatformBaseIE):
                 continue
             urlh = ie._request_webpage(
                 embed_url, video_id, note='Following embed URL redirect')
-            embed_url = compat_str(urlh.geturl())
+            embed_url = urlh.geturl()
             program_guid = _program_guid(_qs(embed_url))
             if program_guid:
                 entries.append(embed_url)
index 694a264d672288b47c2700b9265bfc0635158ff2..d6eb1574065dece67e28a4b36fa43478dd48dfa3 100644 (file)
@@ -129,7 +129,7 @@ class MediasiteIE(InfoExtractor):
         query = mobj.group('query')
 
         webpage, urlh = self._download_webpage_handle(url, resource_id)  # XXX: add UrlReferrer?
-        redirect_url = compat_str(urlh.geturl())
+        redirect_url = urlh.geturl()
 
         # XXX: might have also extracted UrlReferrer and QueryString from the html
         service_path = compat_urlparse.urljoin(redirect_url, self._html_search_regex(
index 9c8bf05af10b26f0c45da23ddeb5fa42ec51f307..2447c812e021e73991082aefab4bd98e6dd000a1 100644 (file)
@@ -7,6 +7,7 @@ from .common import InfoExtractor
 from ..utils import (
     determine_ext,
     int_or_none,
+    merge_dicts,
     parse_iso8601,
     qualities,
     try_get,
@@ -87,21 +88,25 @@ class NDRIE(NDRBaseIE):
 
     def _extract_embed(self, webpage, display_id):
         embed_url = self._html_search_meta(
-            'embedURL', webpage, 'embed URL', fatal=True)
+            'embedURL', webpage, 'embed URL',
+            default=None) or self._search_regex(
+            r'\bembedUrl["\']\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
+            'embed URL', group='url')
         description = self._search_regex(
             r'<p[^>]+itemprop="description">([^<]+)</p>',
             webpage, 'description', default=None) or self._og_search_description(webpage)
         timestamp = parse_iso8601(
             self._search_regex(
                 r'<span[^>]+itemprop="(?:datePublished|uploadDate)"[^>]+content="([^"]+)"',
-                webpage, 'upload date', fatal=False))
-        return {
+                webpage, 'upload date', default=None))
+        info = self._search_json_ld(webpage, display_id, default={})
+        return merge_dicts({
             '_type': 'url_transparent',
             'url': embed_url,
             'display_id': display_id,
             'description': description,
             'timestamp': timestamp,
-        }
+        }, info)
 
 
 class NJoyIE(NDRBaseIE):
index 6a2c6cb7bb6d039c56fcf7325de422846c437ab5..de6a707c4265c4fc61a57db117a432a95468ab54 100644 (file)
@@ -6,7 +6,7 @@ from .common import InfoExtractor
 
 
 class NhkVodIE(InfoExtractor):
-    _VALID_URL = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand/(?P<type>video|audio)/(?P<id>\d{7}|[a-z]+-\d{8}-\d+)'
+    _VALID_URL = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand/(?P<type>video|audio)/(?P<id>\d{7}|[^/]+?-\d{8}-\d+)'
     # Content available only for a limited period of time. Visit
     # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples.
     _TESTS = [{
@@ -30,8 +30,11 @@ class NhkVodIE(InfoExtractor):
     }, {
         'url': 'https://www3.nhk.or.jp/nhkworld/fr/ondemand/audio/plugin-20190404-1/',
         'only_matching': True,
+    }, {
+        'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/j_art-20150903-1/',
+        'only_matching': True,
     }]
-    _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7/episode/%s/%s/all%s.json'
+    _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7a/episode/%s/%s/all%s.json'
 
     def _real_extract(self, url):
         lang, m_type, episode_id = re.match(self._VALID_URL, url).groups()
@@ -82,15 +85,9 @@ class NhkVodIE(InfoExtractor):
             audio = episode['audio']
             audio_path = audio['audio']
             info['formats'] = self._extract_m3u8_formats(
-                'https://nhks-vh.akamaihd.net/i%s/master.m3u8' % audio_path,
-                episode_id, 'm4a', m3u8_id='hls', fatal=False)
-            for proto in ('rtmpt', 'rtmp'):
-                info['formats'].append({
-                    'ext': 'flv',
-                    'format_id': proto,
-                    'url': '%s://flv.nhk.or.jp/ondemand/mp4:flv%s' % (proto, audio_path),
-                    'vcodec': 'none',
-                })
+                'https://nhkworld-vh.akamaihd.net/i%s/master.m3u8' % audio_path,
+                episode_id, 'm4a', entry_protocol='m3u8_native',
+                m3u8_id='hls', fatal=False)
             for f in info['formats']:
                 f['language'] = lang
         return info
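
entry_protocol='m3u8_native' marks each resulting format for youtube-dl's built-in HLS segment downloader instead of handing the playlist to ffmpeg, which is what makes dropping the RTMP fallbacks safe here. A runnable sketch of the same choice made globally via the hls_prefer_native option (the option behind --hls-prefer-native; the test URL is one from above and may have expired, since NHK content is time-limited):

    from youtube_dl import YoutubeDL

    with YoutubeDL({'hls_prefer_native': True}) as ydl:
        info = ydl.extract_info(
            'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/plugin-20190404-1/',
            download=False)
        for f in info.get('formats', []):
            print(f.get('format_id'), f.get('protocol'))
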
index 901f44b54f40c2c02e120c636a80b0b5bfb4ea2e..2850af5dbe14231df3ee075d3868744e587d0c14 100644 (file)
--- a/youtube_dl/extractor/nova.py
+++ b/youtube_dl/extractor/nova.py
@@ -18,7 +18,7 @@ class NovaEmbedIE(InfoExtractor):
     _VALID_URL = r'https?://media\.cms\.nova\.cz/embed/(?P<id>[^/?#&]+)'
     _TEST = {
         'url': 'https://media.cms.nova.cz/embed/8o0n0r?autoplay=1',
-        'md5': 'b3834f6de5401baabf31ed57456463f7',
+        'md5': 'ee009bafcc794541570edd44b71cbea3',
         'info_dict': {
             'id': '8o0n0r',
             'ext': 'mp4',
@@ -44,11 +44,17 @@ class NovaEmbedIE(InfoExtractor):
         formats = []
         for format_id, format_list in bitrates.items():
             if not isinstance(format_list, list):
-                continue
+                format_list = [format_list]
             for format_url in format_list:
                 format_url = url_or_none(format_url)
                 if not format_url:
                     continue
+                if format_id == 'hls':
+                    formats.extend(self._extract_m3u8_formats(
+                        format_url, video_id, ext='mp4',
+                        entry_protocol='m3u8_native', m3u8_id='hls',
+                        fatal=False))
+                    continue
                 f = {
                     'url': format_url,
                 }
@@ -91,7 +97,7 @@ class NovaIE(InfoExtractor):
     _VALID_URL = r'https?://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+?)(?:\.html|/|$)'
     _TESTS = [{
         'url': 'http://tn.nova.cz/clanek/tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci.html#player_13260',
-        'md5': '1dd7b9d5ea27bc361f110cd855a19bd3',
+        'md5': '249baab7d0104e186e78b0899c7d5f28',
         'info_dict': {
             'id': '1757139',
             'display_id': 'tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci',
@@ -113,7 +119,8 @@ class NovaIE(InfoExtractor):
         'params': {
             # rtmp download
             'skip_download': True,
-        }
+        },
+        'skip': 'gone',
     }, {
         # media.cms.nova.cz embed
         'url': 'https://novaplus.nova.cz/porad/ulice/epizoda/18760-2180-dil',
@@ -128,6 +135,7 @@ class NovaIE(InfoExtractor):
             'skip_download': True,
         },
         'add_ie': [NovaEmbedIE.ie_key()],
+        'skip': 'CHYBA 404: STRÁNKA NENALEZENA',
     }, {
         'url': 'http://sport.tn.nova.cz/clanek/sport/hokej/nhl/zivot-jde-dal-hodnotil-po-vyrazeni-z-playoff-jiri-sekac.html',
         'only_matching': True,
@@ -152,14 +160,29 @@ class NovaIE(InfoExtractor):
 
         webpage = self._download_webpage(url, display_id)
 
+        description = clean_html(self._og_search_description(webpage, default=None))
+        if site == 'novaplus':
+            upload_date = unified_strdate(self._search_regex(
+                r'(\d{1,2}-\d{1,2}-\d{4})$', display_id, 'upload date', default=None))
+        elif site == 'fanda':
+            upload_date = unified_strdate(self._search_regex(
+                r'<span class="date_time">(\d{1,2}\.\d{1,2}\.\d{4})', webpage, 'upload date', default=None))
+        else:
+            upload_date = None
+
         # novaplus
         embed_id = self._search_regex(
             r'<iframe[^>]+\bsrc=["\'](?:https?:)?//media\.cms\.nova\.cz/embed/([^/?#&]+)',
             webpage, 'embed url', default=None)
         if embed_id:
-            return self.url_result(
-                'https://media.cms.nova.cz/embed/%s' % embed_id,
-                ie=NovaEmbedIE.ie_key(), video_id=embed_id)
+            return {
+                '_type': 'url_transparent',
+                'url': 'https://media.cms.nova.cz/embed/%s' % embed_id,
+                'ie_key': NovaEmbedIE.ie_key(),
+                'id': embed_id,
+                'description': description,
+                'upload_date': upload_date
+            }
 
         video_id = self._search_regex(
             [r"(?:media|video_id)\s*:\s*'(\d+)'",
@@ -233,18 +256,8 @@ class NovaIE(InfoExtractor):
         self._sort_formats(formats)
 
         title = mediafile.get('meta', {}).get('title') or self._og_search_title(webpage)
-        description = clean_html(self._og_search_description(webpage, default=None))
         thumbnail = config.get('poster')
 
-        if site == 'novaplus':
-            upload_date = unified_strdate(self._search_regex(
-                r'(\d{1,2}-\d{1,2}-\d{4})$', display_id, 'upload date', default=None))
-        elif site == 'fanda':
-            upload_date = unified_strdate(self._search_regex(
-                r'<span class="date_time">(\d{1,2}\.\d{1,2}\.\d{4})', webpage, 'upload date', default=None))
-        else:
-            upload_date = None
-
         return {
             'id': video_id,
             'display_id': display_id,
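
The NovaEmbedIE change above stops skipping scalar entries in the bitrates map and instead normalizes them to one-element lists before the inner loop. The idiom in isolation, with toy data standing in for the Nova player JSON:

    # API fields that may be either a scalar or a list get wrapped so a
    # single loop handles both shapes (URLs below are illustrative only).
    bitrates = {'hls': 'https://example.com/master.m3u8',
                'mp4': ['https://example.com/lq.mp4', 'https://example.com/hq.mp4']}

    for format_id, format_list in bitrates.items():
        if not isinstance(format_list, list):
            format_list = [format_list]
        for format_url in format_list:
            print(format_id, format_url)
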
index a5e8baa7e2542f4e8d6a8c83dea7bddecf82413d..53acc6e574c0743a223d552d95d8806dc071ed41 100644 (file)
--- a/youtube_dl/extractor/npr.py
+++ b/youtube_dl/extractor/npr.py
@@ -4,6 +4,7 @@ from .common import InfoExtractor
 from ..utils import (
     int_or_none,
     qualities,
+    url_or_none,
 )
 
 
@@ -48,6 +49,10 @@ class NprIE(InfoExtractor):
             },
         }],
         'expected_warnings': ['Failed to download m3u8 information'],
+    }, {
+        # multimedia, no formats, stream
+        'url': 'https://www.npr.org/2020/02/14/805476846/laura-stevenson-tiny-desk-concert',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
@@ -95,6 +100,17 @@ class NprIE(InfoExtractor):
                             'format_id': format_id,
                             'quality': quality(format_id),
                         })
+            for stream_id, stream_entry in media.get('stream', {}).items():
+                if not isinstance(stream_entry, dict):
+                    continue
+                if stream_id != 'hlsUrl':
+                    continue
+                stream_url = url_or_none(stream_entry.get('$text'))
+                if not stream_url:
+                    continue
+                formats.extend(self._extract_m3u8_formats(
+                    stream_url, stream_id, 'mp4', 'm3u8_native',
+                    m3u8_id='hls', fatal=False))
             self._sort_formats(formats)
 
             entries.append({
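
url_or_none(), newly imported for the stream handling above, keeps a value only when it actually looks like a URL, so malformed API entries simply fall through the continue guards. A quick illustration against this tree's utils:

    from youtube_dl.utils import url_or_none

    print(url_or_none('https://example.com/master.m3u8'))  # kept as-is
    print(url_or_none('not a url'))                        # None
    print(url_or_none(None))                               # None
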
index 2bb77ab249239163d8318a57e8fd0fdb57d2e32a..fc78ca56c90d37b00c1f396aee7c896d54fb91c9 100644 (file)
--- a/youtube_dl/extractor/nytimes.py
+++ b/youtube_dl/extractor/nytimes.py
@@ -69,10 +69,10 @@ class NYTimesBaseIE(InfoExtractor):
                     'width': int_or_none(video.get('width')),
                     'height': int_or_none(video.get('height')),
                     'filesize': get_file_size(video.get('file_size') or video.get('fileSize')),
-                    'tbr': int_or_none(video.get('bitrate'), 1000),
+                    'tbr': int_or_none(video.get('bitrate'), 1000) or None,
                     'ext': ext,
                 })
-        self._sort_formats(formats)
+        self._sort_formats(formats, ('height', 'width', 'filesize', 'tbr', 'fps', 'format_id'))
 
         thumbnails = []
         for image in video_data.get('images', []):
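
In the tbr change above, int_or_none(..., 1000) scales bits per second down to kbit/s, and the trailing 'or None' turns a zero bitrate into None so the explicit field_preference sort does not rank real formats below zero-bitrate ones. Against this tree's utils:

    from youtube_dl.utils import int_or_none

    print(int_or_none('4500000', 1000))   # 4500 (kbit/s)
    print(int_or_none(0, 1000) or None)   # None instead of 0
    print(int_or_none(None, 1000))        # None
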
index d3a83ea2bb5215e34e72ad85bf99867697e2e1b2..48fb9541693c35878317f22ed9dd6e2da4412ced 100644 (file)
--- a/youtube_dl/extractor/peertube.py
+++ b/youtube_dl/extractor/peertube.py
@@ -8,6 +8,7 @@ from ..compat import compat_str
 from ..utils import (
     int_or_none,
     parse_resolution,
+    str_or_none,
     try_get,
     unified_timestamp,
     url_or_none,
@@ -415,6 +416,7 @@ class PeerTubeIE(InfoExtractor):
                             peertube\.cpy\.re
                         )'''
     _UUID_RE = r'[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}'
+    _API_BASE = 'https://%s/api/v1/videos/%s/%s'
     _VALID_URL = r'''(?x)
                     (?:
                         peertube:(?P<host>[^:]+):|
@@ -423,26 +425,30 @@ class PeerTubeIE(InfoExtractor):
                     (?P<id>%s)
                     ''' % (_INSTANCES_RE, _UUID_RE)
     _TESTS = [{
-        'url': 'https://peertube.cpy.re/videos/watch/2790feb0-8120-4e63-9af3-c943c69f5e6c',
-        'md5': '80f24ff364cc9d333529506a263e7feb',
+        'url': 'https://framatube.org/videos/watch/9c9de5e8-0a1e-484a-b099-e80766180a6d',
+        'md5': '9bed8c0137913e17b86334e5885aacff',
         'info_dict': {
-            'id': '2790feb0-8120-4e63-9af3-c943c69f5e6c',
+            'id': '9c9de5e8-0a1e-484a-b099-e80766180a6d',
             'ext': 'mp4',
-            'title': 'wow',
-            'description': 'wow such video, so gif',
+            'title': 'What is PeerTube?',
+            'description': 'md5:3fefb8dde2b189186ce0719fda6f7b10',
             'thumbnail': r're:https?://.*\.(?:jpg|png)',
-            'timestamp': 1519297480,
-            'upload_date': '20180222',
-            'uploader': 'Luclu7',
-            'uploader_id': '7fc42640-efdb-4505-a45d-a15b1a5496f1',
-            'uploder_url': 'https://peertube.nsa.ovh/accounts/luclu7',
-            'license': 'Unknown',
-            'duration': 3,
+            'timestamp': 1538391166,
+            'upload_date': '20181001',
+            'uploader': 'Framasoft',
+            'uploader_id': '3',
+            'uploader_url': 'https://framatube.org/accounts/framasoft',
+            'channel': 'Les vidéos de Framasoft',
+            'channel_id': '2',
+            'channel_url': 'https://framatube.org/video-channels/bf54d359-cfad-4935-9d45-9d6be93f63e8',
+            'language': 'en',
+            'license': 'Attribution - Share Alike',
+            'duration': 113,
             'view_count': int,
             'like_count': int,
             'dislike_count': int,
-            'tags': list,
-            'categories': list,
+            'tags': ['framasoft', 'peertube'],
+            'categories': ['Science & Technology'],
         }
     }, {
         'url': 'https://peertube.tamanoir.foucry.net/videos/watch/0b04f13d-1e18-4f1d-814e-4979aa7c9c44',
@@ -484,13 +490,38 @@ class PeerTubeIE(InfoExtractor):
                 entries = [peertube_url]
         return entries
 
+    def _call_api(self, host, video_id, path, note=None, errnote=None, fatal=True):
+        return self._download_json(
+            self._API_BASE % (host, video_id, path), video_id,
+            note=note, errnote=errnote, fatal=fatal)
+
+    def _get_subtitles(self, host, video_id):
+        captions = self._call_api(
+            host, video_id, 'captions', note='Downloading captions JSON',
+            fatal=False)
+        if not isinstance(captions, dict):
+            return
+        data = captions.get('data')
+        if not isinstance(data, list):
+            return
+        subtitles = {}
+        for e in data:
+            language_id = try_get(e, lambda x: x['language']['id'], compat_str)
+            caption_url = urljoin('https://%s' % host, e.get('captionPath'))
+            if not caption_url:
+                continue
+            subtitles.setdefault(language_id or 'en', []).append({
+                'url': caption_url,
+            })
+        return subtitles
+
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
         host = mobj.group('host') or mobj.group('host_2')
         video_id = mobj.group('id')
 
-        video = self._download_json(
-            'https://%s/api/v1/videos/%s' % (host, video_id), video_id)
+        video = self._call_api(
+            host, video_id, '', note='Downloading video JSON')
 
         title = video['name']
 
@@ -513,10 +544,28 @@ class PeerTubeIE(InfoExtractor):
             formats.append(f)
         self._sort_formats(formats)
 
-        def account_data(field):
-            return try_get(video, lambda x: x['account'][field], compat_str)
+        full_description = self._call_api(
+            host, video_id, 'description', note='Downloading description JSON',
+            fatal=False)
+
+        description = None
+        if isinstance(full_description, dict):
+            description = str_or_none(full_description.get('description'))
+        if not description:
+            description = video.get('description')
+
+        subtitles = self.extract_subtitles(host, video_id)
+
+        def data(section, field, type_):
+            return try_get(video, lambda x: x[section][field], type_)
+
+        def account_data(field, type_):
+            return data('account', field, type_)
+
+        def channel_data(field, type_):
+            return data('channel', field, type_)
 
-        category = try_get(video, lambda x: x['category']['label'], compat_str)
+        category = data('category', 'label', compat_str)
         categories = [category] if category else None
 
         nsfw = video.get('nsfw')
@@ -528,14 +577,17 @@ class PeerTubeIE(InfoExtractor):
         return {
             'id': video_id,
             'title': title,
-            'description': video.get('description'),
+            'description': description,
             'thumbnail': urljoin(url, video.get('thumbnailPath')),
             'timestamp': unified_timestamp(video.get('publishedAt')),
-            'uploader': account_data('displayName'),
-            'uploader_id': account_data('uuid'),
-            'uploder_url': account_data('url'),
-            'license': try_get(
-                video, lambda x: x['licence']['label'], compat_str),
+            'uploader': account_data('displayName', compat_str),
+            'uploader_id': str_or_none(account_data('id', int)),
+            'uploader_url': url_or_none(account_data('url', compat_str)),
+            'channel': channel_data('displayName', compat_str),
+            'channel_id': str_or_none(channel_data('id', int)),
+            'channel_url': url_or_none(channel_data('url', compat_str)),
+            'language': data('language', 'id', compat_str),
+            'license': data('licence', 'label', compat_str),
             'duration': int_or_none(video.get('duration')),
             'view_count': int_or_none(video.get('views')),
             'like_count': int_or_none(video.get('likes')),
@@ -544,4 +596,5 @@ class PeerTubeIE(InfoExtractor):
             'tags': try_get(video, lambda x: x['tags'], list),
             'categories': categories,
             'formats': formats,
+            'subtitles': subtitles
         }
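
The new _get_subtitles() builds the mapping youtube-dl expects under info_dict['subtitles']: language code to list of caption dicts. Reproduced with inline sample data (host taken from the framatube.org test above):

    sample_captions = [
        {'language': {'id': 'fr'}, 'captionPath': '/lazy-static/video-captions/a.vtt'},
        {'language': {}, 'captionPath': '/lazy-static/video-captions/b.vtt'},
    ]
    subtitles = {}
    for e in sample_captions:
        language_id = (e.get('language') or {}).get('id')
        caption_path = e.get('captionPath')
        if not caption_path:
            continue
        # fall back to 'en' like the extractor does when the id is missing
        subtitles.setdefault(language_id or 'en', []).append({
            'url': 'https://framatube.org' + caption_path,
        })
    print(subtitles)
    # {'fr': [{'url': ...a.vtt}], 'en': [{'url': ...b.vtt}]}
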
index 602207bebdd6a01d7f33dbf08302ab5a75ccf207..23c8256b59dab4a92ae79ef48dc8e3b0adf0ff68 100644 (file)
--- a/youtube_dl/extractor/platzi.py
+++ b/youtube_dl/extractor/platzi.py
@@ -46,7 +46,7 @@ class PlatziBaseIE(InfoExtractor):
             headers={'Referer': self._LOGIN_URL})
 
         # login succeeded
-        if 'platzi.com/login' not in compat_str(urlh.geturl()):
+        if 'platzi.com/login' not in urlh.geturl():
             return
 
         login_error = self._webpage_read_content(
index dd5f17f1192c3543636f6ff24624b0c9cc9a0bd6..80222d42831b8e58116449f96a615ea250b2985f 100644 (file)
--- a/youtube_dl/extractor/pokemon.py
+++ b/youtube_dl/extractor/pokemon.py
@@ -20,20 +20,16 @@ class PokemonIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'The Ol’ Raise and Switch!',
             'description': 'md5:7db77f7107f98ba88401d3adc80ff7af',
-            'timestamp': 1511824728,
-            'upload_date': '20171127',
         },
         'add_ie': ['LimelightMedia'],
     }, {
         # no data-video-title
-        'url': 'https://www.pokemon.com/us/pokemon-episodes/pokemon-movies/pokemon-the-rise-of-darkrai-2008',
+        'url': 'https://www.pokemon.com/fr/episodes-pokemon/films-pokemon/pokemon-lascension-de-darkrai-2008',
         'info_dict': {
-            'id': '99f3bae270bf4e5097274817239ce9c8',
+            'id': 'dfbaf830d7e54e179837c50c0c6cc0e1',
             'ext': 'mp4',
-            'title': 'Pokémon: The Rise of Darkrai',
-            'description': 'md5:ea8fbbf942e1e497d54b19025dd57d9d',
-            'timestamp': 1417778347,
-            'upload_date': '20141205',
+            'title': "Pokémon : L'ascension de Darkrai",
+            'description': 'md5:d1dbc9e206070c3e14a06ff557659fb5',
         },
         'add_ie': ['LimelightMedia'],
         'params': {
diff --git a/youtube_dl/extractor/popcorntimes.py b/youtube_dl/extractor/popcorntimes.py
new file mode 100644 (file)
index 0000000..7bf7f98
--- /dev/null
+++ b/youtube_dl/extractor/popcorntimes.py
@@ -0,0 +1,99 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+    compat_b64decode,
+    compat_chr,
+)
+from ..utils import int_or_none
+
+
+class PopcorntimesIE(InfoExtractor):
+    _VALID_URL = r'https?://popcorntimes\.tv/[^/]+/m/(?P<id>[^/]+)/(?P<display_id>[^/?#&]+)'
+    _TEST = {
+        'url': 'https://popcorntimes.tv/de/m/A1XCFvz/haensel-und-gretel-opera-fantasy',
+        'md5': '93f210991ad94ba8c3485950a2453257',
+        'info_dict': {
+            'id': 'A1XCFvz',
+            'display_id': 'haensel-und-gretel-opera-fantasy',
+            'ext': 'mp4',
+            'title': 'Hänsel und Gretel',
+            'description': 'md5:1b8146791726342e7b22ce8125cf6945',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'creator': 'John Paul',
+            'release_date': '19541009',
+            'duration': 4260,
+            'tbr': 5380,
+            'width': 720,
+            'height': 540,
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id, display_id = mobj.group('id', 'display_id')
+
+        webpage = self._download_webpage(url, display_id)
+
+        title = self._search_regex(
+            r'<h1>([^<]+)', webpage, 'title',
+            default=None) or self._html_search_meta(
+            'ya:ovs:original_name', webpage, 'title', fatal=True)
+
+        loc = self._search_regex(
+            r'PCTMLOC\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, 'loc',
+            group='value')
+
+        loc_b64 = ''
+        for c in loc:
+            c_ord = ord(c)
+            if ord('a') <= c_ord <= ord('z') or ord('A') <= c_ord <= ord('Z'):
+                upper = ord('Z') if c_ord <= ord('Z') else ord('z')
+                c_ord += 13
+                if upper < c_ord:
+                    c_ord -= 26
+            loc_b64 += compat_chr(c_ord)
+
+        video_url = compat_b64decode(loc_b64).decode('utf-8')
+
+        description = self._html_search_regex(
+            r'(?s)<div[^>]+class=["\']pt-movie-desc[^>]+>(.+?)</div>', webpage,
+            'description', fatal=False)
+
+        thumbnail = self._search_regex(
+            r'<img[^>]+class=["\']video-preview[^>]+\bsrc=(["\'])(?P<value>(?:(?!\1).)+)\1',
+            webpage, 'thumbnail', default=None,
+            group='value') or self._og_search_thumbnail(webpage)
+
+        creator = self._html_search_meta(
+            'video:director', webpage, 'creator', default=None)
+
+        release_date = self._html_search_meta(
+            'video:release_date', webpage, default=None)
+        if release_date:
+            release_date = release_date.replace('-', '')
+
+        def int_meta(name):
+            return int_or_none(self._html_search_meta(
+                name, webpage, default=None))
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'url': video_url,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'creator': creator,
+            'release_date': release_date,
+            'duration': int_meta('video:duration'),
+            'tbr': int_meta('ya:ovs:bitrate'),
+            'width': int_meta('og:video:width'),
+            'height': int_meta('og:video:height'),
+            'http_headers': {
+                'Referer': url,
+            },
+        }
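
The character loop in _real_extract() above is a hand-rolled ROT13: PCTMLOC is the ROT13 of a base64 string, presumably spelled out manually for Python 2/3 string-type reasons. A Python 3 round trip with the stdlib rot_13 codec (stand-in URL, not real site data):

    import base64
    import codecs

    video_url = 'https://example.com/video.mp4'  # stand-in value

    # Encode the way the site does: base64, then ROT13 over the letters
    # (digits, '+', '/' and '=' pass through unchanged).
    loc = codecs.encode(base64.b64encode(video_url.encode()).decode(), 'rot_13')

    # Decode, equivalent to the extractor's character loop plus b64decode.
    loc_b64 = codecs.decode(loc, 'rot_13')
    assert base64.b64decode(loc_b64).decode('utf-8') == video_url
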
index 27d65d4b9cdcf1e068b0d6971502eaa0caccf895..c6052ac9f966f332d0cfb1f7acfe68b0a143d2b7 100644 (file)
--- a/youtube_dl/extractor/pornhd.py
+++ b/youtube_dl/extractor/pornhd.py
@@ -8,6 +8,7 @@ from ..utils import (
     ExtractorError,
     int_or_none,
     js_to_json,
+    merge_dicts,
     urljoin,
 )
 
@@ -27,23 +28,22 @@ class PornHdIE(InfoExtractor):
             'view_count': int,
             'like_count': int,
             'age_limit': 18,
-        }
+        },
+        'skip': 'HTTP Error 404: Not Found',
     }, {
-        # removed video
         'url': 'http://www.pornhd.com/videos/1962/sierra-day-gets-his-cum-all-over-herself-hd-porn-video',
-        'md5': '956b8ca569f7f4d8ec563e2c41598441',
+        'md5': '1b7b3a40b9d65a8e5b25f7ab9ee6d6de',
         'info_dict': {
             'id': '1962',
             'display_id': 'sierra-day-gets-his-cum-all-over-herself-hd-porn-video',
             'ext': 'mp4',
-            'title': 'Sierra loves doing laundry',
+            'title': 'md5:98c6f8b2d9c229d0f0fde47f61a1a759',
             'description': 'md5:8ff0523848ac2b8f9b065ba781ccf294',
             'thumbnail': r're:^https?://.*\.jpg',
             'view_count': int,
             'like_count': int,
             'age_limit': 18,
         },
-        'skip': 'Not available anymore',
     }]
 
     def _real_extract(self, url):
@@ -61,7 +61,13 @@ class PornHdIE(InfoExtractor):
             r"(?s)sources'?\s*[:=]\s*(\{.+?\})",
             webpage, 'sources', default='{}')), video_id)
 
+        info = {}
         if not sources:
+            entries = self._parse_html5_media_entries(url, webpage, video_id)
+            if entries:
+                info = entries[0]
+
+        if not sources and not info:
             message = self._html_search_regex(
                 r'(?s)<(div|p)[^>]+class="no-video"[^>]*>(?P<value>.+?)</\1',
                 webpage, 'error message', group='value')
@@ -80,23 +86,29 @@ class PornHdIE(InfoExtractor):
                 'format_id': format_id,
                 'height': height,
             })
-        self._sort_formats(formats)
+        if formats:
+            info['formats'] = formats
+        self._sort_formats(info['formats'])
 
         description = self._html_search_regex(
-            r'<(div|p)[^>]+class="description"[^>]*>(?P<value>[^<]+)</\1',
-            webpage, 'description', fatal=False, group='value')
+            (r'(?s)<section[^>]+class=["\']video-description[^>]+>(?P<value>.+?)</section>',
+             r'<(div|p)[^>]+class="description"[^>]*>(?P<value>[^<]+)</\1'),
+            webpage, 'description', fatal=False,
+            group='value') or self._html_search_meta(
+            'description', webpage, default=None) or self._og_search_description(webpage)
         view_count = int_or_none(self._html_search_regex(
             r'(\d+) views\s*<', webpage, 'view count', fatal=False))
         thumbnail = self._search_regex(
             r"poster'?\s*:\s*([\"'])(?P<url>(?:(?!\1).)+)\1", webpage,
-            'thumbnail', fatal=False, group='url')
+            'thumbnail', default=None, group='url')
 
         like_count = int_or_none(self._search_regex(
-            (r'(\d+)\s*</11[^>]+>(?:&nbsp;|\s)*\blikes',
+            (r'(\d+)</span>\s*likes',
+             r'(\d+)\s*</11[^>]+>(?:&nbsp;|\s)*\blikes',
              r'class=["\']save-count["\'][^>]*>\s*(\d+)'),
             webpage, 'like count', fatal=False))
 
-        return {
+        return merge_dicts(info, {
             'id': video_id,
             'display_id': display_id,
             'title': title,
@@ -106,4 +118,4 @@ class PornHdIE(InfoExtractor):
             'like_count': like_count,
             'formats': formats,
             'age_limit': 18,
-        }
+        })
index b3251ccd9b2300188f7efce561c3c5d1fbff702e..3567a32839eef2f75123a3f1b939038cf3eaf678 100644 (file)
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -52,7 +52,7 @@ class PornHubIE(PornHubBaseIE):
     _VALID_URL = r'''(?x)
                     https?://
                         (?:
-                            (?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
+                            (?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
                             (?:www\.)?thumbzilla\.com/video/
                         )
                         (?P<id>[\da-z]+)
@@ -149,6 +149,9 @@ class PornHubIE(PornHubBaseIE):
     }, {
         'url': 'https://www.pornhub.net/view_video.php?viewkey=203640933',
         'only_matching': True,
+    }, {
+        'url': 'https://www.pornhubpremium.com/view_video.php?viewkey=ph5e4acdae54a82',
+        'only_matching': True,
     }]
 
     @staticmethod
@@ -166,6 +169,13 @@ class PornHubIE(PornHubBaseIE):
         host = mobj.group('host') or 'pornhub.com'
         video_id = mobj.group('id')
 
+        if 'premium' in host:
+            if not self._downloader.params.get('cookiefile'):
+                raise ExtractorError(
+                    'PornHub Premium requires authentication.'
+                    ' You may want to use --cookies.',
+                    expected=True)
+
         self._set_cookie(host, 'age_verified', '1')
 
         def dl_webpage(platform):
@@ -189,10 +199,10 @@ class PornHubIE(PornHubBaseIE):
         # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying
         # on that anymore.
         title = self._html_search_meta(
-            'twitter:title', webpage, default=None) or self._search_regex(
-            (r'<h1[^>]+class=["\']title["\'][^>]*>(?P<title>[^<]+)',
-             r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1',
-             r'shareTitle\s*=\s*(["\'])(?P<title>.+?)\1'),
+            'twitter:title', webpage, default=None) or self._html_search_regex(
+            (r'(?s)<h1[^>]+class=["\']title["\'][^>]*>(?P<title>.+?)</h1>',
+             r'<div[^>]+data-video-title=(["\'])(?P<title>(?:(?!\1).)+)\1',
+             r'shareTitle["\']\s*[=:]\s*(["\'])(?P<title>(?:(?!\1).)+)\1'),
             webpage, 'title', group='title')
 
         video_urls = []
@@ -405,7 +415,7 @@ class PornHubPlaylistBaseIE(PornHubBaseIE):
 
 
 class PornHubUserIE(PornHubPlaylistBaseIE):
-    _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?pornhub\.(?:com|net)/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)'
+    _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)'
     _TESTS = [{
         'url': 'https://www.pornhub.com/model/zoe_ph',
         'playlist_mincount': 118,
@@ -473,7 +483,7 @@ class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE):
 
 
 class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE):
-    _VALID_URL = r'https?://(?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/(?P<id>(?:[^/]+/)*[^/?#&]+)'
+    _VALID_URL = r'https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net))/(?P<id>(?:[^/]+/)*[^/?#&]+)'
     _TESTS = [{
         'url': 'https://www.pornhub.com/model/zoe_ph/videos',
         'only_matching': True,
@@ -588,7 +598,7 @@ class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE):
 
 
 class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE):
-    _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)'
+    _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub(?:premium)?\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)'
     _TESTS = [{
         'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload',
         'info_dict': {
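
The new premium check above raises early unless a cookie jar was supplied. From the embedding API, 'cookiefile' is the parameter the --cookies flag sets; a sketch with a hypothetical Netscape-format cookie file exported from a logged-in browser session and the premium test URL from above:

    from youtube_dl import YoutubeDL

    opts = {'cookiefile': 'pornhubpremium-cookies.txt'}  # hypothetical path
    with YoutubeDL(opts) as ydl:
        ydl.download(['https://www.pornhubpremium.com/view_video.php?viewkey=ph5e4acdae54a82'])
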
index 4942437c7dbe918bd408bf805f8b76cc1d7967e4..2cc66512241dbc6f65589e52d842cf70b7250ccb 100644 (file)
--- a/youtube_dl/extractor/safari.py
+++ b/youtube_dl/extractor/safari.py
@@ -8,7 +8,6 @@ from .common import InfoExtractor
 
 from ..compat import (
     compat_parse_qs,
-    compat_str,
     compat_urlparse,
 )
 from ..utils import (
@@ -39,13 +38,13 @@ class SafariBaseIE(InfoExtractor):
             'Downloading login page')
 
         def is_logged(urlh):
-            return 'learning.oreilly.com/home/' in compat_str(urlh.geturl())
+            return 'learning.oreilly.com/home/' in urlh.geturl()
 
         if is_logged(urlh):
             self.LOGGED_IN = True
             return
 
-        redirect_url = compat_str(urlh.geturl())
+        redirect_url = urlh.geturl()
         parsed_url = compat_urlparse.urlparse(redirect_url)
         qs = compat_parse_qs(parsed_url.query)
         next_uri = compat_urlparse.urljoin(
index e579d42cf525b56500b98c84272b67b279e36baa..9401bf2cf7fcdad2eb218f5a3d072399932fea9a 100644 (file)
--- a/youtube_dl/extractor/servus.py
+++ b/youtube_dl/extractor/servus.py
@@ -7,9 +7,18 @@ from .common import InfoExtractor
 
 
 class ServusIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)/(?P<id>[aA]{2}-\w+|\d+-\d+)'
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:www\.)?
+                        (?:
+                            servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)|
+                            servustv\.com/videos
+                        )
+                        /(?P<id>[aA]{2}-\w+|\d+-\d+)
+                    '''
     _TESTS = [{
-        'url': 'https://www.servus.com/de/p/Die-Gr%C3%BCnen-aus-Sicht-des-Volkes/AA-1T6VBU5PW1W12/',
+        # new URL schema
+        'url': 'https://www.servustv.com/videos/aa-1t6vbu5pw1w12/',
         'md5': '3e1dd16775aa8d5cbef23628cfffc1f4',
         'info_dict': {
             'id': 'AA-1T6VBU5PW1W12',
@@ -18,6 +27,10 @@ class ServusIE(InfoExtractor):
             'description': 'md5:1247204d85783afe3682644398ff2ec4',
             'thumbnail': r're:^https?://.*\.jpg',
         }
+    }, {
+        # old URL schema
+        'url': 'https://www.servus.com/de/p/Die-Gr%C3%BCnen-aus-Sicht-des-Volkes/AA-1T6VBU5PW1W12/',
+        'only_matching': True,
     }, {
         'url': 'https://www.servus.com/at/p/Wie-das-Leben-beginnt/1309984137314-381415152/',
         'only_matching': True,
index a0b09f5b1747e3b1b9b3a17b10d482b47512143a..ff6be0b545a79260341ec25e6059d306fe859ef8 100644 (file)
--- a/youtube_dl/extractor/soundcloud.py
+++ b/youtube_dl/extractor/soundcloud.py
@@ -27,6 +27,7 @@ from ..utils import (
     unified_timestamp,
     update_url_query,
     url_or_none,
+    urlhandle_detect_ext,
 )
 
 
@@ -96,7 +97,7 @@ class SoundcloudIE(InfoExtractor):
                 'repost_count': int,
             }
         },
-        # not streamable song, preview
+        # geo-restricted
         {
             'url': 'https://soundcloud.com/the-concept-band/goldrushed-mastered?in=the-concept-band/sets/the-royal-concept-ep',
             'info_dict': {
@@ -108,17 +109,13 @@ class SoundcloudIE(InfoExtractor):
                 'uploader_id': '9615865',
                 'timestamp': 1337635207,
                 'upload_date': '20120521',
-                'duration': 30,
+                'duration': 227.155,
                 'license': 'all-rights-reserved',
                 'view_count': int,
                 'like_count': int,
                 'comment_count': int,
                 'repost_count': int,
             },
-            'params': {
-                # rtmp
-                'skip_download': True,
-            },
         },
         # private link
         {
@@ -229,7 +226,6 @@ class SoundcloudIE(InfoExtractor):
                 'skip_download': True,
             },
         },
-        # not available via api.soundcloud.com/i1/tracks/id/streams
         {
             'url': 'https://soundcloud.com/giovannisarani/mezzo-valzer',
             'md5': 'e22aecd2bc88e0e4e432d7dcc0a1abf7',
@@ -238,7 +234,7 @@ class SoundcloudIE(InfoExtractor):
                 'ext': 'mp3',
                 'title': 'Mezzo Valzer',
                 'description': 'md5:4138d582f81866a530317bae316e8b61',
-                'uploader': 'Giovanni Sarani',
+                'uploader': 'Micronie',
                 'uploader_id': '3352531',
                 'timestamp': 1551394171,
                 'upload_date': '20190228',
@@ -250,11 +246,9 @@ class SoundcloudIE(InfoExtractor):
                 'comment_count': int,
                 'repost_count': int,
             },
-            'expected_warnings': ['Unable to download JSON metadata'],
         }
     ]
 
-    _API_BASE = 'https://api.soundcloud.com/'
     _API_V2_BASE = 'https://api-v2.soundcloud.com/'
     _BASE_URL = 'https://soundcloud.com/'
     _IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg'
@@ -316,10 +310,9 @@ class SoundcloudIE(InfoExtractor):
     def _resolv_url(cls, url):
         return SoundcloudIE._API_V2_BASE + 'resolve?url=' + url
 
-    def _extract_info_dict(self, info, full_title=None, secret_token=None, version=2):
+    def _extract_info_dict(self, info, full_title=None, secret_token=None):
         track_id = compat_str(info['id'])
         title = info['title']
-        track_base_url = self._API_BASE + 'tracks/%s' % track_id
 
         format_urls = set()
         formats = []
@@ -328,21 +321,22 @@ class SoundcloudIE(InfoExtractor):
             query['secret_token'] = secret_token
 
         if info.get('downloadable') and info.get('has_downloads_left'):
-            format_url = update_url_query(
-                info.get('download_url') or track_base_url + '/download', query)
-            format_urls.add(format_url)
-            if version == 2:
-                v1_info = self._download_json(
-                    track_base_url, track_id, query=query, fatal=False) or {}
-            else:
-                v1_info = info
-            formats.append({
-                'format_id': 'download',
-                'ext': v1_info.get('original_format') or 'mp3',
-                'filesize': int_or_none(v1_info.get('original_content_size')),
-                'url': format_url,
-                'preference': 10,
-            })
+            download_url = update_url_query(
+                self._API_V2_BASE + 'tracks/' + track_id + '/download', query)
+            redirect_url = (self._download_json(download_url, track_id, fatal=False) or {}).get('redirectUri')
+            if redirect_url:
+                urlh = self._request_webpage(
+                    HEADRequest(redirect_url), track_id, fatal=False)
+                if urlh:
+                    format_url = urlh.geturl()
+                    format_urls.add(format_url)
+                    formats.append({
+                        'format_id': 'download',
+                        'ext': urlhandle_detect_ext(urlh) or 'mp3',
+                        'filesize': int_or_none(urlh.headers.get('Content-Length')),
+                        'url': format_url,
+                        'preference': 10,
+                    })
 
         def invalid_url(url):
             return not url or url in format_urls
@@ -406,42 +400,11 @@ class SoundcloudIE(InfoExtractor):
             }, 'http' if protocol == 'progressive' else protocol,
                 t.get('snipped') or '/preview/' in format_url)
 
-        if not formats:
-            # Old API, does not work for some tracks (e.g.
-            # https://soundcloud.com/giovannisarani/mezzo-valzer)
-            # and might serve preview URLs (e.g.
-            # http://www.soundcloud.com/snbrn/ele)
-            format_dict = self._download_json(
-                track_base_url + '/streams', track_id,
-                'Downloading track url', query=query, fatal=False) or {}
-
-            for key, stream_url in format_dict.items():
-                if invalid_url(stream_url):
-                    continue
-                format_urls.add(stream_url)
-                mobj = re.search(r'(http|hls)_([^_]+)_(\d+)_url', key)
-                if mobj:
-                    protocol, ext, abr = mobj.groups()
-                    add_format({
-                        'abr': abr,
-                        'ext': ext,
-                        'url': stream_url,
-                    }, protocol)
-
-        if not formats:
-            # We fallback to the stream_url in the original info, this
-            # cannot be always used, sometimes it can give an HTTP 404 error
-            urlh = self._request_webpage(
-                HEADRequest(info.get('stream_url') or track_base_url + '/stream'),
-                track_id, query=query, fatal=False)
-            if urlh:
-                stream_url = urlh.geturl()
-                if not invalid_url(stream_url):
-                    add_format({'url': stream_url}, 'http')
-
         for f in formats:
             f['vcodec'] = 'none'
 
+        if not formats and info.get('policy') == 'BLOCK':
+            self.raise_geo_restricted()
         self._sort_formats(formats)
 
         user = info.get('user') or {}
@@ -511,20 +474,24 @@ class SoundcloudIE(InfoExtractor):
                 resolve_title += '/%s' % token
             info_json_url = self._resolv_url(self._BASE_URL + resolve_title)
 
-        version = 2
         info = self._download_json(
-            info_json_url, full_title, 'Downloading info JSON', query=query, fatal=False)
-        if not info:
-            info = self._download_json(
-                info_json_url.replace(self._API_V2_BASE, self._API_BASE),
-                full_title, 'Downloading info JSON', query=query)
-            version = 1
+            info_json_url, full_title, 'Downloading info JSON', query=query)
 
-        return self._extract_info_dict(info, full_title, token, version)
+        return self._extract_info_dict(info, full_title, token)
 
 
 class SoundcloudPlaylistBaseIE(SoundcloudIE):
-    def _extract_track_entries(self, tracks, token=None):
+    def _extract_set(self, playlist, token=None):
+        playlist_id = compat_str(playlist['id'])
+        tracks = playlist.get('tracks') or []
+        if not all([t.get('permalink_url') for t in tracks]) and token:
+            tracks = self._download_json(
+                self._API_V2_BASE + 'tracks', playlist_id,
+                'Downloading tracks', query={
+                    'ids': ','.join([compat_str(t['id']) for t in tracks]),
+                    'playlistId': playlist_id,
+                    'playlistSecretToken': token,
+                })
         entries = []
         for track in tracks:
             track_id = str_or_none(track.get('id'))
@@ -537,7 +504,10 @@ class SoundcloudPlaylistBaseIE(SoundcloudIE):
                     url += '?secret_token=' + token
             entries.append(self.url_result(
                 url, SoundcloudIE.ie_key(), track_id))
-        return entries
+        return self.playlist_result(
+            entries, playlist_id,
+            playlist.get('title'),
+            playlist.get('description'))
 
 
 class SoundcloudSetIE(SoundcloudPlaylistBaseIE):
@@ -548,6 +518,7 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE):
         'info_dict': {
             'id': '2284613',
             'title': 'The Royal Concept EP',
+            'description': 'md5:71d07087c7a449e8941a70a29e34671e',
         },
         'playlist_mincount': 5,
     }, {
@@ -570,13 +541,10 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE):
             msgs = (compat_str(err['error_message']) for err in info['errors'])
             raise ExtractorError('unable to download video webpage: %s' % ','.join(msgs))
 
-        entries = self._extract_track_entries(info['tracks'], token)
-
-        return self.playlist_result(
-            entries, str_or_none(info.get('id')), info.get('title'))
+        return self._extract_set(info, token)
 
 
-class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE):
+class SoundcloudPagedPlaylistBaseIE(SoundcloudIE):
     def _extract_playlist(self, base_url, playlist_id, playlist_title):
         COMMON_QUERY = {
             'limit': 2000000000,
@@ -774,10 +742,7 @@ class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE):
             self._API_V2_BASE + 'playlists/' + playlist_id,
             playlist_id, 'Downloading playlist', query=query)
 
-        entries = self._extract_track_entries(data['tracks'], token)
-
-        return self.playlist_result(
-            entries, playlist_id, data.get('title'), data.get('description'))
+        return self._extract_set(data, token)
 
 
 class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE):
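
The reworked download format above follows the v2 API's redirectUri with a HEAD request and then reads the final URL, extension and size off the response handle (urlhandle_detect_ext inspects headers such as Content-Type). The HEAD-then-inspect pattern in isolation, Python 3 stdlib only and against a neutral URL; youtube_dl.utils.HEADRequest is the same kind of subclass:

    import urllib.request

    class HEADRequest(urllib.request.Request):
        # Force a HEAD request so only headers are fetched, no body.
        def get_method(self):
            return 'HEAD'

    urlh = urllib.request.urlopen(HEADRequest('https://example.com/'))
    print(urlh.geturl())                        # final URL after redirects
    print(urlh.headers.get('Content-Type'))     # basis for guessing the ext
    print(urlh.headers.get('Content-Length'))   # reported as filesize
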
index a3c35a899a2186f1e937771cd0e34df408b2d361..378fc75686313f92a846aaa30579049e9a29eccc 100644 (file)
--- a/youtube_dl/extractor/sportdeutschland.py
+++ b/youtube_dl/extractor/sportdeutschland.py
@@ -13,36 +13,18 @@ from ..utils import (
 class SportDeutschlandIE(InfoExtractor):
     _VALID_URL = r'https?://sportdeutschland\.tv/(?P<sport>[^/?#]+)/(?P<id>[^?#/]+)(?:$|[?#])'
     _TESTS = [{
-        'url': 'http://sportdeutschland.tv/badminton/live-li-ning-badminton-weltmeisterschaft-2014-kopenhagen',
+        'url': 'https://sportdeutschland.tv/badminton/re-live-deutsche-meisterschaften-2020-halbfinals?playlistId=0',
         'info_dict': {
-            'id': 'live-li-ning-badminton-weltmeisterschaft-2014-kopenhagen',
+            'id': 're-live-deutsche-meisterschaften-2020-halbfinals',
             'ext': 'mp4',
-            'title': 're:Li-Ning Badminton Weltmeisterschaft 2014 Kopenhagen',
-            'categories': ['Badminton'],
+            'title': 're:Re-live: Deutsche Meisterschaften 2020.*Halbfinals',
+            'categories': ['Badminton-Deutschland'],
             'view_count': int,
-            'thumbnail': r're:^https?://.*\.jpg$',
-            'description': r're:Die Badminton-WM 2014 aus Kopenhagen bei Sportdeutschland\.TV',
+            'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
             'timestamp': int,
-            'upload_date': 're:^201408[23][0-9]$',
+            'upload_date': '20200201',
+            'description': 're:.*',  # meaningless description for THIS video
         },
-        'params': {
-            'skip_download': 'Live stream',
-        },
-    }, {
-        'url': 'http://sportdeutschland.tv/li-ning-badminton-wm-2014/lee-li-ning-badminton-weltmeisterschaft-2014-kopenhagen-herren-einzel-wei-vs',
-        'info_dict': {
-            'id': 'lee-li-ning-badminton-weltmeisterschaft-2014-kopenhagen-herren-einzel-wei-vs',
-            'ext': 'mp4',
-            'upload_date': '20140825',
-            'description': 'md5:60a20536b57cee7d9a4ec005e8687504',
-            'timestamp': 1408976060,
-            'duration': 2732,
-            'title': 'Li-Ning Badminton Weltmeisterschaft 2014 Kopenhagen: Herren Einzel, Wei Lee vs. Keun Lee',
-            'thumbnail': r're:^https?://.*\.jpg$',
-            'view_count': int,
-            'categories': ['Li-Ning Badminton WM 2014'],
-
-        }
     }]
 
     def _real_extract(self, url):
@@ -50,7 +32,7 @@ class SportDeutschlandIE(InfoExtractor):
         video_id = mobj.group('id')
         sport_id = mobj.group('sport')
 
-        api_url = 'http://proxy.vidibusdynamic.net/sportdeutschland.tv/api/permalinks/%s/%s?access_token=true' % (
+        api_url = 'https://proxy.vidibusdynamic.net/ssl/backend.sportdeutschland.tv/api/permalinks/%s/%s?access_token=true' % (
             sport_id, video_id)
         req = sanitized_Request(api_url, headers={
             'Accept': 'application/vnd.vidibus.v2.html+json',
index 0901c3163e6cab4723d451b30df8574e359ba899..e12389cad80a83612e10d052b7f36bceba0f1fbf 100644 (file)
--- a/youtube_dl/extractor/svt.py
+++ b/youtube_dl/extractor/svt.py
@@ -4,19 +4,14 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_parse_qs,
-    compat_urllib_parse_urlparse,
-)
+from ..compat import compat_str
 from ..utils import (
     determine_ext,
     dict_get,
     int_or_none,
-    orderedSet,
+    str_or_none,
     strip_or_none,
     try_get,
-    urljoin,
-    compat_str,
 )
 
 
@@ -237,23 +232,23 @@ class SVTPlayIE(SVTPlayBaseIE):
 
 
 class SVTSeriesIE(SVTPlayBaseIE):
-    _VALID_URL = r'https?://(?:www\.)?svtplay\.se/(?P<id>[^/?&#]+)'
+    _VALID_URL = r'https?://(?:www\.)?svtplay\.se/(?P<id>[^/?&#]+)(?:.+?\btab=(?P<season_slug>[^&#]+))?'
     _TESTS = [{
         'url': 'https://www.svtplay.se/rederiet',
         'info_dict': {
-            'id': 'rederiet',
+            'id': '14445680',
             'title': 'Rederiet',
-            'description': 'md5:505d491a58f4fcf6eb418ecab947e69e',
+            'description': 'md5:d9fdfff17f5d8f73468176ecd2836039',
         },
         'playlist_mincount': 318,
     }, {
-        'url': 'https://www.svtplay.se/rederiet?tab=sasong2',
+        'url': 'https://www.svtplay.se/rederiet?tab=season-2-14445680',
         'info_dict': {
-            'id': 'rederiet-sasong2',
+            'id': 'season-2-14445680',
             'title': 'Rederiet - Säsong 2',
-            'description': 'md5:505d491a58f4fcf6eb418ecab947e69e',
+            'description': 'md5:d9fdfff17f5d8f73468176ecd2836039',
         },
-        'playlist_count': 12,
+        'playlist_mincount': 12,
     }]
 
     @classmethod
@@ -261,83 +256,87 @@ class SVTSeriesIE(SVTPlayBaseIE):
         return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super(SVTSeriesIE, cls).suitable(url)
 
     def _real_extract(self, url):
-        series_id = self._match_id(url)
-
-        qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
-        season_slug = qs.get('tab', [None])[0]
-
-        if season_slug:
-            series_id += '-%s' % season_slug
-
-        webpage = self._download_webpage(
-            url, series_id, 'Downloading series page')
-
-        root = self._parse_json(
-            self._search_regex(
-                self._SVTPLAY_RE, webpage, 'content', group='json'),
-            series_id)
+        series_slug, season_id = re.match(self._VALID_URL, url).groups()
+
+        series = self._download_json(
+            'https://api.svt.se/contento/graphql', series_slug,
+            'Downloading series page', query={
+                'query': '''{
+  listablesBySlug(slugs: ["%s"]) {
+    associatedContent(include: [productionPeriod, season]) {
+      items {
+        item {
+          ... on Episode {
+            videoSvtId
+          }
+        }
+      }
+      id
+      name
+    }
+    id
+    longDescription
+    name
+    shortDescription
+  }
+}''' % series_slug,
+            })['data']['listablesBySlug'][0]
 
         season_name = None
 
         entries = []
-        for season in root['relatedVideoContent']['relatedVideosAccordion']:
+        for season in series['associatedContent']:
             if not isinstance(season, dict):
                 continue
-            if season_slug:
-                if season.get('slug') != season_slug:
+            if season_id:
+                if season.get('id') != season_id:
                     continue
                 season_name = season.get('name')
-            videos = season.get('videos')
-            if not isinstance(videos, list):
+            items = season.get('items')
+            if not isinstance(items, list):
                 continue
-            for video in videos:
-                content_url = video.get('contentUrl')
-                if not content_url or not isinstance(content_url, compat_str):
+            for item in items:
+                video = item.get('item') or {}
+                content_id = video.get('videoSvtId')
+                if not content_id or not isinstance(content_id, compat_str):
                     continue
-                entries.append(
-                    self.url_result(
-                        urljoin(url, content_url),
-                        ie=SVTPlayIE.ie_key(),
-                        video_title=video.get('title')
-                    ))
-
-        metadata = root.get('metaData')
-        if not isinstance(metadata, dict):
-            metadata = {}
+                entries.append(self.url_result(
+                    'svt:' + content_id, SVTPlayIE.ie_key(), content_id))
 
-        title = metadata.get('title')
-        season_name = season_name or season_slug
+        title = series.get('name')
+        season_name = season_name or season_id
 
         if title and season_name:
             title = '%s - %s' % (title, season_name)
-        elif season_slug:
-            title = season_slug
+        elif season_id:
+            title = season_id
 
         return self.playlist_result(
-            entries, series_id, title, metadata.get('description'))
+            entries, season_id or series.get('id'), title,
+            dict_get(series, ('longDescription', 'shortDescription')))
 
 
 class SVTPageIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?svt\.se/(?:[^/]+/)*(?P<id>[^/?&#]+)'
+    _VALID_URL = r'https?://(?:www\.)?svt\.se/(?P<path>(?:[^/]+/)*(?P<id>[^/?&#]+))'
     _TESTS = [{
-        'url': 'https://www.svt.se/sport/oseedat/guide-sommartraningen-du-kan-gora-var-och-nar-du-vill',
+        'url': 'https://www.svt.se/sport/ishockey/bakom-masken-lehners-kamp-mot-mental-ohalsa',
         'info_dict': {
-            'id': 'guide-sommartraningen-du-kan-gora-var-och-nar-du-vill',
-            'title': 'GUIDE: Sommarträning du kan göra var och när du vill',
+            'id': '25298267',
+            'title': 'Bakom masken – Lehners kamp mot mental ohälsa',
         },
-        'playlist_count': 7,
+        'playlist_count': 4,
     }, {
-        'url': 'https://www.svt.se/nyheter/inrikes/ebba-busch-thor-kd-har-delvis-ratt-om-no-go-zoner',
+        'url': 'https://www.svt.se/nyheter/utrikes/svenska-andrea-ar-en-mil-fran-branderna-i-kalifornien',
         'info_dict': {
-            'id': 'ebba-busch-thor-kd-har-delvis-ratt-om-no-go-zoner',
-            'title': 'Ebba Busch Thor har bara delvis rätt om ”no-go-zoner”',
+            'id': '24243746',
+            'title': 'Svenska Andrea redo att fly sitt hem i Kalifornien',
         },
-        'playlist_count': 1,
+        'playlist_count': 2,
     }, {
         # only programTitle
         'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun',
         'info_dict': {
-            'id': '2900353',
+            'id': '8439V2K',
             'ext': 'mp4',
             'title': 'Stjärnorna skojar till det - under SVT-intervjun',
             'duration': 27,
@@ -356,16 +355,26 @@ class SVTPageIE(InfoExtractor):
         return False if SVTIE.suitable(url) else super(SVTPageIE, cls).suitable(url)
 
     def _real_extract(self, url):
-        playlist_id = self._match_id(url)
+        path, display_id = re.match(self._VALID_URL, url).groups()
 
-        webpage = self._download_webpage(url, playlist_id)
+        article = self._download_json(
+            'https://api.svt.se/nss-api/page/' + path, display_id,
+            query={'q': 'articles'})['articles']['content'][0]
 
-        entries = [
-            self.url_result(
-                'svt:%s' % video_id, ie=SVTPlayIE.ie_key(), video_id=video_id)
-            for video_id in orderedSet(re.findall(
-                r'data-video-id=["\'](\d+)', webpage))]
+        entries = []
 
-        title = strip_or_none(self._og_search_title(webpage, default=None))
+        def _process_content(content):
+            if content.get('_type') in ('VIDEOCLIP', 'VIDEOEPISODE'):
+                video_id = compat_str(content['image']['svtId'])
+                entries.append(self.url_result(
+                    'svt:' + video_id, SVTPlayIE.ie_key(), video_id))
 
-        return self.playlist_result(entries, playlist_id, title)
+        for media in article.get('media', []):
+            _process_content(media)
+
+        for obj in article.get('structuredBody', []):
+            _process_content(obj.get('content') or {})
+
+        return self.playlist_result(
+            entries, str_or_none(article.get('id')),
+            strip_or_none(article.get('title')))
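
SVTSeriesIE now resolves series metadata through SVT's contento GraphQL endpoint, sending the query as a plain GET parameter, which is what _download_json's query= argument does under the hood. A trimmed standalone request (the reduced field selection is an assumption on my part; the schema is SVT's and may change):

    import json
    import urllib.parse
    import urllib.request

    # Slug taken from the 'rederiet' test above; fields are a subset of
    # the query in the extractor.
    query = '{ listablesBySlug(slugs: ["rederiet"]) { id name shortDescription } }'
    url = ('https://api.svt.se/contento/graphql?'
           + urllib.parse.urlencode({'query': query}))
    with urllib.request.urlopen(url) as resp:
        data = json.load(resp)
    print(data['data']['listablesBySlug'][0]['name'])  # expected: 'Rederiet'
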
index 6b7f13b437e3f3671b355b7dabcea9fd5056969e..a75369dbe8a3582595ae339d58887eaefd220536 100644 (file)
--- a/youtube_dl/extractor/teachable.py
+++ b/youtube_dl/extractor/teachable.py
@@ -4,11 +4,12 @@ import re
 
 from .common import InfoExtractor
 from .wistia import WistiaIE
-from ..compat import compat_str
 from ..utils import (
     clean_html,
     ExtractorError,
+    int_or_none,
     get_element_by_class,
+    strip_or_none,
     urlencode_postdata,
     urljoin,
 )
@@ -20,8 +21,8 @@ class TeachableBaseIE(InfoExtractor):
 
     _SITES = {
         # Only notable ones here
-        'upskillcourses.com': 'upskill',
-        'academy.gns3.com': 'gns3',
+        'v1.upskillcourses.com': 'upskill',
+        'gns3.teachable.com': 'gns3',
         'academyhacker.com': 'academyhacker',
         'stackskills.com': 'stackskills',
         'market.saleshacker.com': 'saleshacker',
@@ -58,7 +59,7 @@ class TeachableBaseIE(InfoExtractor):
             self._logged_in = True
             return
 
-        login_url = compat_str(urlh.geturl())
+        login_url = urlh.geturl()
 
         login_form = self._hidden_inputs(login_page)
 
@@ -110,27 +111,29 @@ class TeachableIE(TeachableBaseIE):
                     ''' % TeachableBaseIE._VALID_URL_SUB_TUPLE
 
     _TESTS = [{
-        'url': 'http://upskillcourses.com/courses/essential-web-developer-course/lectures/1747100',
+        'url': 'https://gns3.teachable.com/courses/gns3-certified-associate/lectures/6842364',
         'info_dict': {
-            'id': 'uzw6zw58or',
-            'ext': 'mp4',
-            'title': 'Welcome to the Course!',
-            'description': 'md5:65edb0affa582974de4625b9cdea1107',
-            'duration': 138.763,
-            'timestamp': 1479846621,
-            'upload_date': '20161122',
+            'id': 'untlgzk1v7',
+            'ext': 'bin',
+            'title': 'Overview',
+            'description': 'md5:071463ff08b86c208811130ea1c2464c',
+            'duration': 736.4,
+            'timestamp': 1542315762,
+            'upload_date': '20181115',
+            'chapter': 'Welcome',
+            'chapter_number': 1,
         },
         'params': {
             'skip_download': True,
         },
     }, {
-        'url': 'http://upskillcourses.com/courses/119763/lectures/1747100',
+        'url': 'http://v1.upskillcourses.com/courses/119763/lectures/1747100',
         'only_matching': True,
     }, {
-        'url': 'https://academy.gns3.com/courses/423415/lectures/6885939',
+        'url': 'https://gns3.teachable.com/courses/423415/lectures/6885939',
         'only_matching': True,
     }, {
-        'url': 'teachable:https://upskillcourses.com/courses/essential-web-developer-course/lectures/1747100',
+        'url': 'teachable:https://v1.upskillcourses.com/courses/essential-web-developer-course/lectures/1747100',
         'only_matching': True,
     }]
 
@@ -160,8 +163,8 @@ class TeachableIE(TeachableBaseIE):
 
         webpage = self._download_webpage(url, video_id)
 
-        wistia_url = WistiaIE._extract_url(webpage)
-        if not wistia_url:
+        wistia_urls = WistiaIE._extract_urls(webpage)
+        if not wistia_urls:
             if any(re.search(p, webpage) for p in (
                     r'class=["\']lecture-contents-locked',
                     r'>\s*Lecture contents locked',
@@ -174,12 +177,37 @@ class TeachableIE(TeachableBaseIE):
 
         title = self._og_search_title(webpage, default=None)
 
-        return {
+        chapter = None
+        chapter_number = None
+        section_item = self._search_regex(
+            r'(?s)(?P<li><li[^>]+\bdata-lecture-id=["\']%s[^>]+>.+?</li>)' % video_id,
+            webpage, 'section item', default=None, group='li')
+        if section_item:
+            chapter_number = int_or_none(self._search_regex(
+                r'data-ss-position=["\'](\d+)', section_item, 'section id',
+                default=None))
+            if chapter_number is not None:
+                sections = []
+                for s in re.findall(
+                        r'(?s)<div[^>]+\bclass=["\']section-title[^>]+>(.+?)</div>', webpage):
+                    section = strip_or_none(clean_html(s))
+                    if not section:
+                        sections = []
+                        break
+                    sections.append(section)
+                if chapter_number <= len(sections):
+                    chapter = sections[chapter_number - 1]
+
+        entries = [{
             '_type': 'url_transparent',
             'url': wistia_url,
             'ie_key': WistiaIE.ie_key(),
             'title': title,
-        }
+            'chapter': chapter,
+            'chapter_number': chapter_number,
+        } for wistia_url in wistia_urls]
+
+        return self.playlist_result(entries, video_id, title)
 
 
 class TeachableCourseIE(TeachableBaseIE):
@@ -191,20 +219,20 @@ class TeachableCourseIE(TeachableBaseIE):
                         /(?:courses|p)/(?:enrolled/)?(?P<id>[^/?#&]+)
                     ''' % TeachableBaseIE._VALID_URL_SUB_TUPLE
     _TESTS = [{
-        'url': 'http://upskillcourses.com/courses/essential-web-developer-course/',
+        'url': 'http://v1.upskillcourses.com/courses/essential-web-developer-course/',
         'info_dict': {
             'id': 'essential-web-developer-course',
             'title': 'The Essential Web Developer Course (Free)',
         },
         'playlist_count': 192,
     }, {
-        'url': 'http://upskillcourses.com/courses/119763/',
+        'url': 'http://v1.upskillcourses.com/courses/119763/',
         'only_matching': True,
     }, {
-        'url': 'http://upskillcourses.com/courses/enrolled/119763',
+        'url': 'http://v1.upskillcourses.com/courses/enrolled/119763',
         'only_matching': True,
     }, {
-        'url': 'https://academy.gns3.com/courses/enrolled/423415',
+        'url': 'https://gns3.teachable.com/courses/enrolled/423415',
         'only_matching': True,
     }, {
         'url': 'teachable:https://learn.vrdev.school/p/gear-vr-developer-mini',
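
What the teachable.py change above amounts to: a lecture page no longer resolves to a single Wistia embed but to a playlist, with one url_transparent entry per embed, each carrying the lecture's chapter metadata. A minimal sketch of that pattern (build_entries and the sample values are illustrative stand-ins, not extractor code):

# Sketch of the one-entry-per-embed pattern; names and values are hypothetical.
def build_entries(wistia_urls, title, chapter, chapter_number):
    return [{
        '_type': 'url_transparent',
        'url': wistia_url,
        'ie_key': 'Wistia',
        'title': title,
        'chapter': chapter,
        'chapter_number': chapter_number,
    } for wistia_url in wistia_urls]

entries = build_entries(['wistia:untlgzk1v7'], 'Overview', 'Welcome', 1)
assert entries[0]['chapter_number'] == 1
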
index d37e1b0557cf3ba241a25e7e56d28c8dc679b1d0..9ba3da341dac65d18a599a790bff9c95b0e52eb8 100644 (file)
--- a/youtube_dl/extractor/telecinco.py
+++ b/youtube_dl/extractor/telecinco.py
@@ -11,6 +11,7 @@ from ..utils import (
     determine_ext,
     int_or_none,
     str_or_none,
+    try_get,
     urljoin,
 )
 
@@ -24,7 +25,7 @@ class TelecincoIE(InfoExtractor):
         'info_dict': {
             'id': '1876350223',
             'title': 'Bacalao con kokotxas al pil-pil',
-            'description': 'md5:1382dacd32dd4592d478cbdca458e5bb',
+            'description': 'md5:716caf5601e25c3c5ab6605b1ae71529',
         },
         'playlist': [{
             'md5': 'adb28c37238b675dad0f042292f209a7',
@@ -55,6 +56,26 @@ class TelecincoIE(InfoExtractor):
             'description': 'md5:2771356ff7bfad9179c5f5cd954f1477',
             'duration': 50,
         },
+    }, {
+        # video in opening's content
+        'url': 'https://www.telecinco.es/vivalavida/fiorella-sobrina-edmundo-arrocet-entrevista_18_2907195140.html',
+        'info_dict': {
+            'id': '2907195140',
+            'title': 'La surrealista entrevista a la sobrina de Edmundo Arrocet: "No puedes venir aquí y tomarnos por tontos"',
+            'description': 'md5:73f340a7320143d37ab895375b2bf13a',
+        },
+        'playlist': [{
+            'md5': 'adb28c37238b675dad0f042292f209a7',
+            'info_dict': {
+                'id': 'TpI2EttSDAReWpJ1o0NVh2',
+                'ext': 'mp4',
+                'title': 'La surrealista entrevista a la sobrina de Edmundo Arrocet: "No puedes venir aquí y tomarnos por tontos"',
+                'duration': 1015,
+            },
+        }],
+        'params': {
+            'skip_download': True,
+        },
     }, {
         'url': 'http://www.telecinco.es/informativos/nacional/Pablo_Iglesias-Informativos_Telecinco-entrevista-Pedro_Piqueras_2_1945155182.html',
         'only_matching': True,
@@ -135,17 +156,28 @@ class TelecincoIE(InfoExtractor):
         display_id = self._match_id(url)
         webpage = self._download_webpage(url, display_id)
         article = self._parse_json(self._search_regex(
-            r'window\.\$REACTBASE_STATE\.article\s*=\s*({.+})',
+            r'window\.\$REACTBASE_STATE\.article(?:_multisite)?\s*=\s*({.+})',
             webpage, 'article'), display_id)['article']
         title = article.get('title')
-        description = clean_html(article.get('leadParagraph'))
+        description = clean_html(article.get('leadParagraph')) or ''
         if article.get('editorialType') != 'VID':
             entries = []
-            for p in article.get('body', []):
+            body = [article.get('opening')]
+            body.extend(try_get(article, lambda x: x['body'], list) or [])
+            for p in body:
+                if not isinstance(p, dict):
+                    continue
                 content = p.get('content')
-                if p.get('type') != 'video' or not content:
+                if not content:
+                    continue
+                type_ = p.get('type')
+                if type_ == 'paragraph':
+                    content_str = str_or_none(content)
+                    if content_str:
+                        description += content_str
                     continue
-                entries.append(self._parse_content(content, url))
+                if type_ == 'video' and isinstance(content, dict):
+                    entries.append(self._parse_content(content, url))
             return self.playlist_result(
                 entries, str_or_none(article.get('id')), title, description)
         content = article['opening']['content']
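
The loop above now also visits the article's opening block: 'paragraph' blocks are folded into the description, and only 'video' blocks become playlist entries. A self-contained sketch of that walk (collect and parse_video are hypothetical stand-ins for the extractor internals):

# Sketch of the opening+body walk; parse_video stands in for
# TelecincoIE._parse_content and is passed in for illustration.
def collect(article, parse_video):
    description = article.get('leadParagraph') or ''
    entries = []
    body = [article.get('opening')]
    body.extend(article.get('body') or [])
    for p in body:
        if not isinstance(p, dict):
            continue
        content = p.get('content')
        if not content:
            continue
        if p.get('type') == 'paragraph' and isinstance(content, str):
            description += content
        elif p.get('type') == 'video' and isinstance(content, dict):
            entries.append(parse_video(content))
    return entries, description

entries, desc = collect(
    {'opening': {'type': 'video', 'content': {'id': 'x'}},
     'body': [{'type': 'paragraph', 'content': 'Some text.'}]},
    parse_video=lambda content: content)
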
index ae9f66787439462967baa63dc58f39870fb89382..c82c94b3a0009da2cf0938c92910feec84de018b 100644 (file)
--- a/youtube_dl/extractor/telequebec.py
+++ b/youtube_dl/extractor/telequebec.py
@@ -38,8 +38,6 @@ class TeleQuebecIE(TeleQuebecBaseIE):
             'ext': 'mp4',
             'title': 'Un petit choc et puis repart!',
             'description': 'md5:b04a7e6b3f74e32d7b294cffe8658374',
-            'upload_date': '20180222',
-            'timestamp': 1519326631,
         },
         'params': {
             'skip_download': True,
index 0e2370cd828f78a2e1a708852a392a05d96e3039..0631cb7aba8a7068a291fb8e67de0d5e04acf482 100644 (file)
--- a/youtube_dl/extractor/tfo.py
+++ b/youtube_dl/extractor/tfo.py
@@ -17,14 +17,12 @@ class TFOIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?tfo\.org/(?:en|fr)/(?:[^/]+/){2}(?P<id>\d+)'
     _TEST = {
         'url': 'http://www.tfo.org/en/universe/tfo-247/100463871/video-game-hackathon',
-        'md5': '47c987d0515561114cf03d1226a9d4c7',
+        'md5': 'cafbe4f47a8dae0ca0159937878100d6',
         'info_dict': {
-            'id': '100463871',
+            'id': '7da3d50e495c406b8fc0b997659cc075',
             'ext': 'mp4',
             'title': 'Video Game Hackathon',
             'description': 'md5:558afeba217c6c8d96c60e5421795c07',
-            'upload_date': '20160212',
-            'timestamp': 1455310233,
         }
     }
 
index 6ab147ad726306ba9250599d34491a50e64e82d0..387f955eee5752ac8797c85375070ba77a897075 100644 (file)
--- a/youtube_dl/extractor/thisoldhouse.py
+++ b/youtube_dl/extractor/thisoldhouse.py
@@ -2,43 +2,42 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..compat import compat_str
-from ..utils import try_get
 
 
 class ThisOldHouseIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?thisoldhouse\.com/(?:watch|how-to|tv-episode)/(?P<id>[^/?#]+)'
+    _VALID_URL = r'https?://(?:www\.)?thisoldhouse\.com/(?:watch|how-to|tv-episode|(?:[^/]+/)?\d+)/(?P<id>[^/?#]+)'
     _TESTS = [{
         'url': 'https://www.thisoldhouse.com/how-to/how-to-build-storage-bench',
-        'md5': '568acf9ca25a639f0c4ff905826b662f',
         'info_dict': {
-            'id': '2REGtUDQ',
+            'id': '5dcdddf673c3f956ef5db202',
             'ext': 'mp4',
             'title': 'How to Build a Storage Bench',
             'description': 'In the workshop, Tom Silva and Kevin O\'Connor build a storage bench for an entryway.',
             'timestamp': 1442548800,
             'upload_date': '20150918',
-        }
+        },
+        'params': {
+            'skip_download': True,
+        },
     }, {
         'url': 'https://www.thisoldhouse.com/watch/arlington-arts-crafts-arts-and-crafts-class-begins',
         'only_matching': True,
     }, {
         'url': 'https://www.thisoldhouse.com/tv-episode/ask-toh-shelf-rough-electric',
         'only_matching': True,
+    }, {
+        'url': 'https://www.thisoldhouse.com/furniture/21017078/how-to-build-a-storage-bench',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.thisoldhouse.com/21113884/s41-e13-paradise-lost',
+        'only_matching': True,
     }]
+    _ZYPE_TMPL = 'https://player.zype.com/embed/%s.html?api_key=hsOk_yMSPYNrT22e9pu8hihLXjaZf0JW5jsOWv4ZqyHJFvkJn6rtToHl09tbbsbe'
 
     def _real_extract(self, url):
         display_id = self._match_id(url)
         webpage = self._download_webpage(url, display_id)
         video_id = self._search_regex(
-            (r'data-mid=(["\'])(?P<id>(?:(?!\1).)+)\1',
-             r'id=(["\'])inline-video-player-(?P<id>(?:(?!\1).)+)\1'),
-            webpage, 'video id', default=None, group='id')
-        if not video_id:
-            drupal_settings = self._parse_json(self._search_regex(
-                r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);',
-                webpage, 'drupal settings'), display_id)
-            video_id = try_get(
-                drupal_settings, lambda x: x['jwplatform']['video_id'],
-                compat_str) or list(drupal_settings['comScore'])[0]
-        return self.url_result('jwplatform:' + video_id, 'JWPlatform', video_id)
+            r'<iframe[^>]+src=[\'"](?:https?:)?//thisoldhouse\.chorus\.build/videos/zype/([0-9a-f]{24})',
+            webpage, 'video id')
+        return self.url_result(self._ZYPE_TMPL % video_id, 'Zype', video_id)
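
The rewrite drops the JW Platform and Drupal-settings lookups entirely: the page now exposes a 24-hex Zype id in a chorus.build iframe, which is spliced into a Zype player URL. A sketch of that hand-off (the api_key query string from the diff is elided, and the sample HTML is made up):

import re

ZYPE_TMPL = 'https://player.zype.com/embed/%s.html'  # api_key query elided
html = '<iframe src="//thisoldhouse.chorus.build/videos/zype/5dcdddf673c3f956ef5db202">'
video_id = re.search(
    r'//thisoldhouse\.chorus\.build/videos/zype/([0-9a-f]{24})', html).group(1)
assert ZYPE_TMPL % video_id == 'https://player.zype.com/embed/5dcdddf673c3f956ef5db202.html'
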
index 5e5efda0f0780fb98b7c37b788ad2734a837e90d..ca2e36efe4216ad66d46252c662ea4cc5395c3ca 100644 (file)
--- a/youtube_dl/extractor/toggle.py
+++ b/youtube_dl/extractor/toggle.py
@@ -17,9 +17,9 @@ from ..utils import (
 
 class ToggleIE(InfoExtractor):
     IE_NAME = 'toggle'
-    _VALID_URL = r'https?://video\.toggle\.sg/(?:en|zh)/(?:[^/]+/){2,}(?P<id>[0-9]+)'
+    _VALID_URL = r'https?://(?:(?:www\.)?mewatch|video\.toggle)\.sg/(?:en|zh)/(?:[^/]+/){2,}(?P<id>[0-9]+)'
     _TESTS = [{
-        'url': 'http://video.toggle.sg/en/series/lion-moms-tif/trailers/lion-moms-premier/343115',
+        'url': 'http://www.mewatch.sg/en/series/lion-moms-tif/trailers/lion-moms-premier/343115',
         'info_dict': {
             'id': '343115',
             'ext': 'mp4',
@@ -33,7 +33,7 @@ class ToggleIE(InfoExtractor):
         }
     }, {
         'note': 'DRM-protected video',
-        'url': 'http://video.toggle.sg/en/movies/dug-s-special-mission/341413',
+        'url': 'http://www.mewatch.sg/en/movies/dug-s-special-mission/341413',
         'info_dict': {
             'id': '341413',
             'ext': 'wvm',
@@ -48,7 +48,7 @@ class ToggleIE(InfoExtractor):
     }, {
         # this also tests correct video id extraction
         'note': 'm3u8 links are geo-restricted, but Android/mp4 is okay',
-        'url': 'http://video.toggle.sg/en/series/28th-sea-games-5-show/28th-sea-games-5-show-ep11/332861',
+        'url': 'http://www.mewatch.sg/en/series/28th-sea-games-5-show/28th-sea-games-5-show-ep11/332861',
         'info_dict': {
             'id': '332861',
             'ext': 'mp4',
@@ -65,19 +65,22 @@ class ToggleIE(InfoExtractor):
         'url': 'http://video.toggle.sg/en/clips/seraph-sun-aloysius-will-suddenly-sing-some-old-songs-in-high-pitch-on-set/343331',
         'only_matching': True,
     }, {
-        'url': 'http://video.toggle.sg/zh/series/zero-calling-s2-hd/ep13/336367',
+        'url': 'http://www.mewatch.sg/en/clips/seraph-sun-aloysius-will-suddenly-sing-some-old-songs-in-high-pitch-on-set/343331',
         'only_matching': True,
     }, {
-        'url': 'http://video.toggle.sg/en/series/vetri-s2/webisodes/jeeva-is-an-orphan-vetri-s2-webisode-7/342302',
+        'url': 'http://www.mewatch.sg/zh/series/zero-calling-s2-hd/ep13/336367',
         'only_matching': True,
     }, {
-        'url': 'http://video.toggle.sg/en/movies/seven-days/321936',
+        'url': 'http://www.mewatch.sg/en/series/vetri-s2/webisodes/jeeva-is-an-orphan-vetri-s2-webisode-7/342302',
         'only_matching': True,
     }, {
-        'url': 'https://video.toggle.sg/en/tv-show/news/may-2017-cna-singapore-tonight/fri-19-may-2017/512456',
+        'url': 'http://www.mewatch.sg/en/movies/seven-days/321936',
         'only_matching': True,
     }, {
-        'url': 'http://video.toggle.sg/en/channels/eleven-plus/401585',
+        'url': 'https://www.mewatch.sg/en/tv-show/news/may-2017-cna-singapore-tonight/fri-19-may-2017/512456',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.mewatch.sg/en/channels/eleven-plus/401585',
         'only_matching': True,
     }]
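
The _VALID_URL change is a plain domain migration: mewatch.sg is accepted alongside the legacy video.toggle.sg host, and the tests move to the new domain. A quick check of the new alternation (illustrative, outside the extractor):

import re

VALID_URL = r'https?://(?:(?:www\.)?mewatch|video\.toggle)\.sg/(?:en|zh)/(?:[^/]+/){2,}(?P<id>[0-9]+)'
for u in ('http://www.mewatch.sg/en/movies/seven-days/321936',
          'http://video.toggle.sg/en/movies/seven-days/321936'):
    # Old and new hosts should both resolve to the same video id.
    assert re.match(VALID_URL, u).group('id') == '321936'
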
 
index edbb0aa6944ba82b36415875f2d99e570b3373fc..ae584ad697bdf3f460eff033b8f43e75776942ee 100644 (file)
--- a/youtube_dl/extractor/tumblr.py
+++ b/youtube_dl/extractor/tumblr.py
@@ -4,7 +4,6 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_str
 from ..utils import (
     ExtractorError,
     int_or_none,
@@ -151,7 +150,7 @@ class TumblrIE(InfoExtractor):
         url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
         webpage, urlh = self._download_webpage_handle(url, video_id)
 
-        redirect_url = compat_str(urlh.geturl())
+        redirect_url = urlh.geturl()
         if 'tumblr.com/safe-mode' in redirect_url or redirect_url.startswith('/safe-mode'):
             raise ExtractorError(
                 'This Tumblr may contain sensitive media. '
index 611fdc0c6c7002c1669200c7ace75bf498a85c6c..8bda9348d723073b894d2d77b6556b51d89dad80 100644 (file)
--- a/youtube_dl/extractor/tv2dk.py
+++ b/youtube_dl/extractor/tv2dk.py
@@ -106,7 +106,7 @@ class TV2DKBornholmPlayIE(InfoExtractor):
         video_id = self._match_id(url)
 
         video = self._download_json(
-            'http://play.tv2bornholm.dk/controls/AJAX.aspx/specifikVideo', video_id,
+            'https://play.tv2bornholm.dk/controls/AJAX.aspx/specifikVideo', video_id,
             data=json.dumps({
                 'playlist_id': video_id,
                 'serienavn': '',
index 88b6baa316b54eb58e3deb5d69f2fd04c1795bba..b7fe082b9c00e6600b6bf9f6e29849a3fd4eb3b6 100644 (file)
--- a/youtube_dl/extractor/tv5mondeplus.py
+++ b/youtube_dl/extractor/tv5mondeplus.py
@@ -3,31 +3,51 @@ from __future__ import unicode_literals
 
 from .common import InfoExtractor
 from ..utils import (
-    clean_html,
     determine_ext,
     extract_attributes,
-    get_element_by_class,
     int_or_none,
     parse_duration,
-    parse_iso8601,
 )
 
 
 class TV5MondePlusIE(InfoExtractor):
     IE_DESC = 'TV5MONDE+'
-    _VALID_URL = r'https?://(?:www\.)?tv5mondeplus\.com/toutes-les-videos/[^/]+/(?P<id>[^/?#]+)'
-    _TEST = {
-        'url': 'http://www.tv5mondeplus.com/toutes-les-videos/documentaire/tdah-mon-amour-tele-quebec-tdah-mon-amour-ep001-enfants',
-        'md5': '12130fc199f020673138a83466542ec6',
+    _VALID_URL = r'https?://(?:www\.)?(?:tv5mondeplus|revoir\.tv5monde)\.com/toutes-les-videos/[^/]+/(?P<id>[^/?#]+)'
+    _TESTS = [{
+        # movie
+        'url': 'https://revoir.tv5monde.com/toutes-les-videos/cinema/rendez-vous-a-atlit',
+        'md5': '8cbde5ea7b296cf635073e27895e227f',
         'info_dict': {
-            'id': 'tdah-mon-amour-tele-quebec-tdah-mon-amour-ep001-enfants',
+            'id': '822a4756-0712-7329-1859-a13ac7fd1407',
+            'display_id': 'rendez-vous-a-atlit',
             'ext': 'mp4',
-            'title': 'Tdah, mon amour - Enfants',
-            'description': 'md5:230e3aca23115afcf8006d1bece6df74',
-            'upload_date': '20170401',
-            'timestamp': 1491022860,
-        }
-    }
+            'title': 'Rendez-vous à Atlit',
+            'description': 'md5:2893a4c5e1dbac3eedff2d87956e4efb',
+            'upload_date': '20200130',
+        },
+    }, {
+        # series episode
+        'url': 'https://revoir.tv5monde.com/toutes-les-videos/series-fictions/c-est-la-vie-ennemie-juree',
+        'info_dict': {
+            'id': '0df7007c-4900-3936-c601-87a13a93a068',
+            'display_id': 'c-est-la-vie-ennemie-juree',
+            'ext': 'mp4',
+            'title': "C'est la vie - Ennemie jurée",
+            'description': 'md5:dfb5c63087b6f35fe0cc0af4fe44287e',
+            'upload_date': '20200130',
+            'series': "C'est la vie",
+            'episode': 'Ennemie jurée',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://revoir.tv5monde.com/toutes-les-videos/series-fictions/neuf-jours-en-hiver-neuf-jours-en-hiver',
+        'only_matching': True,
+    }, {
+        'url': 'https://revoir.tv5monde.com/toutes-les-videos/info-societe/le-journal-de-la-rts-edition-du-30-01-20-19h30',
+        'only_matching': True,
+    }]
     _GEO_BYPASS = False
 
     def _real_extract(self, url):
@@ -37,11 +57,7 @@ class TV5MondePlusIE(InfoExtractor):
         if ">Ce programme n'est malheureusement pas disponible pour votre zone géographique.<" in webpage:
             self.raise_geo_restricted(countries=['FR'])
 
-        series = get_element_by_class('video-detail__title', webpage)
-        title = episode = get_element_by_class(
-            'video-detail__subtitle', webpage) or series
-        if series and series != title:
-            title = '%s - %s' % (series, title)
+        title = episode = self._html_search_regex(r'<h1>([^<]+)', webpage, 'title')
         vpl_data = extract_attributes(self._search_regex(
             r'(<[^>]+class="video_player_loader"[^>]+>)',
             webpage, 'video player loader'))
@@ -65,15 +81,37 @@ class TV5MondePlusIE(InfoExtractor):
                 })
         self._sort_formats(formats)
 
+        description = self._html_search_regex(
+            r'(?s)<div[^>]+class=["\']episode-texte[^>]+>(.+?)</div>', webpage,
+            'description', fatal=False)
+
+        series = self._html_search_regex(
+            r'<p[^>]+class=["\']episode-emission[^>]+>([^<]+)', webpage,
+            'series', default=None)
+
+        if series and series != title:
+            title = '%s - %s' % (series, title)
+
+        upload_date = self._search_regex(
+            r'(?:date_publication|publish_date)["\']\s*:\s*["\'](\d{4}_\d{2}_\d{2})',
+            webpage, 'upload date', default=None)
+        if upload_date:
+            upload_date = upload_date.replace('_', '')
+
+        video_id = self._search_regex(
+            (r'data-guid=["\']([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})',
+             r'id_contenu["\']\s:\s*(\d+)'), webpage, 'video id',
+            default=display_id)
+
         return {
-            'id': display_id,
+            'id': video_id,
             'display_id': display_id,
             'title': title,
-            'description': clean_html(get_element_by_class('video-detail__description', webpage)),
+            'description': description,
             'thumbnail': vpl_data.get('data-image'),
             'duration': int_or_none(vpl_data.get('data-duration')) or parse_duration(self._html_search_meta('duration', webpage)),
-            'timestamp': parse_iso8601(self._html_search_meta('uploadDate', webpage)),
+            'upload_date': upload_date,
             'formats': formats,
-            'episode': episode,
             'series': series,
+            'episode': episode,
         }
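
Two conversions above are easy to miss: the publish date is embedded in the page as 2020_01_30 and squashed to YYYYMMDD, and the GUID (or numeric id_contenu) falls back to the display id when absent. A sketch of the date normalization (the sample page string is made up):

import re

page = '"date_publication": "2020_01_30"'
m = re.search(
    r'(?:date_publication|publish_date)["\']\s*:\s*["\'](\d{4}_\d{2}_\d{2})',
    page)
# '2020_01_30' -> '20200130', the YYYYMMDD form youtube-dl expects.
upload_date = m.group(1).replace('_', '') if m else None
assert upload_date == '20200130'
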
index 0b863df2ff4ad214162c6187ac7aaa65fe3fc6c9..443f46e8a3537165d620c2db8863634e9f922ab6 100644 (file)
--- a/youtube_dl/extractor/tva.py
+++ b/youtube_dl/extractor/tva.py
@@ -9,8 +9,8 @@ from ..utils import (
 
 
 class TVAIE(InfoExtractor):
-    _VALID_URL = r'https?://videos\.tva\.ca/details/_(?P<id>\d+)'
-    _TEST = {
+    _VALID_URL = r'https?://videos?\.tva\.ca/details/_(?P<id>\d+)'
+    _TESTS = [{
         'url': 'https://videos.tva.ca/details/_5596811470001',
         'info_dict': {
             'id': '5596811470001',
@@ -24,7 +24,10 @@ class TVAIE(InfoExtractor):
             # m3u8 download
             'skip_download': True,
         }
-    }
+    }, {
+        'url': 'https://video.tva.ca/details/_5596811470001',
+        'only_matching': True,
+    }]
     BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5481942443001/default_default/index.html?videoId=%s'
 
     def _real_extract(self, url):
index 2830c212ebbf9c6daa819667cc994001f8dc798a..74d14049b482a702bf464a40f2e5f361dc7cd72a 100644 (file)
--- a/youtube_dl/extractor/twentyfourvideo.py
+++ b/youtube_dl/extractor/twentyfourvideo.py
@@ -17,7 +17,7 @@ class TwentyFourVideoIE(InfoExtractor):
     _VALID_URL = r'''(?x)
                     https?://
                         (?P<host>
-                            (?:(?:www|porno)\.)?24video\.
+                            (?:(?:www|porno?)\.)?24video\.
                             (?:net|me|xxx|sexy?|tube|adult|site|vip)
                         )/
                         (?:
@@ -62,6 +62,9 @@ class TwentyFourVideoIE(InfoExtractor):
     }, {
         'url': 'https://www.24video.vip/video/view/1044982',
         'only_matching': True,
+    }, {
+        'url': 'https://porn.24video.net/video/2640421-vsya-takay',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
index a8c2502af8132834a34b8ef9c8ade935dd432604..0db2dca41c7e7e914c66956987d2f921b81b1791 100644 (file)
--- a/youtube_dl/extractor/twitch.py
+++ b/youtube_dl/extractor/twitch.py
@@ -575,8 +575,8 @@ class TwitchStreamIE(TwitchBaseIE):
         channel_id = self._match_id(url)
 
         stream = self._call_api(
-            'kraken/streams/%s?stream_type=all' % channel_id, channel_id,
-            'Downloading stream JSON').get('stream')
+            'kraken/streams/%s?stream_type=all' % channel_id.lower(),
+            channel_id, 'Downloading stream JSON').get('stream')
 
         if not stream:
             raise ExtractorError('%s is offline' % channel_id, expected=True)
index 851ad936cfc012b02c3125c9a0a3e898f9c6f005..d6b92b1c833072bab2bbacb8503693b6da156427 100644 (file)
--- a/youtube_dl/extractor/viewlift.py
+++ b/youtube_dl/extractor/viewlift.py
@@ -1,28 +1,62 @@
 from __future__ import unicode_literals
 
-import base64
+import json
 import re
 
 from .common import InfoExtractor
-from ..compat import compat_urllib_parse_unquote
+from ..compat import compat_HTTPError
 from ..utils import (
     ExtractorError,
-    clean_html,
-    determine_ext,
     int_or_none,
-    js_to_json,
     parse_age_limit,
-    parse_duration,
-    try_get,
 )
 
 
 class ViewLiftBaseIE(InfoExtractor):
-    _DOMAINS_REGEX = r'(?:(?:main\.)?snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|(?:monumental|lax)sportsnetwork|vayafilm)\.com|hoichoi\.tv'
+    _API_BASE = 'https://prod-api.viewlift.com/'
+    _DOMAINS_REGEX = r'(?:(?:main\.)?snagfilms|snagxtreme|funnyforfree|kiddovid|winnersview|(?:monumental|lax)sportsnetwork|vayafilm|failarmy|ftfnext|lnppass\.legapallacanestro|moviespree|app\.myoutdoortv|neoufitness|pflmma|theidentitytb)\.com|(?:hoichoi|app\.horseandcountry|kronon|marquee|supercrosslive)\.tv'
+    _SITE_MAP = {
+        'ftfnext': 'lax',
+        'funnyforfree': 'snagfilms',
+        'hoichoi': 'hoichoitv',
+        'kiddovid': 'snagfilms',
+        'laxsportsnetwork': 'lax',
+        'legapallacanestro': 'lnp',
+        'marquee': 'marquee-tv',
+        'monumentalsportsnetwork': 'monumental-network',
+        'moviespree': 'bingeflix',
+        'pflmma': 'pfl',
+        'snagxtreme': 'snagfilms',
+        'theidentitytb': 'tampabay',
+        'vayafilm': 'snagfilms',
+    }
+    _TOKENS = {}
+
+    def _call_api(self, site, path, video_id, query):
+        token = self._TOKENS.get(site)
+        if not token:
+            token_query = {'site': site}
+            email, password = self._get_login_info(netrc_machine=site)
+            if email:
+                resp = self._download_json(
+                    self._API_BASE + 'identity/signin', video_id,
+                    'Logging in', query=token_query, data=json.dumps({
+                        'email': email,
+                        'password': password,
+                    }).encode())
+            else:
+                resp = self._download_json(
+                    self._API_BASE + 'identity/anonymous-token', video_id,
+                    'Downloading authorization token', query=token_query)
+            self._TOKENS[site] = token = resp['authorizationToken']
+        return self._download_json(
+            self._API_BASE + path, video_id,
+            headers={'Authorization': token}, query=query)
 
 
 class ViewLiftEmbedIE(ViewLiftBaseIE):
-    _VALID_URL = r'https?://(?:(?:www|embed)\.)?(?:%s)/embed/player\?.*\bfilmId=(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' % ViewLiftBaseIE._DOMAINS_REGEX
+    IE_NAME = 'viewlift:embed'
+    _VALID_URL = r'https?://(?:(?:www|embed)\.)?(?P<domain>%s)/embed/player\?.*\bfilmId=(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' % ViewLiftBaseIE._DOMAINS_REGEX
     _TESTS = [{
         'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500',
         'md5': '2924e9215c6eff7a55ed35b72276bd93',
@@ -30,6 +64,9 @@ class ViewLiftEmbedIE(ViewLiftBaseIE):
             'id': '74849a00-85a9-11e1-9660-123139220831',
             'ext': 'mp4',
             'title': '#whilewewatch',
+            'description': 'md5:b542bef32a6f657dadd0df06e26fb0c8',
+            'timestamp': 1334350096,
+            'upload_date': '20120413',
         }
     }, {
        # invalid labels, 360p is better than 480p
@@ -39,7 +76,8 @@ class ViewLiftEmbedIE(ViewLiftBaseIE):
             'id': '17ca0950-a74a-11e0-a92a-0026bb61d036',
             'ext': 'mp4',
             'title': 'Life in Limbo',
-        }
+        },
+        'skip': 'The video does not exist',
     }, {
         'url': 'http://www.snagfilms.com/embed/player?filmId=0000014c-de2f-d5d6-abcf-ffef58af0017',
         'only_matching': True,
@@ -54,67 +92,68 @@ class ViewLiftEmbedIE(ViewLiftBaseIE):
             return mobj.group('url')
 
     def _real_extract(self, url):
-        video_id = self._match_id(url)
-
-        webpage = self._download_webpage(url, video_id)
-
-        if '>This film is not playable in your area.<' in webpage:
-            raise ExtractorError(
-                'Film %s is not playable in your area.' % video_id, expected=True)
+        domain, film_id = re.match(self._VALID_URL, url).groups()
+        site = domain.split('.')[-2]
+        if site in self._SITE_MAP:
+            site = self._SITE_MAP[site]
+        try:
+            content_data = self._call_api(
+                site, 'entitlement/video/status', film_id, {
+                    'id': film_id
+                })['video']
+        except ExtractorError as e:
+            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+                error_message = self._parse_json(e.cause.read().decode(), film_id).get('errorMessage')
+                if error_message == 'User does not have a valid subscription or has not purchased this content.':
+                    self.raise_login_required()
+                raise ExtractorError(error_message, expected=True)
+            raise
+        gist = content_data['gist']
+        title = gist['title']
+        video_assets = content_data['streamingInfo']['videoAssets']
 
         formats = []
-        has_bitrate = False
-        sources = self._parse_json(self._search_regex(
-            r'(?s)sources:\s*(\[.+?\]),', webpage,
-            'sources', default='[]'), video_id, js_to_json)
-        for source in sources:
-            file_ = source.get('file')
-            if not file_:
+        mpeg_video_assets = video_assets.get('mpeg') or []
+        for video_asset in mpeg_video_assets:
+            video_asset_url = video_asset.get('url')
+            if not video_asset_url:
                 continue
-            type_ = source.get('type')
-            ext = determine_ext(file_)
-            format_id = source.get('label') or ext
-            if all(v in ('m3u8', 'hls') for v in (type_, ext)):
-                formats.extend(self._extract_m3u8_formats(
-                    file_, video_id, 'mp4', 'm3u8_native',
-                    m3u8_id='hls', fatal=False))
-            else:
-                bitrate = int_or_none(self._search_regex(
-                    [r'(\d+)kbps', r'_\d{1,2}x\d{1,2}_(\d{3,})\.%s' % ext],
-                    file_, 'bitrate', default=None))
-                if not has_bitrate and bitrate:
-                    has_bitrate = True
-                height = int_or_none(self._search_regex(
-                    r'^(\d+)[pP]$', format_id, 'height', default=None))
-                formats.append({
-                    'url': file_,
-                    'format_id': 'http-%s%s' % (format_id, ('-%dk' % bitrate if bitrate else '')),
-                    'tbr': bitrate,
-                    'height': height,
-                })
-        if not formats:
-            hls_url = self._parse_json(self._search_regex(
-                r'filmInfo\.src\s*=\s*({.+?});',
-                webpage, 'src'), video_id, js_to_json)['src']
-            formats = self._extract_m3u8_formats(
-                hls_url, video_id, 'mp4', 'm3u8_native',
-                m3u8_id='hls', fatal=False)
-        field_preference = None if has_bitrate else ('height', 'tbr', 'format_id')
-        self._sort_formats(formats, field_preference)
-
-        title = self._search_regex(
-            [r"title\s*:\s*'([^']+)'", r'<title>([^<]+)</title>'],
-            webpage, 'title')
-
-        return {
-            'id': video_id,
+            bitrate = int_or_none(video_asset.get('bitrate'))
+            height = int_or_none(self._search_regex(
+                r'^_?(\d+)[pP]$', video_asset.get('renditionValue'),
+                'height', default=None))
+            formats.append({
+                'url': video_asset_url,
+                'format_id': 'http%s' % ('-%d' % bitrate if bitrate else ''),
+                'tbr': bitrate,
+                'height': height,
+                'vcodec': video_asset.get('codec'),
+            })
+
+        hls_url = video_assets.get('hls')
+        if hls_url:
+            formats.extend(self._extract_m3u8_formats(
+                hls_url, film_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
+        self._sort_formats(formats, ('height', 'tbr', 'format_id'))
+
+        info = {
+            'id': film_id,
             'title': title,
+            'description': gist.get('description'),
+            'thumbnail': gist.get('videoImageUrl'),
+            'duration': int_or_none(gist.get('runtime')),
+            'age_limit': parse_age_limit(content_data.get('parentalRating')),
+            'timestamp': int_or_none(gist.get('publishDate'), 1000),
             'formats': formats,
         }
+        for k in ('categories', 'tags'):
+            info[k] = [v['title'] for v in content_data.get(k, []) if v.get('title')]
+        return info
 
 
 class ViewLiftIE(ViewLiftBaseIE):
-    _VALID_URL = r'https?://(?:www\.)?(?P<domain>%s)(?:/(?:films/title|show|(?:news/)?videos?))?/(?P<id>[^?#]+)' % ViewLiftBaseIE._DOMAINS_REGEX
+    IE_NAME = 'viewlift'
+    _VALID_URL = r'https?://(?:www\.)?(?P<domain>%s)(?P<path>(?:/(?:films/title|show|(?:news/)?videos?|watch))?/(?P<id>[^?#]+))' % ViewLiftBaseIE._DOMAINS_REGEX
     _TESTS = [{
         'url': 'http://www.snagfilms.com/films/title/lost_for_life',
         'md5': '19844f897b35af219773fd63bdec2942',
@@ -151,10 +190,13 @@ class ViewLiftIE(ViewLiftBaseIE):
             'id': '00000148-7b53-de26-a9fb-fbf306f70020',
             'display_id': 'augie_alone/s_2_ep_12_love',
             'ext': 'mp4',
-            'title': 'Augie, Alone:S. 2 Ep. 12 - Love',
-            'description': 'md5:db2a5c72d994f16a780c1eb353a8f403',
+            'title': 'S. 2 Ep. 12 - Love',
+            'description': 'Augie finds love.',
             'thumbnail': r're:^https?://.*\.jpg',
             'duration': 107,
+            'upload_date': '20141012',
+            'timestamp': 1413129540,
+            'age_limit': 17,
         },
         'params': {
             'skip_download': True,
@@ -177,6 +219,9 @@ class ViewLiftIE(ViewLiftBaseIE):
         # Was once Kaltura embed
         'url': 'https://www.monumentalsportsnetwork.com/videos/john-carlson-postgame-2-25-15',
         'only_matching': True,
+    }, {
+        'url': 'https://www.marquee.tv/watch/sadlerswells-sacredmonsters',
+        'only_matching': True,
     }]
 
     @classmethod
@@ -184,119 +229,22 @@ class ViewLiftIE(ViewLiftBaseIE):
         return False if ViewLiftEmbedIE.suitable(url) else super(ViewLiftIE, cls).suitable(url)
 
     def _real_extract(self, url):
-        domain, display_id = re.match(self._VALID_URL, url).groups()
-
-        webpage = self._download_webpage(url, display_id)
-
-        if ">Sorry, the Film you're looking for is not available.<" in webpage:
-            raise ExtractorError(
-                'Film %s is not available.' % display_id, expected=True)
-
-        initial_store_state = self._search_regex(
-            r"window\.initialStoreState\s*=.*?JSON\.parse\(unescape\(atob\('([^']+)'\)\)\)",
-            webpage, 'Initial Store State', default=None)
-        if initial_store_state:
-            modules = self._parse_json(compat_urllib_parse_unquote(base64.b64decode(
-                initial_store_state).decode()), display_id)['page']['data']['modules']
-            content_data = next(m['contentData'][0] for m in modules if m.get('moduleType') == 'VideoDetailModule')
-            gist = content_data['gist']
-            film_id = gist['id']
-            title = gist['title']
-            video_assets = try_get(
-                content_data, lambda x: x['streamingInfo']['videoAssets'], dict)
-            if not video_assets:
-                token = self._download_json(
-                    'https://prod-api.viewlift.com/identity/anonymous-token',
-                    film_id, 'Downloading authorization token',
-                    query={'site': 'snagfilms'})['authorizationToken']
-                video_assets = self._download_json(
-                    'https://prod-api.viewlift.com/entitlement/video/status',
-                    film_id, headers={
-                        'Authorization': token,
-                        'Referer': url,
-                    }, query={
-                        'id': film_id
-                    })['video']['streamingInfo']['videoAssets']
-
-            formats = []
-            mpeg_video_assets = video_assets.get('mpeg') or []
-            for video_asset in mpeg_video_assets:
-                video_asset_url = video_asset.get('url')
-                if not video_asset:
-                    continue
-                bitrate = int_or_none(video_asset.get('bitrate'))
-                height = int_or_none(self._search_regex(
-                    r'^_?(\d+)[pP]$', video_asset.get('renditionValue'),
-                    'height', default=None))
-                formats.append({
-                    'url': video_asset_url,
-                    'format_id': 'http%s' % ('-%d' % bitrate if bitrate else ''),
-                    'tbr': bitrate,
-                    'height': height,
-                    'vcodec': video_asset.get('codec'),
-                })
-
-            hls_url = video_assets.get('hls')
-            if hls_url:
-                formats.extend(self._extract_m3u8_formats(
-                    hls_url, film_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
-            self._sort_formats(formats, ('height', 'tbr', 'format_id'))
-
-            info = {
-                'id': film_id,
-                'display_id': display_id,
-                'title': title,
-                'description': gist.get('description'),
-                'thumbnail': gist.get('videoImageUrl'),
-                'duration': int_or_none(gist.get('runtime')),
-                'age_limit': parse_age_limit(content_data.get('parentalRating')),
-                'timestamp': int_or_none(gist.get('publishDate'), 1000),
-                'formats': formats,
-            }
-            for k in ('categories', 'tags'):
-                info[k] = [v['title'] for v in content_data.get(k, []) if v.get('title')]
-            return info
-        else:
-            film_id = self._search_regex(r'filmId=([\da-f-]{36})"', webpage, 'film id')
-
-            snag = self._parse_json(
-                self._search_regex(
-                    r'Snag\.page\.data\s*=\s*(\[.+?\]);', webpage, 'snag', default='[]'),
-                display_id)
-
-            for item in snag:
-                if item.get('data', {}).get('film', {}).get('id') == film_id:
-                    data = item['data']['film']
-                    title = data['title']
-                    description = clean_html(data.get('synopsis'))
-                    thumbnail = data.get('image')
-                    duration = int_or_none(data.get('duration') or data.get('runtime'))
-                    categories = [
-                        category['title'] for category in data.get('categories', [])
-                        if category.get('title')]
-                    break
-            else:
-                title = self._html_search_regex(
-                    (r'itemprop="title">([^<]+)<',
-                     r'(?s)itemprop="title">(.+?)<div'), webpage, 'title')
-                description = self._html_search_regex(
-                    r'(?s)<div itemprop="description" class="film-synopsis-inner ">(.+?)</div>',
-                    webpage, 'description', default=None) or self._og_search_description(webpage)
-                thumbnail = self._og_search_thumbnail(webpage)
-                duration = parse_duration(self._search_regex(
-                    r'<span itemprop="duration" class="film-duration strong">([^<]+)<',
-                    webpage, 'duration', fatal=False))
-                categories = re.findall(r'<a href="/movies/[^"]+">([^<]+)</a>', webpage)
-
-            return {
-                '_type': 'url_transparent',
-                'url': 'http://%s/embed/player?filmId=%s' % (domain, film_id),
-                'id': film_id,
-                'display_id': display_id,
-                'title': title,
-                'description': description,
-                'thumbnail': thumbnail,
-                'duration': duration,
-                'categories': categories,
-                'ie_key': 'ViewLiftEmbed',
-            }
+        domain, path, display_id = re.match(self._VALID_URL, url).groups()
+        site = domain.split('.')[-2]
+        if site in self._SITE_MAP:
+            site = self._SITE_MAP[site]
+        modules = self._call_api(
+            site, 'content/pages', display_id, {
+                'includeContent': 'true',
+                'moduleOffset': 1,
+                'path': path,
+                'site': site,
+            })['modules']
+        film_id = next(m['contentData'][0]['gist']['id'] for m in modules if m.get('moduleType') == 'VideoDetailModule')
+        return {
+            '_type': 'url_transparent',
+            'url': 'http://%s/embed/player?filmId=%s' % (domain, film_id),
+            'id': film_id,
+            'display_id': display_id,
+            'ie_key': 'ViewLiftEmbed',
+        }
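
Both ViewLift extractors now funnel through one authenticated JSON API: _call_api fetches a per-site token once (anonymous, or via identity/signin when netrc credentials exist), caches it in _TOKENS, and sends it as an Authorization header on every request. A minimal sketch of that cache (fetch_json is a hypothetical stand-in for _download_json with signature fetch_json(path, query, headers=None); no network I/O happens here):

_TOKENS = {}

def call_api(site, path, query, fetch_json):
    token = _TOKENS.get(site)
    if not token:
        # Anonymous token shown; the diff uses identity/signin instead
        # when credentials are configured for the site.
        resp = fetch_json('identity/anonymous-token', {'site': site})
        _TOKENS[site] = token = resp['authorizationToken']
    return fetch_json(path, query, headers={'Authorization': token})
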
index baa46d5f3513cbde337f144c7143a9c501455ff4..8cd611e1e42f177e7331ca4c3ea08a350a613e7f 100644 (file)
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -33,6 +33,7 @@ from ..utils import (
     unified_timestamp,
     unsmuggle_url,
     urlencode_postdata,
+    urljoin,
     unescapeHTML,
 )
 
@@ -191,7 +192,7 @@ class VimeoBaseInfoExtractor(InfoExtractor):
             for tt in text_tracks:
                 subtitles[tt['lang']] = [{
                     'ext': 'vtt',
-                    'url': 'https://vimeo.com' + tt['url'],
+                    'url': urljoin('https://vimeo.com', tt['url']),
                 }]
 
         thumbnails = []
@@ -591,7 +592,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
             # Retrieve video webpage to extract further information
             webpage, urlh = self._download_webpage_handle(
                 url, video_id, headers=headers)
-            redirect_url = compat_str(urlh.geturl())
+            redirect_url = urlh.geturl()
         except ExtractorError as ee:
             if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
                 errmsg = ee.cause.read()
@@ -841,33 +842,6 @@ class VimeoChannelIE(VimeoBaseInfoExtractor):
         return self._TITLE or self._html_search_regex(
             self._TITLE_RE, webpage, 'list title', fatal=False)
 
-    def _login_list_password(self, page_url, list_id, webpage):
-        login_form = self._search_regex(
-            r'(?s)<form[^>]+?id="pw_form"(.*?)</form>',
-            webpage, 'login form', default=None)
-        if not login_form:
-            return webpage
-
-        password = self._downloader.params.get('videopassword')
-        if password is None:
-            raise ExtractorError('This album is protected by a password, use the --video-password option', expected=True)
-        fields = self._hidden_inputs(login_form)
-        token, vuid = self._extract_xsrft_and_vuid(webpage)
-        fields['token'] = token
-        fields['password'] = password
-        post = urlencode_postdata(fields)
-        password_path = self._search_regex(
-            r'action="([^"]+)"', login_form, 'password URL')
-        password_url = compat_urlparse.urljoin(page_url, password_path)
-        password_request = sanitized_Request(password_url, post)
-        password_request.add_header('Content-type', 'application/x-www-form-urlencoded')
-        self._set_vimeo_cookie('vuid', vuid)
-        self._set_vimeo_cookie('xsrft', token)
-
-        return self._download_webpage(
-            password_request, list_id,
-            'Verifying the password', 'Wrong password')
-
     def _title_and_entries(self, list_id, base_url):
         for pagenum in itertools.count(1):
             page_url = self._page_url(base_url, pagenum)
@@ -876,7 +850,6 @@ class VimeoChannelIE(VimeoBaseInfoExtractor):
                 'Downloading page %s' % pagenum)
 
             if pagenum == 1:
-                webpage = self._login_list_password(page_url, list_id, webpage)
                 yield self._extract_list_title(webpage)
 
             # Try extracting href first since not all videos are available via
@@ -923,7 +896,7 @@ class VimeoUserIE(VimeoChannelIE):
     _BASE_URL_TEMPL = 'https://vimeo.com/%s'
 
 
-class VimeoAlbumIE(VimeoChannelIE):
+class VimeoAlbumIE(VimeoBaseInfoExtractor):
     IE_NAME = 'vimeo:album'
     _VALID_URL = r'https://vimeo\.com/(?:album|showcase)/(?P<id>\d+)(?:$|[?#]|/(?!video))'
     _TITLE_RE = r'<header id="page_header">\n\s*<h1>(.*?)</h1>'
@@ -973,13 +946,39 @@ class VimeoAlbumIE(VimeoChannelIE):
     def _real_extract(self, url):
         album_id = self._match_id(url)
         webpage = self._download_webpage(url, album_id)
-        webpage = self._login_list_password(url, album_id, webpage)
-        api_config = self._extract_vimeo_config(webpage, album_id)['api']
+        viewer = self._parse_json(self._search_regex(
+            r'bootstrap_data\s*=\s*({.+?})</script>',
+            webpage, 'bootstrap data'), album_id)['viewer']
+        jwt = viewer['jwt']
+        album = self._download_json(
+            'https://api.vimeo.com/albums/' + album_id,
+            album_id, headers={'Authorization': 'jwt ' + jwt},
+            query={'fields': 'description,name,privacy'})
+        hashed_pass = None
+        if try_get(album, lambda x: x['privacy']['view']) == 'password':
+            password = self._downloader.params.get('videopassword')
+            if not password:
+                raise ExtractorError(
+                    'This album is protected by a password, use the --video-password option',
+                    expected=True)
+            self._set_vimeo_cookie('vuid', viewer['vuid'])
+            try:
+                hashed_pass = self._download_json(
+                    'https://vimeo.com/showcase/%s/auth' % album_id,
+                    album_id, 'Verifying the password', data=urlencode_postdata({
+                        'password': password,
+                        'token': viewer['xsrft'],
+                    }), headers={
+                        'X-Requested-With': 'XMLHttpRequest',
+                    })['hashed_pass']
+            except ExtractorError as e:
+                if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+                    raise ExtractorError('Wrong password', expected=True)
+                raise
         entries = OnDemandPagedList(functools.partial(
-            self._fetch_page, album_id, api_config['jwt'],
-            api_config.get('hashed_pass')), self._PAGE_SIZE)
-        return self.playlist_result(entries, album_id, self._html_search_regex(
-            r'<title>\s*(.+?)(?:\s+on Vimeo)?</title>', webpage, 'title', fatal=False))
+            self._fetch_page, album_id, jwt, hashed_pass), self._PAGE_SIZE)
+        return self.playlist_result(
+            entries, album_id, album.get('name'), album.get('description'))
 
 
 class VimeoGroupsIE(VimeoChannelIE):
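
The album rewrite swaps form scraping for the showcase JSON API: the viewer's jwt from the page's bootstrap data authorizes the album lookup, and a password-protected showcase is unlocked by POSTing the password plus xsrft token to /showcase/<id>/auth, whose hashed_pass then feeds the paged video listing. A sketch of the unlock step, assuming a plain urllib POST in place of the extractor helpers (a 401 response is what the extractor maps to 'Wrong password'):

import json
import urllib.parse
import urllib.request

def unlock_showcase(album_id, password, xsrft):
    # POST the password and CSRF token; the JSON reply carries hashed_pass.
    req = urllib.request.Request(
        'https://vimeo.com/showcase/%s/auth' % album_id,
        data=urllib.parse.urlencode(
            {'password': password, 'token': xsrft}).encode(),
        headers={'X-Requested-With': 'XMLHttpRequest'})
    with urllib.request.urlopen(req) as resp:
        return json.load(resp)['hashed_pass']
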
index 085514d470bcee6d5719a9260ceefdfc3a9cdb17..168e5e90152b44d76dcbbbeb1b274db5dcbf5827 100644 (file)
--- a/youtube_dl/extractor/wistia.py
+++ b/youtube_dl/extractor/wistia.py
@@ -45,22 +45,23 @@ class WistiaIE(InfoExtractor):
     # https://wistia.com/support/embed-and-share/video-on-your-website
     @staticmethod
     def _extract_url(webpage):
-        match = re.search(
-            r'<(?:meta[^>]+?content|(?:iframe|script)[^>]+?src)=["\'](?P<url>(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/[a-z0-9]{10})', webpage)
-        if match:
-            return unescapeHTML(match.group('url'))
+        urls = WistiaIE._extract_urls(webpage)
+        return urls[0] if urls else None
 
-        match = re.search(
-            r'''(?sx)
-                <script[^>]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*?
-                <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]{10})\b.*?\2
-            ''', webpage)
-        if match:
-            return 'wistia:%s' % match.group('id')
-
-        match = re.search(r'(?:data-wistia-?id=["\']|Wistia\.embed\(["\']|id=["\']wistia_)(?P<id>[a-z0-9]{10})', webpage)
-        if match:
-            return 'wistia:%s' % match.group('id')
+    @staticmethod
+    def _extract_urls(webpage):
+        urls = []
+        for match in re.finditer(
+                r'<(?:meta[^>]+?content|(?:iframe|script)[^>]+?src)=["\'](?P<url>(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/[a-z0-9]{10})', webpage):
+            urls.append(unescapeHTML(match.group('url')))
+        for match in re.finditer(
+                r'''(?sx)
+                    <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]{10})\b.*?\2
+                ''', webpage):
+            urls.append('wistia:%s' % match.group('id'))
+        for match in re.finditer(r'(?:data-wistia-?id=["\']|Wistia\.embed\(["\']|id=["\']wistia_)(?P<id>[a-z0-9]{10})', webpage):
+            urls.append('wistia:%s' % match.group('id'))
+        return urls
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
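
_extract_url survives as a thin shim over the new _extract_urls, so single-embed callers keep working while pages with several embeds now yield every URL. The same delegation in isolation (toy pattern, not the extractor's full regex set):

import re

def extract_urls(webpage):
    return ['wistia:%s' % m.group('id') for m in re.finditer(
        r'wistia_async_(?P<id>[a-z0-9]{10})', webpage)]

def extract_url(webpage):
    # First match or None, matching the old single-URL contract.
    urls = extract_urls(webpage)
    return urls[0] if urls else None

assert extract_url('<div class="wistia_async_untlgzk1v7">') == 'wistia:untlgzk1v7'
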
index a5b94d2794166d452464b728ec40b2b258459c64..0f7be6a7d93adc3a4fea8c6995cd8b58a084b9b4 100644 (file)
--- a/youtube_dl/extractor/xhamster.py
+++ b/youtube_dl/extractor/xhamster.py
@@ -113,7 +113,7 @@ class XHamsterIE(InfoExtractor):
         display_id = mobj.group('display_id') or mobj.group('display_id_2')
 
         desktop_url = re.sub(r'^(https?://(?:.+?\.)?)m\.', r'\1', url)
-        webpage = self._download_webpage(desktop_url, video_id)
+        webpage, urlh = self._download_webpage_handle(desktop_url, video_id)
 
         error = self._html_search_regex(
             r'<div[^>]+id=["\']videoClosed["\'][^>]*>(.+?)</div>',
@@ -161,6 +161,9 @@ class XHamsterIE(InfoExtractor):
                         'ext': determine_ext(format_url, 'mp4'),
                         'height': get_height(quality),
                         'filesize': filesize,
+                        'http_headers': {
+                            'Referer': urlh.geturl(),
+                        },
                     })
             self._sort_formats(formats)
 
index c6c0b3291c8320064fa0a7529be5b5d78f14461c..01b253dcb1e8c92232a06c0b2b4153a545dabcc1 100644 (file)
--- a/youtube_dl/extractor/xtube.py
+++ b/youtube_dl/extractor/xtube.py
@@ -47,7 +47,7 @@ class XTubeIE(InfoExtractor):
             'display_id': 'A-Super-Run-Part-1-YT',
             'ext': 'flv',
             'title': 'A Super Run - Part 1 (YT)',
-            'description': 'md5:ca0d47afff4a9b2942e4b41aa970fd93',
+            'description': 'md5:4cc3af1aa1b0413289babc88f0d4f616',
             'uploader': 'tshirtguy59',
             'duration': 579,
             'view_count': int,
@@ -87,10 +87,24 @@ class XTubeIE(InfoExtractor):
                 'Cookie': 'age_verified=1; cookiesAccepted=1',
             })
 
-        sources = self._parse_json(self._search_regex(
-            r'(["\'])?sources\1?\s*:\s*(?P<sources>{.+?}),',
-            webpage, 'sources', group='sources'), video_id,
-            transform_source=js_to_json)
+        title, thumbnail, duration, sources = [None] * 4
+
+        config = self._parse_json(self._search_regex(
+            r'playerConf\s*=\s*({.+?})\s*,\s*\n', webpage, 'config',
+            default='{}'), video_id, transform_source=js_to_json, fatal=False)
+        if config:
+            config = config.get('mainRoll')
+            if isinstance(config, dict):
+                title = config.get('title')
+                thumbnail = config.get('poster')
+                duration = int_or_none(config.get('duration'))
+                sources = config.get('sources') or config.get('format')
+
+        if not isinstance(sources, dict):
+            sources = self._parse_json(self._search_regex(
+                r'(["\'])?sources\1?\s*:\s*(?P<sources>{.+?}),',
+                webpage, 'sources', group='sources'), video_id,
+                transform_source=js_to_json)
 
         formats = []
         for format_id, format_url in sources.items():
@@ -102,20 +116,25 @@ class XTubeIE(InfoExtractor):
         self._remove_duplicate_formats(formats)
         self._sort_formats(formats)
 
-        title = self._search_regex(
-            (r'<h1>\s*(?P<title>[^<]+?)\s*</h1>', r'videoTitle\s*:\s*(["\'])(?P<title>.+?)\1'),
-            webpage, 'title', group='title')
-        description = self._search_regex(
+        if not title:
+            title = self._search_regex(
+                (r'<h1>\s*(?P<title>[^<]+?)\s*</h1>', r'videoTitle\s*:\s*(["\'])(?P<title>.+?)\1'),
+                webpage, 'title', group='title')
+        description = self._og_search_description(
+            webpage, default=None) or self._html_search_meta(
+            'twitter:description', webpage, default=None) or self._search_regex(
             r'</h1>\s*<p>([^<]+)', webpage, 'description', fatal=False)
         uploader = self._search_regex(
             (r'<input[^>]+name="contentOwnerId"[^>]+value="([^"]+)"',
              r'<span[^>]+class="nickname"[^>]*>([^<]+)'),
             webpage, 'uploader', fatal=False)
-        duration = parse_duration(self._search_regex(
-            r'<dt>Runtime:?</dt>\s*<dd>([^<]+)</dd>',
-            webpage, 'duration', fatal=False))
+        if not duration:
+            duration = parse_duration(self._search_regex(
+                r'<dt>Runtime:?</dt>\s*<dd>([^<]+)</dd>',
+                webpage, 'duration', fatal=False))
         view_count = str_to_int(self._search_regex(
-            r'<dt>Views:?</dt>\s*<dd>([\d,\.]+)</dd>',
+            (r'["\']viewsCount["\'][^>]*>(\d+)\s+views',
+             r'<dt>Views:?</dt>\s*<dd>([\d,\.]+)</dd>'),
             webpage, 'view count', fatal=False))
         comment_count = str_to_int(self._html_search_regex(
             r'>Comments? \(([\d,\.]+)\)<',
@@ -126,6 +145,7 @@ class XTubeIE(InfoExtractor):
             'display_id': display_id,
             'title': title,
             'description': description,
+            'thumbnail': thumbnail,
             'uploader': uploader,
             'duration': duration,
             'view_count': view_count,
@@ -144,7 +164,7 @@ class XTubeUserIE(InfoExtractor):
             'id': 'greenshowers-4056496',
             'age_limit': 18,
         },
-        'playlist_mincount': 155,
+        'playlist_mincount': 154,
     }
 
     def _real_extract(self, url):
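
The reworked extraction reads title, poster, duration and sources from the page's playerConf JSON when present, and only then falls back to the legacy sources regex. A sketch of the mainRoll probe (json.loads stands in for the js_to_json-based parsing in the diff):

import json

def probe_main_roll(config_json):
    # Pull metadata out of playerConf.mainRoll when the page provides it.
    title = thumbnail = duration = sources = None
    config = json.loads(config_json or '{}').get('mainRoll')
    if isinstance(config, dict):
        title = config.get('title')
        thumbnail = config.get('poster')
        duration = config.get('duration')
        sources = config.get('sources') or config.get('format')
    return title, thumbnail, duration, sources

print(probe_main_roll('{"mainRoll": {"title": "A Super Run", "format": {"mp4": "..."}}}'))
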
index dff69fcb7aca250373fc0e70b2f8278ed2661755..88aabd272c98e944523f3b333174342ba23c9fe1 100644 (file)
--- a/youtube_dl/extractor/youjizz.py
+++ b/youtube_dl/extractor/youjizz.py
@@ -44,7 +44,7 @@ class YouJizzIE(InfoExtractor):
 
         encodings = self._parse_json(
             self._search_regex(
-                r'encodings\s*=\s*(\[.+?\]);\n', webpage, 'encodings',
+                r'[Ee]ncodings\s*=\s*(\[.+?\]);\n', webpage, 'encodings',
                 default='[]'),
             video_id, fatal=False)
         for encoding in encodings:
index eacaa5ecdb70d2a16748b4c2e58edc14d7d69484..908defecd3f24ef4070c46f6e8d195aea625ab4b 100644 (file)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -29,7 +29,6 @@ from ..compat import (
 from ..utils import (
     bool_or_none,
     clean_html,
-    dict_get,
     error_to_compat_str,
     extract_attributes,
     ExtractorError,
@@ -570,7 +569,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'upload_date': '20120506',
                 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
                 'alt_title': 'I Love It (feat. Charli XCX)',
-                'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8',
+                'description': 'md5:19a2f98d9032b9311e686ed039564f63',
                 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli',
                          'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop',
                          'iconic ep', 'iconic', 'love', 'it'],
@@ -685,12 +684,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'id': 'nfWlot6h_JM',
                 'ext': 'm4a',
                 'title': 'Taylor Swift - Shake It Off',
-                'description': 'md5:bec2185232c05479482cb5a9b82719bf',
+                'description': 'md5:307195cd21ff7fa352270fe884570ef0',
                 'duration': 242,
                 'uploader': 'TaylorSwiftVEVO',
                 'uploader_id': 'TaylorSwiftVEVO',
                 'upload_date': '20140818',
-                'creator': 'Taylor Swift',
             },
             'params': {
                 'youtube_include_dash_manifest': True,
@@ -755,11 +753,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'upload_date': '20100430',
                 'uploader_id': 'deadmau5',
                 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
-                'creator': 'deadmau5',
+                'creator': 'Dada Life, deadmau5',
                 'description': 'md5:12c56784b8032162bb936a5f76d55360',
                 'uploader': 'deadmau5',
                 'title': 'Deadmau5 - Some Chords (HD)',
-                'alt_title': 'Some Chords',
+                'alt_title': 'This Machine Kills Some Chords',
             },
             'expected_warnings': [
                 'DASH manifest missing',
@@ -1135,6 +1133,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'skip_download': True,
                 'youtube_include_dash_manifest': False,
             },
+            'skip': 'no longer relevant',
         },
         {
             # Youtube Music Auto-generated description
@@ -1145,8 +1144,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'title': 'Voyeur Girl',
                 'description': 'md5:7ae382a65843d6df2685993e90a8628f',
                 'upload_date': '20190312',
-                'uploader': 'Various Artists - Topic',
-                'uploader_id': 'UCVWKBi1ELZn0QX2CBLSkiyw',
+                'uploader': 'Stephen - Topic',
+                'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA',
                 'artist': 'Stephen',
                 'track': 'Voyeur Girl',
                 'album': 'it\'s too much love to know my dear',
@@ -1210,7 +1209,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'id': '-hcAI0g-f5M',
                 'ext': 'mp4',
                 'title': 'Put It On Me',
-                'description': 'md5:93c55acc682ae7b0c668f2e34e1c069e',
+                'description': 'md5:f6422397c07c4c907c6638e1fee380a5',
                 'upload_date': '20180426',
                 'uploader': 'Matt Maeson - Topic',
                 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ',
@@ -1256,7 +1255,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
 
     def _extract_signature_function(self, video_id, player_url, example_sig):
         id_m = re.match(
-            r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2,3}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$',
+            r'.*?[-.](?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2,3}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$',
             player_url)
         if not id_m:
             raise ExtractorError('Cannot identify player %r' % player_url)
@@ -1708,9 +1707,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         def extract_view_count(v_info):
             return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
 
-        def extract_token(v_info):
-            return dict_get(v_info, ('account_playback_token', 'accountPlaybackToken', 'token'))
-
         def extract_player_response(player_response, video_id):
             pl_response = str_or_none(player_response)
             if not pl_response:
@@ -1723,6 +1719,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         player_response = {}
 
         # Get video info
+        video_info = {}
         embed_webpage = None
         if re.search(r'player-age-gate-content">', video_webpage) is not None:
             age_gate = True
@@ -1737,19 +1734,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                     r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''),
             })
             video_info_url = proto + '://www.youtube.com/get_video_info?' + data
-            video_info_webpage = self._download_webpage(
-                video_info_url, video_id,
-                note='Refetching age-gated info webpage',
-                errnote='unable to download video info webpage')
-            video_info = compat_parse_qs(video_info_webpage)
-            pl_response = video_info.get('player_response', [None])[0]
-            player_response = extract_player_response(pl_response, video_id)
-            add_dash_mpd(video_info)
-            view_count = extract_view_count(video_info)
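+            # The refetch may fail (e.g. with an HTTP error); tolerate this
+            # and fall back to whatever data was extracted elsewhere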
+            try:
+                video_info_webpage = self._download_webpage(
+                    video_info_url, video_id,
+                    note='Refetching age-gated info webpage',
+                    errnote='unable to download video info webpage')
+            except ExtractorError:
+                video_info_webpage = None
+            if video_info_webpage:
+                video_info = compat_parse_qs(video_info_webpage)
+                pl_response = video_info.get('player_response', [None])[0]
+                player_response = extract_player_response(pl_response, video_id)
+                add_dash_mpd(video_info)
+                view_count = extract_view_count(video_info)
         else:
             age_gate = False
-            video_info = None
-            sts = None
             # Try looking directly into the video webpage
             ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
             if ytplayer_config:
@@ -1766,61 +1765,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                         args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
                 if args.get('livestream') == '1' or args.get('live_playback') == 1:
                     is_live = True
-                sts = ytplayer_config.get('sts')
                 if not player_response:
                     player_response = extract_player_response(args.get('player_response'), video_id)
             if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
                 add_dash_mpd_pr(player_response)
-                # We also try looking in get_video_info since it may contain different dashmpd
-                # URL that points to a DASH manifest with possibly different itag set (some itags
-                # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH
-                # manifest pointed by get_video_info's dashmpd).
-                # The general idea is to take a union of itags of both DASH manifests (for example
-                # video with such 'manifest behavior' see https://github.com/ytdl-org/youtube-dl/issues/6093)
-                self.report_video_info_webpage_download(video_id)
-                for el in ('embedded', 'detailpage', 'vevo', ''):
-                    query = {
-                        'video_id': video_id,
-                        'ps': 'default',
-                        'eurl': '',
-                        'gl': 'US',
-                        'hl': 'en',
-                    }
-                    if el:
-                        query['el'] = el
-                    if sts:
-                        query['sts'] = sts
-                    video_info_webpage = self._download_webpage(
-                        '%s://www.youtube.com/get_video_info' % proto,
-                        video_id, note=False,
-                        errnote='unable to download video info webpage',
-                        fatal=False, query=query)
-                    if not video_info_webpage:
-                        continue
-                    get_video_info = compat_parse_qs(video_info_webpage)
-                    if not player_response:
-                        pl_response = get_video_info.get('player_response', [None])[0]
-                        player_response = extract_player_response(pl_response, video_id)
-                    add_dash_mpd(get_video_info)
-                    if view_count is None:
-                        view_count = extract_view_count(get_video_info)
-                    if not video_info:
-                        video_info = get_video_info
-                    get_token = extract_token(get_video_info)
-                    if get_token:
-                        # Different get_video_info requests may report different results, e.g.
-                        # some may report video unavailability, but some may serve it without
-                        # any complaint (see https://github.com/ytdl-org/youtube-dl/issues/7362,
-                        # the original webpage as well as el=info and el=embedded get_video_info
-                        # requests report video unavailability due to geo restriction while
-                        # el=detailpage succeeds and returns valid data). This is probably
-                        # due to YouTube measures against IP ranges of hosting providers.
-                        # Working around by preferring the first succeeded video_info containing
-                        # the token if no such video_info yet was found.
-                        token = extract_token(video_info)
-                        if not token:
-                            video_info = get_video_info
-                        break
 
         def extract_unavailable_message():
             messages = []
@@ -1833,13 +1781,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             if messages:
                 return '\n'.join(messages)
 
-        if not video_info:
+        if not video_info and not player_response:
             unavailable_message = extract_unavailable_message()
             if not unavailable_message:
                 unavailable_message = 'Unable to extract video data'
             raise ExtractorError(
                 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id)
 
+        if not isinstance(video_info, dict):
+            video_info = {}
+
         video_details = try_get(
             player_response, lambda x: x['videoDetails'], dict) or {}
 
@@ -2035,7 +1986,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                                 else:
                                     player_version = self._search_regex(
                                         [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',
-                                         r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'],
+                                         r'(?:www|player(?:_ias)?)[-.]([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'],
                                         player_url,
                                         'html5 player', fatal=False)
                                     player_desc = 'html5 player %s' % player_version
@@ -2392,30 +2343,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                         f['stretched_ratio'] = ratio
 
         if not formats:
-            token = extract_token(video_info)
-            if not token:
-                if 'reason' in video_info:
-                    if 'The uploader has not made this video available in your country.' in video_info['reason']:
-                        regions_allowed = self._html_search_meta(
-                            'regionsAllowed', video_webpage, default=None)
-                        countries = regions_allowed.split(',') if regions_allowed else None
-                        self.raise_geo_restricted(
-                            msg=video_info['reason'][0], countries=countries)
-                    reason = video_info['reason'][0]
-                    if 'Invalid parameters' in reason:
-                        unavailable_message = extract_unavailable_message()
-                        if unavailable_message:
-                            reason = unavailable_message
-                    raise ExtractorError(
-                        'YouTube said: %s' % reason,
-                        expected=True, video_id=video_id)
-                else:
-                    raise ExtractorError(
-                        '"token" parameter not in video info for unknown reason',
-                        video_id=video_id)
-
-        if not formats and (video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos'])):
-            raise ExtractorError('This video is DRM protected.', expected=True)
+            if 'reason' in video_info:
+                if 'The uploader has not made this video available in your country.' in video_info['reason']:
+                    regions_allowed = self._html_search_meta(
+                        'regionsAllowed', video_webpage, default=None)
+                    countries = regions_allowed.split(',') if regions_allowed else None
+                    self.raise_geo_restricted(
+                        msg=video_info['reason'][0], countries=countries)
+                reason = video_info['reason'][0]
+                if 'Invalid parameters' in reason:
+                    unavailable_message = extract_unavailable_message()
+                    if unavailable_message:
+                        reason = unavailable_message
+                raise ExtractorError(
+                    'YouTube said: %s' % reason,
+                    expected=True, video_id=video_id)
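+            # Formats may also be missing because the video is DRM protected,
+            # in which case license info is exposed instead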
+            if video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos']):
+                raise ExtractorError('This video is DRM protected.', expected=True)
 
         self._sort_formats(formats)
 
@@ -2495,20 +2439,23 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
     _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})'
     IE_NAME = 'youtube:playlist'
     _TESTS = [{
-        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
+        'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
         'info_dict': {
-            'title': 'ytdl test PL',
-            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
+            'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
+            'uploader': 'Sergey M.',
+            'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
+            'title': 'youtube-dl public playlist',
         },
-        'playlist_count': 3,
+        'playlist_count': 1,
     }, {
-        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
+        'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
         'info_dict': {
-            'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
-            'title': 'YDL_Empty_List',
+            'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
+            'uploader': 'Sergey M.',
+            'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
+            'title': 'youtube-dl empty playlist',
         },
         'playlist_count': 0,
-        'skip': 'This playlist is private',
     }, {
         'note': 'Playlist with deleted videos (#651). As a bonus, video #51 also appears twice in this list.',
         'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
@@ -2518,7 +2465,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
             'uploader': 'Christiaan008',
             'uploader_id': 'ChRiStIaAn008',
         },
-        'playlist_count': 95,
+        'playlist_count': 96,
     }, {
         'note': 'issue #673',
         'url': 'PLBB231211A4F62143',
index bacb82eeeb2a549edbb0cbf6d0a67e07f28b595b..f6496f5168cf057c9c415cc7461105462ad66370 100644 (file)
--- a/youtube_dl/extractor/zapiks.py
+++ b/youtube_dl/extractor/zapiks.py
@@ -29,7 +29,6 @@ class ZapiksIE(InfoExtractor):
                 'timestamp': 1359044972,
                 'upload_date': '20130124',
                 'view_count': int,
-                'comment_count': int,
             },
         },
         {
index 145c123a42fee5e67c0fd8c2750ea13562632666..656864b2ed8a9982c1da934b01aff6bcb7126acf 100644 (file)
--- a/youtube_dl/extractor/zdf.py
+++ b/youtube_dl/extractor/zdf.py
@@ -244,14 +244,14 @@ class ZDFChannelIE(ZDFBaseIE):
             'id': 'das-aktuelle-sportstudio',
             'title': 'das aktuelle sportstudio | ZDF',
         },
-        'playlist_count': 21,
+        'playlist_mincount': 23,
     }, {
         'url': 'https://www.zdf.de/dokumentation/planet-e',
         'info_dict': {
             'id': 'planet-e',
             'title': 'planet e.',
         },
-        'playlist_count': 4,
+        'playlist_mincount': 50,
     }, {
         'url': 'https://www.zdf.de/filme/taunuskrimi/',
         'only_matching': True,
index 1ffabc62bedacb42aeb34f585d04ed7bc3ff8045..8826b382c3cc70ab21bacef8de94314ec193948b 100644 (file)
--- a/youtube_dl/options.py
+++ b/youtube_dl/options.py
@@ -134,7 +134,7 @@ def parseOpts(overrideArguments=None):
         action='help',
         help='Print this help text and exit')
     general.add_option(
-        '-v', '--version',
+        '--version',
         action='version',
         help='Print program version and exit')
     general.add_option(
index 002ea7f3386215c61bcf3bc60419d0059abf2bc2..84c9646171e0b8f8d6a6397bff5339205cdadcd7 100644 (file)
--- a/youtube_dl/update.py
+++ b/youtube_dl/update.py
@@ -9,6 +9,7 @@ import subprocess
 import sys
 from zipimport import zipimporter
 
+from .compat import compat_realpath
 from .utils import encode_compat_str
 
 from .version import __version__
@@ -84,7 +85,9 @@ def update_self(to_screen, verbose, opener):
     print_notes(to_screen, versions_info['versions'])
 
     # sys.executable is set to the full pathname of the exe-file for py2exe
-    filename = sys.executable if hasattr(sys, 'frozen') else sys.argv[0]
+    # however, symlinks are not followed, so we need to resolve the real
+    # path manually with the help of realpath
+    filename = compat_realpath(sys.executable if hasattr(sys, 'frozen') else sys.argv[0])
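+    # e.g. a /usr/local/bin/youtube-dl symlink is resolved so the update is
+    # written to its target rather than over the symlink (paths illustrative)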
 
     if not os.access(filename, os.W_OK):
         to_screen('ERROR: no write permissions on %s' % filename)
index f6204692a81002cdfc44b02d183126e755283bd9..38262bee4cd08f97e4f5009d7066d1466bde25a4 100644 (file)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -2729,6 +2729,11 @@ class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
 
 
 class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
+    """
+    See [1] for cookie file format.
+
+    1. https://curl.haxx.se/docs/http-cookies.html
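+
+    A minimal illustrative entry (fields are tab-separated: domain,
+    include-subdomains flag, path, secure flag, expiry, name, value):
+
+        .example.com	TRUE	/	FALSE	1590000000	name	value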
+    """
     _HTTPONLY_PREFIX = '#HttpOnly_'
 
     def save(self, filename=None, ignore_discard=False, ignore_expires=False):
@@ -2795,6 +2800,15 @@ class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
     https_response = http_response
 
 
+class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
+    if sys.version_info[0] < 3:
+        def redirect_request(self, req, fp, code, msg, headers, newurl):
+            # On Python 2, urlh.geturl() may sometimes return the redirect
+            # URL as a byte string instead of unicode. This workaround forces
+            # it to always return unicode.
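+            # (e.g. b'https://www.example.com/' -> u'https://www.example.com/')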
+            return compat_urllib_request.HTTPRedirectHandler.redirect_request(self, req, fp, code, msg, headers, compat_str(newurl))
+
+
 def extract_timezone(date_str):
     m = re.search(
         r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
index fa6f7289a0e11c045f741090bfaaef6f006e5ed6..5aedd32688ef2c9c2d6409d85091e4f2e4afe618 100644 (file)
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '2020.01.24'
+__version__ = '2020.03.24'