Raphaël G. Git Repositories - youtubedl/blobdiff - test/test_InfoExtractor.py
Prepare to upload.
[youtubedl] / test / test_InfoExtractor.py
index 4833396a521bf1d7a072db8ad425bed333235248..71f6608feae4a5bcad37e96d74e795e7535a5dc7 100644 (file)
@@ -9,11 +9,30 @@ import sys
 import unittest
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
-from test.helper import FakeYDL, expect_dict, expect_value
-from youtube_dl.compat import compat_etree_fromstring
+from test.helper import FakeYDL, expect_dict, expect_value, http_server_port
+from youtube_dl.compat import compat_etree_fromstring, compat_http_server
 from youtube_dl.extractor.common import InfoExtractor
 from youtube_dl.extractor import YoutubeIE, get_info_extractor
 from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError, RegexNotFoundError
+import threading
+
+
+TEAPOT_RESPONSE_STATUS = 418  # status code the local test server sends for '/teapot'
+TEAPOT_RESPONSE_BODY = "<h1>418 I'm a teapot</h1>"  # body sent along with that status
+
+
+class InfoExtractorTestRequestHandler(compat_http_server.BaseHTTPRequestHandler):
+    def log_message(self, format, *args):  # keep test output free of per-request logging
+        pass
+
+    def do_GET(self):  # '/teapot' gets the canned teapot response; any other path gets a 404
+        if self.path == '/teapot':
+            self.send_response(TEAPOT_RESPONSE_STATUS)
+            self.send_header('Content-Type', 'text/html; charset=utf-8')
+            self.end_headers()
+            self.wfile.write(TEAPOT_RESPONSE_BODY.encode('utf-8'))  # match the declared charset
+        else:
+            self.send_error(404)  # explicit reply; an assert here would vanish under -O
 
 
 class TestIE(InfoExtractor):
@@ -42,6 +61,7 @@ class TestInfoExtractor(unittest.TestCase):
             <meta content='Foo' property=og:foobar>
             <meta name="og:test1" content='foo > < bar'/>
             <meta name="og:test2" content="foo >//< bar"/>
+            <meta property=og-test3 content='Ill-formatted opengraph'/>
             '''
         self.assertEqual(ie._og_search_title(html), 'Foo')
         self.assertEqual(ie._og_search_description(html), 'Some video\'s description ')
@@ -50,6 +70,7 @@ class TestInfoExtractor(unittest.TestCase):
         self.assertEqual(ie._og_search_property('foobar', html), 'Foo')
         self.assertEqual(ie._og_search_property('test1', html), 'foo > < bar')
         self.assertEqual(ie._og_search_property('test2', html), 'foo >//< bar')
+        self.assertEqual(ie._og_search_property('test3', html), 'Ill-formatted opengraph')
         self.assertEqual(ie._og_search_property(('test0', 'test1'), html), 'foo > < bar')
         self.assertRaises(RegexNotFoundError, ie._og_search_property, 'test0', html, None, fatal=True)
         self.assertRaises(RegexNotFoundError, ie._og_search_property, ('test0', 'test00'), html, None, fatal=True)
@@ -86,6 +107,184 @@ class TestInfoExtractor(unittest.TestCase):
         self.assertRaises(ExtractorError, self.ie._download_json, uri, None)
         self.assertEqual(self.ie._download_json(uri, None, fatal=False), None)
 
+    def test_parse_html5_media_entries(self):  # each case: real-world <video> markup -> expected first entry
+        # from https://www.r18.com/
+        # with kbps in label
+        expect_dict(
+            self,
+            self.ie._parse_html5_media_entries(
+                'https://www.r18.com/',
+                r'''
+                <video id="samplevideo_amateur" class="js-samplevideo video-js vjs-default-skin vjs-big-play-centered" controls preload="auto" width="400" height="225" poster="//pics.r18.com/digital/amateur/mgmr105/mgmr105jp.jpg">
+                    <source id="video_source" src="https://awscc3001.r18.com/litevideo/freepv/m/mgm/mgmr105/mgmr105_sm_w.mp4" type="video/mp4"  res="240" label="300kbps">
+                    <source id="video_source" src="https://awscc3001.r18.com/litevideo/freepv/m/mgm/mgmr105/mgmr105_dm_w.mp4" type="video/mp4"  res="480" label="1000kbps">
+                    <source id="video_source" src="https://awscc3001.r18.com/litevideo/freepv/m/mgm/mgmr105/mgmr105_dmb_w.mp4" type="video/mp4"  res="740" label="1500kbps">
+                    <p>Your browser does not support the video tag.</p>
+                </video>
+                ''', None)[0],
+            {
+                'formats': [{
+                    'url': 'https://awscc3001.r18.com/litevideo/freepv/m/mgm/mgmr105/mgmr105_sm_w.mp4',
+                    'ext': 'mp4',
+                    'format_id': '300kbps',
+                    'height': 240,
+                    'tbr': 300,
+                }, {
+                    'url': 'https://awscc3001.r18.com/litevideo/freepv/m/mgm/mgmr105/mgmr105_dm_w.mp4',
+                    'ext': 'mp4',
+                    'format_id': '1000kbps',
+                    'height': 480,
+                    'tbr': 1000,
+                }, {
+                    'url': 'https://awscc3001.r18.com/litevideo/freepv/m/mgm/mgmr105/mgmr105_dmb_w.mp4',
+                    'ext': 'mp4',
+                    'format_id': '1500kbps',
+                    'height': 740,
+                    'tbr': 1500,
+                }],
+                'thumbnail': '//pics.r18.com/digital/amateur/mgmr105/mgmr105jp.jpg'
+            })
+
+        # from https://www.csfd.cz/
+        # with width and height
+        expect_dict(
+            self,
+            self.ie._parse_html5_media_entries(
+                'https://www.csfd.cz/',
+                r'''
+                <video width="770" height="328" preload="none" controls  poster="https://img.csfd.cz/files/images/film/video/preview/163/344/163344118_748d20.png?h360" >
+                    <source src="https://video.csfd.cz/files/videos/157/750/157750813/163327358_eac647.mp4" type="video/mp4" width="640" height="360">
+                    <source src="https://video.csfd.cz/files/videos/157/750/157750813/163327360_3d2646.mp4" type="video/mp4" width="1280" height="720">
+                    <source src="https://video.csfd.cz/files/videos/157/750/157750813/163327356_91f258.mp4" type="video/mp4" width="1920" height="1080">
+                    <source src="https://video.csfd.cz/files/videos/157/750/157750813/163327359_962b4a.webm" type="video/webm" width="640" height="360">
+                    <source src="https://video.csfd.cz/files/videos/157/750/157750813/163327361_6feee0.webm" type="video/webm" width="1280" height="720">
+                    <source src="https://video.csfd.cz/files/videos/157/750/157750813/163327357_8ab472.webm" type="video/webm" width="1920" height="1080">
+                    <track src="https://video.csfd.cz/files/subtitles/163/344/163344115_4c388b.srt" type="text/x-srt" kind="subtitles" srclang="cs" label="cs">
+                </video>
+                ''', None)[0],
+            {
+                'formats': [{
+                    'url': 'https://video.csfd.cz/files/videos/157/750/157750813/163327358_eac647.mp4',
+                    'ext': 'mp4',
+                    'width': 640,
+                    'height': 360,
+                }, {
+                    'url': 'https://video.csfd.cz/files/videos/157/750/157750813/163327360_3d2646.mp4',
+                    'ext': 'mp4',
+                    'width': 1280,
+                    'height': 720,
+                }, {
+                    'url': 'https://video.csfd.cz/files/videos/157/750/157750813/163327356_91f258.mp4',
+                    'ext': 'mp4',
+                    'width': 1920,
+                    'height': 1080,
+                }, {
+                    'url': 'https://video.csfd.cz/files/videos/157/750/157750813/163327359_962b4a.webm',
+                    'ext': 'webm',
+                    'width': 640,
+                    'height': 360,
+                }, {
+                    'url': 'https://video.csfd.cz/files/videos/157/750/157750813/163327361_6feee0.webm',
+                    'ext': 'webm',
+                    'width': 1280,
+                    'height': 720,
+                }, {
+                    'url': 'https://video.csfd.cz/files/videos/157/750/157750813/163327357_8ab472.webm',
+                    'ext': 'webm',
+                    'width': 1920,
+                    'height': 1080,
+                }],
+                'subtitles': {
+                    'cs': [{'url': 'https://video.csfd.cz/files/subtitles/163/344/163344115_4c388b.srt'}]
+                },
+                'thumbnail': 'https://img.csfd.cz/files/images/film/video/preview/163/344/163344118_748d20.png?h360'
+            })
+
+        # from https://tamasha.com/v/Kkdjw
+        # with height in label
+        expect_dict(
+            self,
+            self.ie._parse_html5_media_entries(
+                'https://tamasha.com/v/Kkdjw',
+                r'''
+                <video crossorigin="anonymous">
+                        <source src="https://s-v2.tamasha.com/statics/videos_file/19/8f/Kkdjw_198feff8577d0057536e905cce1fb61438dd64e0_n_240.mp4" type="video/mp4" label="AUTO" res="0"/>
+                                <source src="https://s-v2.tamasha.com/statics/videos_file/19/8f/Kkdjw_198feff8577d0057536e905cce1fb61438dd64e0_n_240.mp4" type="video/mp4"
+                                        label="240p" res="240"/>
+                                <source src="https://s-v2.tamasha.com/statics/videos_file/20/00/Kkdjw_200041c66f657fc967db464d156eafbc1ed9fe6f_n_144.mp4" type="video/mp4"
+                                        label="144p" res="144"/>
+                </video>
+                ''', None)[0],
+            {
+                'formats': [{
+                    'url': 'https://s-v2.tamasha.com/statics/videos_file/19/8f/Kkdjw_198feff8577d0057536e905cce1fb61438dd64e0_n_240.mp4',
+                }, {
+                    'url': 'https://s-v2.tamasha.com/statics/videos_file/19/8f/Kkdjw_198feff8577d0057536e905cce1fb61438dd64e0_n_240.mp4',
+                    'ext': 'mp4',
+                    'format_id': '240p',
+                    'height': 240,
+                }, {
+                    'url': 'https://s-v2.tamasha.com/statics/videos_file/20/00/Kkdjw_200041c66f657fc967db464d156eafbc1ed9fe6f_n_144.mp4',
+                    'ext': 'mp4',
+                    'format_id': '144p',
+                    'height': 144,
+                }]
+            })
+
+        # from https://www.directvnow.com
+        # with data-src
+        expect_dict(
+            self,
+            self.ie._parse_html5_media_entries(
+                'https://www.directvnow.com',
+                r'''
+                <video id="vid1" class="header--video-masked active" muted playsinline>
+                    <source data-src="https://cdn.directv.com/content/dam/dtv/prod/website_directvnow-international/videos/DTVN_hdr_HBO_v3.mp4" type="video/mp4" />
+                </video>
+                ''', None)[0],
+            {
+                'formats': [{
+                    'ext': 'mp4',
+                    'url': 'https://cdn.directv.com/content/dam/dtv/prod/website_directvnow-international/videos/DTVN_hdr_HBO_v3.mp4',
+                }]
+            })
+
+        # from https://www.directvnow.com
+        # with data-src (same markup as the case above)
+        expect_dict(
+            self,
+            self.ie._parse_html5_media_entries(
+                'https://www.directvnow.com',
+                r'''
+                <video id="vid1" class="header--video-masked active" muted playsinline>
+                    <source data-src="https://cdn.directv.com/content/dam/dtv/prod/website_directvnow-international/videos/DTVN_hdr_HBO_v3.mp4" type="video/mp4" />
+                </video>
+                ''', None)[0],
+            {
+                'formats': [{
+                    'url': 'https://cdn.directv.com/content/dam/dtv/prod/website_directvnow-international/videos/DTVN_hdr_HBO_v3.mp4',
+                    'ext': 'mp4',
+                }]
+            })
+
+        # from https://www.klarna.com/uk/
+        # with data-video-src
+        expect_dict(
+            self,
+            self.ie._parse_html5_media_entries(
+                'https://www.klarna.com/uk/',
+                r'''
+                <video loop autoplay muted class="responsive-video block-kl__video video-on-medium">
+                    <source src="" data-video-desktop data-video-src="https://www.klarna.com/uk/wp-content/uploads/sites/11/2019/01/KL062_Smooth3_0_DogWalking_5s_920x080_.mp4" type="video/mp4" />
+                </video>
+                ''', None)[0],
+            {
+                'formats': [{
+                    'url': 'https://www.klarna.com/uk/wp-content/uploads/sites/11/2019/01/KL062_Smooth3_0_DogWalking_5s_920x080_.mp4',
+                    'ext': 'mp4',
+                }],
+            })
+
     def test_extract_jwplayer_data_realworld(self):
         # from http://www.suffolk.edu/sjc/
         expect_dict(
@@ -180,7 +379,7 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
     def test_parse_m3u8_formats(self):
         _TEST_CASES = [
             (
-                # https://github.com/rg3/youtube-dl/issues/11507
+                # https://github.com/ytdl-org/youtube-dl/issues/11507
                 # http://pluzz.francetv.fr/videos/le_ministere.html
                 'pluzz_francetv_11507',
                 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/master.m3u8?caption=2017%2F16%2F156589847-1492488987.m3u8%3Afra%3AFrancais&audiotrack=0%3Afra%3AFrancais',
@@ -242,7 +441,7 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
                 }]
             ),
             (
-                # https://github.com/rg3/youtube-dl/issues/11995
+                # https://github.com/ytdl-org/youtube-dl/issues/11995
                 # http://teamcoco.com/video/clueless-gamer-super-bowl-for-honor
                 'teamcoco_11995',
                 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/main.m3u8',
@@ -316,7 +515,7 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
                 }]
             ),
             (
-                # https://github.com/rg3/youtube-dl/issues/12211
+                # https://github.com/ytdl-org/youtube-dl/issues/12211
                 # http://video.toggle.sg/en/series/whoopie-s-world/ep3/478601
                 'toggle_mobile_12211',
                 'http://cdnapi.kaltura.com/p/2082311/sp/208231100/playManifest/protocol/http/entryId/0_89q6e8ku/format/applehttp/tags/mobile_sd/f/a.m3u8',
@@ -478,7 +677,64 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
                     'width': 1280,
                     'height': 720,
                 }]
-            )
+            ),
+            (
+                # https://github.com/ytdl-org/youtube-dl/issues/18923
+                # https://www.ted.com/talks/boris_hesser_a_grassroots_healthcare_revolution_in_africa
+                'ted_18923',
+                'http://hls.ted.com/talks/31241.m3u8',
+                [{
+                    'url': 'http://hls.ted.com/videos/BorisHesser_2018S/audio/600k.m3u8?nobumpers=true&uniqueId=76011e2b',
+                    'format_id': '600k-Audio',
+                    'vcodec': 'none',
+                }, {
+                    'url': 'http://hls.ted.com/videos/BorisHesser_2018S/audio/600k.m3u8?nobumpers=true&uniqueId=76011e2b',
+                    'format_id': '68',
+                    'vcodec': 'none',
+                }, {
+                    'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/64k.m3u8?nobumpers=true&uniqueId=76011e2b',
+                    'format_id': '163',
+                    'acodec': 'none',
+                    'width': 320,
+                    'height': 180,
+                }, {
+                    'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/180k.m3u8?nobumpers=true&uniqueId=76011e2b',
+                    'format_id': '481',
+                    'acodec': 'none',
+                    'width': 512,
+                    'height': 288,
+                }, {
+                    'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/320k.m3u8?nobumpers=true&uniqueId=76011e2b',
+                    'format_id': '769',
+                    'acodec': 'none',
+                    'width': 512,
+                    'height': 288,
+                }, {
+                    'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/450k.m3u8?nobumpers=true&uniqueId=76011e2b',
+                    'format_id': '984',
+                    'acodec': 'none',
+                    'width': 512,
+                    'height': 288,
+                }, {
+                    'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/600k.m3u8?nobumpers=true&uniqueId=76011e2b',
+                    'format_id': '1255',
+                    'acodec': 'none',
+                    'width': 640,
+                    'height': 360,
+                }, {
+                    'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/950k.m3u8?nobumpers=true&uniqueId=76011e2b',
+                    'format_id': '1693',
+                    'acodec': 'none',
+                    'width': 853,
+                    'height': 480,
+                }, {
+                    'url': 'http://hls.ted.com/videos/BorisHesser_2018S/video/1500k.m3u8?nobumpers=true&uniqueId=76011e2b',
+                    'format_id': '2462',
+                    'acodec': 'none',
+                    'width': 1280,
+                    'height': 720,
+                }]
+            ),
         ]
 
         for m3u8_file, m3u8_url, expected_formats in _TEST_CASES:
@@ -492,11 +748,12 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
     def test_parse_mpd_formats(self):
         _TEST_CASES = [
             (
-                # https://github.com/rg3/youtube-dl/issues/13919
+                # https://github.com/ytdl-org/youtube-dl/issues/13919
                 # Also tests duplicate representation ids, see
-                # https://github.com/rg3/youtube-dl/issues/15111
+                # https://github.com/ytdl-org/youtube-dl/issues/15111
                 'float_duration',
-                'http://unknown/manifest.mpd',
+                'http://unknown/manifest.mpd',  # mpd_url
+                None,  # mpd_base_url
                 [{
                     'manifest_url': 'http://unknown/manifest.mpd',
                     'ext': 'm4a',
@@ -574,9 +831,10 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
                     'height': 1080,
                 }]
             ), (
-                # https://github.com/rg3/youtube-dl/pull/14844
+                # https://github.com/ytdl-org/youtube-dl/pull/14844
                 'urls_only',
-                'http://unknown/manifest.mpd',
+                'http://unknown/manifest.mpd',  # mpd_url
+                None,  # mpd_base_url
                 [{
                     'manifest_url': 'http://unknown/manifest.mpd',
                     'ext': 'mp4',
@@ -655,22 +913,68 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
                     'width': 1920,
                     'height': 1080,
                 }]
+            ), (
+                # https://github.com/ytdl-org/youtube-dl/issues/20346
+                # Media considered unfragmented even though it contains
+                # Initialization tag
+                'unfragmented',
+                'https://v.redd.it/hw1x7rcg7zl21/DASHPlaylist.mpd',  # mpd_url
+                'https://v.redd.it/hw1x7rcg7zl21',  # mpd_base_url
+                [{
+                    'url': 'https://v.redd.it/hw1x7rcg7zl21/audio',
+                    'manifest_url': 'https://v.redd.it/hw1x7rcg7zl21/DASHPlaylist.mpd',
+                    'ext': 'm4a',
+                    'format_id': 'AUDIO-1',
+                    'format_note': 'DASH audio',
+                    'container': 'm4a_dash',
+                    'acodec': 'mp4a.40.2',
+                    'vcodec': 'none',
+                    'tbr': 129.87,
+                    'asr': 48000,
+
+                }, {
+                    'url': 'https://v.redd.it/hw1x7rcg7zl21/DASH_240',
+                    'manifest_url': 'https://v.redd.it/hw1x7rcg7zl21/DASHPlaylist.mpd',
+                    'ext': 'mp4',
+                    'format_id': 'VIDEO-2',
+                    'format_note': 'DASH video',
+                    'container': 'mp4_dash',
+                    'acodec': 'none',
+                    'vcodec': 'avc1.4d401e',
+                    'tbr': 608.0,
+                    'width': 240,
+                    'height': 240,
+                    'fps': 30,
+                }, {
+                    'url': 'https://v.redd.it/hw1x7rcg7zl21/DASH_360',
+                    'manifest_url': 'https://v.redd.it/hw1x7rcg7zl21/DASHPlaylist.mpd',
+                    'ext': 'mp4',
+                    'format_id': 'VIDEO-1',
+                    'format_note': 'DASH video',
+                    'container': 'mp4_dash',
+                    'acodec': 'none',
+                    'vcodec': 'avc1.4d401e',
+                    'tbr': 804.261,
+                    'width': 360,
+                    'height': 360,
+                    'fps': 30,
+                }]
             )
         ]
 
-        for mpd_file, mpd_url, expected_formats in _TEST_CASES:
+        for mpd_file, mpd_url, mpd_base_url, expected_formats in _TEST_CASES:
             with io.open('./test/testdata/mpd/%s.mpd' % mpd_file,
                          mode='r', encoding='utf-8') as f:
                 formats = self.ie._parse_mpd_formats(
                     compat_etree_fromstring(f.read().encode('utf-8')),
-                    mpd_url=mpd_url)
+                    mpd_base_url=mpd_base_url, mpd_url=mpd_url)
                 self.ie._sort_formats(formats)
                 expect_value(self, formats, expected_formats, None)
 
     def test_parse_f4m_formats(self):
         _TEST_CASES = [
             (
-                # https://github.com/rg3/youtube-dl/issues/14660
+                # https://github.com/ytdl-org/youtube-dl/issues/14660
                 'custom_base_url',
                 'http://api.new.livestream.com/accounts/6115179/events/6764928/videos/144884262.f4m',
                 [{
@@ -743,6 +1047,31 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
                 for i in range(len(entries)):
                     expect_dict(self, entries[i], expected_entries[i])
 
+    def test_response_with_expected_status_returns_content(self):
+        # Checks for mitigations against the effects of
+        # <https://bugs.python.org/issue15002> that affect Python 3.4.1+, which
+        # manifest as `_download_webpage`, `_download_xml`, `_download_json`,
+        # or the underlying `_download_webpage_handle` returning no content
+        # when a response matches `expected_status`.
+
+        httpd = compat_http_server.HTTPServer(
+            ('127.0.0.1', 0), InfoExtractorTestRequestHandler)
+        port = http_server_port(httpd)
+        server_thread = threading.Thread(target=httpd.serve_forever)
+        server_thread.daemon = True
+        server_thread.start()
+
+        # Always stop the server and release its socket, even when the
+        # download or the assertion below fails.
+        try:
+            (content, urlh) = self.ie._download_webpage_handle(
+                'http://127.0.0.1:%d/teapot' % port, None,
+                expected_status=TEAPOT_RESPONSE_STATUS)
+            self.assertEqual(content, TEAPOT_RESPONSE_BODY)
+        finally:
+            httpd.shutdown()
+            httpd.server_close()
+
 
 if __name__ == '__main__':
     unittest.main()