ExtractorError,
find_xpath_attr,
fix_xml_ampersands,
+ float_or_none,
get_element_by_class,
get_element_by_attribute,
get_elements_by_class,
get_elements_by_attribute,
InAdvancePagedList,
+ int_or_none,
intlist_to_bytes,
is_html,
js_to_json,
limit_length,
+ merge_dicts,
mimetype2ext,
month_by_name,
+ multipart_encode,
ohdave_rsa_encrypt,
OnDemandPagedList,
orderedSet,
parse_filesize,
parse_count,
parse_iso8601,
+ parse_resolution,
+ parse_bitrate,
pkcs1pad,
read_batch_urls,
sanitize_filename,
sanitize_path,
+ sanitize_url,
expand_path,
prepend_extension,
replace_extension,
smuggle_url,
str_to_int,
strip_jsonp,
+ strip_or_none,
timeconvert,
unescapeHTML,
unified_strdate,
uppercase_escape,
lowercase_escape,
url_basename,
+ url_or_none,
base_url,
urljoin,
urlencode_postdata,
compat_chr,
compat_etree_fromstring,
compat_getenv,
+ compat_os_name,
compat_setenv,
compat_urlparse,
compat_parse_qs,
self.assertEqual(sanitize_filename(
'ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ', restricted=True),
- 'AAAAAAAECEEEEIIIIDNOOOOOOOOEUUUUUYPssaaaaaaaeceeeeiiiionooooooooeuuuuuypy')
+ 'AAAAAAAECEEEEIIIIDNOOOOOOOOEUUUUUYTHssaaaaaaaeceeeeiiiionooooooooeuuuuuythy')
def test_sanitize_ids(self):
self.assertEqual(sanitize_filename('_n_cd26wFpw', is_id=True), '_n_cd26wFpw')
self.assertEqual(sanitize_path('./abc'), 'abc')
self.assertEqual(sanitize_path('./../abc'), '..\\abc')
+ def test_sanitize_url(self):
+ self.assertEqual(sanitize_url('//foo.bar'), 'http://foo.bar')
+ self.assertEqual(sanitize_url('httpss://foo.bar'), 'https://foo.bar')
+ self.assertEqual(sanitize_url('rmtps://foo.bar'), 'rtmps://foo.bar')
+ self.assertEqual(sanitize_url('https://foo.bar'), 'https://foo.bar')
+
def test_expand_path(self):
def env(var):
return '%{0}%'.format(var) if sys.platform == 'win32' else '${0}'.format(var)
self.assertEqual(unescapeHTML('/'), '/')
self.assertEqual(unescapeHTML('é'), 'é')
self.assertEqual(unescapeHTML('�'), '�')
+ self.assertEqual(unescapeHTML('&a"'), '&a"')
# HTML5 entities
self.assertEqual(unescapeHTML('.''), '.\'')
self.assertEqual(unified_timestamp('UNKNOWN DATE FORMAT'), None)
self.assertEqual(unified_timestamp('May 16, 2016 11:15 PM'), 1463440500)
self.assertEqual(unified_timestamp('Feb 7, 2016 at 6:35 pm'), 1454870100)
+ self.assertEqual(unified_timestamp('2017-03-30T17:52:41Q'), 1490896361)
+ self.assertEqual(unified_timestamp('Sep 11, 2013 | 5:49 AM'), 1378878540)
+ self.assertEqual(unified_timestamp('December 15, 2017 at 7:49 am'), 1513324140)
+ self.assertEqual(unified_timestamp('2018-03-14T08:32:43.1493874+00:00'), 1521016363)
def test_determine_ext(self):
self.assertEqual(determine_ext('http://example.com/foo/bar.mp4/?download'), 'mp4')
self.assertEqual(determine_ext('http://example.com/foo/bar.nonext/?download', None), None)
self.assertEqual(determine_ext('http://example.com/foo/bar/mp4?download', None), None)
self.assertEqual(determine_ext('http://example.com/foo/bar.m3u8//?download'), 'm3u8')
+ self.assertEqual(determine_ext('foobar', None), None)
def test_find_xpath_attr(self):
testxml = '''<root>
def test_shell_quote(self):
args = ['ffmpeg', '-i', encodeFilename('ñ€ß\'.mp4')]
- self.assertEqual(shell_quote(args), """ffmpeg -i 'ñ€ß'"'"'.mp4'""")
+ self.assertEqual(
+ shell_quote(args),
+ """ffmpeg -i 'ñ€ß'"'"'.mp4'""" if compat_os_name != 'nt' else '''ffmpeg -i "ñ€ß'.mp4"''')
+
+ def test_float_or_none(self):
+ self.assertEqual(float_or_none('42.42'), 42.42)
+ self.assertEqual(float_or_none('42'), 42.0)
+ self.assertEqual(float_or_none(''), None)
+ self.assertEqual(float_or_none(None), None)
+ self.assertEqual(float_or_none([]), None)
+ self.assertEqual(float_or_none(set()), None)
+
+ def test_int_or_none(self):
+ self.assertEqual(int_or_none('42'), 42)
+ self.assertEqual(int_or_none(''), None)
+ self.assertEqual(int_or_none(None), None)
+ self.assertEqual(int_or_none([]), None)
+ self.assertEqual(int_or_none(set()), None)
def test_str_to_int(self):
self.assertEqual(str_to_int('123,456'), 123456)
self.assertEqual(urljoin('http://foo.de/', ''), None)
self.assertEqual(urljoin('http://foo.de/', ['foobar']), None)
self.assertEqual(urljoin('http://foo.de/a/b/c.txt', '.././../d.txt'), 'http://foo.de/d.txt')
+ self.assertEqual(urljoin('http://foo.de/a/b/c.txt', 'rtmp://foo.de'), 'rtmp://foo.de')
+ self.assertEqual(urljoin(None, 'rtmp://foo.de'), 'rtmp://foo.de')
+
+ def test_url_or_none(self):
+ self.assertEqual(url_or_none(None), None)
+ self.assertEqual(url_or_none(''), None)
+ self.assertEqual(url_or_none('foo'), None)
+ self.assertEqual(url_or_none('http://foo.de'), 'http://foo.de')
+ self.assertEqual(url_or_none('https://foo.de'), 'https://foo.de')
+ self.assertEqual(url_or_none('http$://foo.de'), None)
+ self.assertEqual(url_or_none('http://foo.de'), 'http://foo.de')
+ self.assertEqual(url_or_none('//foo.de'), '//foo.de')
def test_parse_age_limit(self):
self.assertEqual(parse_age_limit(None), None)
self.assertEqual(parse_age_limit('PG-13'), 13)
self.assertEqual(parse_age_limit('TV-14'), 14)
self.assertEqual(parse_age_limit('TV-MA'), 17)
+ self.assertEqual(parse_age_limit('TV14'), 14)
+ self.assertEqual(parse_age_limit('TV_G'), 0)
def test_parse_duration(self):
self.assertEqual(parse_duration(None), None)
self.assertEqual(parse_duration('87 Min.'), 5220)
self.assertEqual(parse_duration('PT1H0.040S'), 3600.04)
self.assertEqual(parse_duration('PT00H03M30SZ'), 210)
+ self.assertEqual(parse_duration('P0Y0M0DT0H4M20.880S'), 260.88)
def test_fix_xml_ampersands(self):
self.assertEqual(
'http://example.com/path', {'test': '第二行тест'})),
query_dict('http://example.com/path?test=%E7%AC%AC%E4%BA%8C%E8%A1%8C%D1%82%D0%B5%D1%81%D1%82'))
+ def test_multipart_encode(self):
+ self.assertEqual(
+ multipart_encode({b'field': b'value'}, boundary='AAAAAA')[0],
+ b'--AAAAAA\r\nContent-Disposition: form-data; name="field"\r\n\r\nvalue\r\n--AAAAAA--\r\n')
+ self.assertEqual(
+ multipart_encode({'欄位'.encode('utf-8'): '值'.encode('utf-8')}, boundary='AAAAAA')[0],
+ b'--AAAAAA\r\nContent-Disposition: form-data; name="\xe6\xac\x84\xe4\xbd\x8d"\r\n\r\n\xe5\x80\xbc\r\n--AAAAAA--\r\n')
+ self.assertRaises(
+ ValueError, multipart_encode, {b'field': b'value'}, boundary='value')
+
def test_dict_get(self):
FALSE_VALUES = {
'none': None,
self.assertEqual(dict_get(d, ('b', 'c', key, )), None)
self.assertEqual(dict_get(d, ('b', 'c', key, ), skip_false_values=False), false_value)
+ def test_merge_dicts(self):
+ self.assertEqual(merge_dicts({'a': 1}, {'b': 2}), {'a': 1, 'b': 2})
+ self.assertEqual(merge_dicts({'a': 1}, {'a': 2}), {'a': 1})
+ self.assertEqual(merge_dicts({'a': 1}, {'a': None}), {'a': 1})
+ self.assertEqual(merge_dicts({'a': 1}, {'a': ''}), {'a': 1})
+ self.assertEqual(merge_dicts({'a': 1}, {}), {'a': 1})
+ self.assertEqual(merge_dicts({'a': None}, {'a': 1}), {'a': 1})
+ self.assertEqual(merge_dicts({'a': ''}, {'a': 1}), {'a': ''})
+ self.assertEqual(merge_dicts({'a': ''}, {'a': 'abc'}), {'a': 'abc'})
+ self.assertEqual(merge_dicts({'a': None}, {'a': ''}, {'a': 'abc'}), {'a': 'abc'})
+
def test_encode_compat_str(self):
self.assertEqual(encode_compat_str(b'\xd1\x82\xd0\xb5\xd1\x81\xd1\x82', 'utf-8'), 'тест')
self.assertEqual(encode_compat_str('тест', 'utf-8'), 'тест')
d = json.loads(stripped)
self.assertEqual(d, {'status': 'success'})
+ stripped = strip_jsonp('window.cb && window.cb({"status": "success"});')
+ d = json.loads(stripped)
+ self.assertEqual(d, {'status': 'success'})
+
+ stripped = strip_jsonp('window.cb && cb({"status": "success"});')
+ d = json.loads(stripped)
+ self.assertEqual(d, {'status': 'success'})
+
+ stripped = strip_jsonp('({"status": "success"});')
+ d = json.loads(stripped)
+ self.assertEqual(d, {'status': 'success'})
+
+ def test_strip_or_none(self):
+ self.assertEqual(strip_or_none(' abc'), 'abc')
+ self.assertEqual(strip_or_none('abc '), 'abc')
+ self.assertEqual(strip_or_none(' abc '), 'abc')
+ self.assertEqual(strip_or_none('\tabc\t'), 'abc')
+ self.assertEqual(strip_or_none('\n\tabc\n\t'), 'abc')
+ self.assertEqual(strip_or_none('abc'), 'abc')
+ self.assertEqual(strip_or_none(''), '')
+ self.assertEqual(strip_or_none(None), None)
+ self.assertEqual(strip_or_none(42), None)
+ self.assertEqual(strip_or_none([]), None)
+
def test_uppercase_escape(self):
self.assertEqual(uppercase_escape('aä'), 'aä')
self.assertEqual(uppercase_escape('\\U0001d550'), '𝕐')
'vcodec': 'h264',
'acodec': 'aac',
})
+ self.assertEqual(parse_codecs('av01.0.05M.08'), {
+ 'vcodec': 'av01.0.05M.08',
+ 'acodec': 'none',
+ })
+ self.assertEqual(parse_codecs('theora, vorbis'), {
+ 'vcodec': 'theora',
+ 'acodec': 'vorbis',
+ })
+ self.assertEqual(parse_codecs('unknownvcodec, unknownacodec'), {
+ 'vcodec': 'unknownvcodec',
+ 'acodec': 'unknownacodec',
+ })
+ self.assertEqual(parse_codecs('unknown'), {})
def test_escape_rfc3986(self):
reserved = "!*'();:@&=+$,/?#[]"
inp = '''{"duration": "00:01:07"}'''
self.assertEqual(js_to_json(inp), '''{"duration": "00:01:07"}''')
+ inp = '''{segments: [{"offset":-3.885780586188048e-16,"duration":39.75000000000001}]}'''
+ self.assertEqual(js_to_json(inp), '''{"segments": [{"offset":-3.885780586188048e-16,"duration":39.75000000000001}]}''')
+
def test_js_to_json_edgecases(self):
on = js_to_json("{abc_def:'1\\'\\\\2\\\\\\'3\"4'}")
self.assertEqual(json.loads(on), {"abc_def": "1'\\2\\'3\"4"})
on = js_to_json('{/*comment\n*/42/*comment\n*/:/*comment\n*/42/*comment\n*/}')
self.assertEqual(json.loads(on), {'42': 42})
+ on = js_to_json('{42:4.2e1}')
+ self.assertEqual(json.loads(on), {'42': 42.0})
+
+ def test_js_to_json_malformed(self):
+ self.assertEqual(js_to_json('42a1'), '42"a1"')
+ self.assertEqual(js_to_json('42a-1'), '42"a"-1')
+
def test_extract_attributes(self):
self.assertEqual(extract_attributes('<e x="y">'), {'x': 'y'})
self.assertEqual(extract_attributes("<e x='y'>"), {'x': 'y'})
supports_outside_bmp = False
if supports_outside_bmp:
self.assertEqual(extract_attributes('<e x="Smile 😀!">'), {'x': 'Smile \U0001f600!'})
+ # Malformed HTML should not break attributes extraction on older Python
+ self.assertEqual(extract_attributes('<mal"formed/>'), {})
def test_clean_html(self):
self.assertEqual(clean_html('a:\nb'), 'a: b')
self.assertEqual(clean_html('a:\n "b"'), 'a: "b"')
+ self.assertEqual(clean_html('a<br>\xa0b'), 'a\nb')
def test_intlist_to_bytes(self):
self.assertEqual(
def test_args_to_str(self):
self.assertEqual(
args_to_str(['foo', 'ba/r', '-baz', '2 be', '']),
- 'foo ba/r -baz \'2 be\' \'\''
+ 'foo ba/r -baz \'2 be\' \'\'' if compat_os_name != 'nt' else 'foo ba/r -baz "2 be" ""'
)
def test_parse_filesize(self):
self.assertEqual(parse_count('1.1kk '), 1100000)
self.assertEqual(parse_count('1.1kk views'), 1100000)
+ def test_parse_resolution(self):
+ self.assertEqual(parse_resolution(None), {})
+ self.assertEqual(parse_resolution(''), {})
+ self.assertEqual(parse_resolution('1920x1080'), {'width': 1920, 'height': 1080})
+ self.assertEqual(parse_resolution('1920×1080'), {'width': 1920, 'height': 1080})
+ self.assertEqual(parse_resolution('1920 x 1080'), {'width': 1920, 'height': 1080})
+ self.assertEqual(parse_resolution('720p'), {'height': 720})
+ self.assertEqual(parse_resolution('4k'), {'height': 2160})
+ self.assertEqual(parse_resolution('8K'), {'height': 4320})
+
+ def test_parse_bitrate(self):
+ self.assertEqual(parse_bitrate(None), None)
+ self.assertEqual(parse_bitrate(''), None)
+ self.assertEqual(parse_bitrate('300kbps'), 300)
+ self.assertEqual(parse_bitrate('1500kbps'), 1500)
+ self.assertEqual(parse_bitrate('300 kbps'), 300)
+
def test_version_tuple(self):
self.assertEqual(version_tuple('1'), (1,))
self.assertEqual(version_tuple('10.23.344'), (10, 23, 344))
self.assertFalse(match_str(
'like_count > 100 & dislike_count <? 50 & description',
{'like_count': 190, 'dislike_count': 10}))
+ self.assertTrue(match_str('is_live', {'is_live': True}))
+ self.assertFalse(match_str('is_live', {'is_live': False}))
+ self.assertFalse(match_str('is_live', {'is_live': None}))
+ self.assertFalse(match_str('is_live', {}))
+ self.assertFalse(match_str('!is_live', {'is_live': True}))
+ self.assertTrue(match_str('!is_live', {'is_live': False}))
+ self.assertTrue(match_str('!is_live', {'is_live': None}))
+ self.assertTrue(match_str('!is_live', {}))
+ self.assertTrue(match_str('title', {'title': 'abc'}))
+ self.assertTrue(match_str('title', {'title': ''}))
+ self.assertFalse(match_str('!title', {'title': 'abc'}))
+ self.assertFalse(match_str('!title', {'title': ''}))
def test_parse_dfxp_time_expr(self):
self.assertEqual(parse_dfxp_time_expr(None), None)
<p begin="3" dur="-1">Ignored, three</p>
</div>
</body>
- </tt>'''
+ </tt>'''.encode('utf-8')
srt_data = '''1
00:00:00,000 --> 00:00:01,000
The following line contains Chinese characters and special symbols
<p begin="0" end="1">The first line</p>
</div>
</body>
- </tt>'''
+ </tt>'''.encode('utf-8')
srt_data = '''1
00:00:00,000 --> 00:00:01,000
The first line
'''
self.assertEqual(dfxp2srt(dfxp_data_no_default_namespace), srt_data)
+ dfxp_data_with_style = '''<?xml version="1.0" encoding="utf-8"?>
+<tt xmlns="http://www.w3.org/2006/10/ttaf1" xmlns:ttp="http://www.w3.org/2006/10/ttaf1#parameter" ttp:timeBase="media" xmlns:tts="http://www.w3.org/2006/10/ttaf1#style" xml:lang="en" xmlns:ttm="http://www.w3.org/2006/10/ttaf1#metadata">
+ <head>
+ <styling>
+ <style id="s2" style="s0" tts:color="cyan" tts:fontWeight="bold" />
+ <style id="s1" style="s0" tts:color="yellow" tts:fontStyle="italic" />
+ <style id="s3" style="s0" tts:color="lime" tts:textDecoration="underline" />
+ <style id="s0" tts:backgroundColor="black" tts:fontStyle="normal" tts:fontSize="16" tts:fontFamily="sansSerif" tts:color="white" />
+ </styling>
+ </head>
+ <body tts:textAlign="center" style="s0">
+ <div>
+ <p begin="00:00:02.08" id="p0" end="00:00:05.84">default style<span tts:color="red">custom style</span></p>
+ <p style="s2" begin="00:00:02.08" id="p0" end="00:00:05.84"><span tts:color="lime">part 1<br /></span><span tts:color="cyan">part 2</span></p>
+ <p style="s3" begin="00:00:05.84" id="p1" end="00:00:09.56">line 3<br />part 3</p>
+ <p style="s1" tts:textDecoration="underline" begin="00:00:09.56" id="p2" end="00:00:12.36"><span style="s2" tts:color="lime">inner<br /> </span>style</p>
+ </div>
+ </body>
+</tt>'''.encode('utf-8')
+ srt_data = '''1
+00:00:02,080 --> 00:00:05,839
+<font color="white" face="sansSerif" size="16">default style<font color="red">custom style</font></font>
+
+2
+00:00:02,080 --> 00:00:05,839
+<b><font color="cyan" face="sansSerif" size="16"><font color="lime">part 1
+</font>part 2</font></b>
+
+3
+00:00:05,839 --> 00:00:09,560
+<u><font color="lime">line 3
+part 3</font></u>
+
+4
+00:00:09,560 --> 00:00:12,359
+<i><u><font color="yellow"><font color="lime">inner
+ </font>style</font></u></i>
+
+'''
+ self.assertEqual(dfxp2srt(dfxp_data_with_style), srt_data)
+
+ dfxp_data_non_utf8 = '''<?xml version="1.0" encoding="UTF-16"?>
+ <tt xmlns="http://www.w3.org/ns/ttml" xml:lang="en" xmlns:tts="http://www.w3.org/ns/ttml#parameter">
+ <body>
+ <div xml:lang="en">
+ <p begin="0" end="1">Line 1</p>
+ <p begin="1" end="2">第二行</p>
+ </div>
+ </body>
+ </tt>'''.encode('utf-16')
+ srt_data = '''1
+00:00:00,000 --> 00:00:01,000
+Line 1
+
+2
+00:00:01,000 --> 00:00:02,000
+第二行
+
+'''
+ self.assertEqual(dfxp2srt(dfxp_data_non_utf8), srt_data)
+
def test_cli_option(self):
self.assertEqual(cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy'), ['--proxy', '127.0.0.1:3128'])
self.assertEqual(cli_option({'proxy': None}, '--proxy', 'proxy'), [])
cli_bool_option(
{'nocheckcertificate': False}, '--check-certificate', 'nocheckcertificate', 'false', 'true', '='),
['--check-certificate=true'])
+ self.assertEqual(
+ cli_bool_option(
+ {}, '--check-certificate', 'nocheckcertificate', 'false', 'true', '='),
+ [])
def test_ohdave_rsa_encrypt(self):
N = 0xab86b6371b5318aaa1d3c9e612a9f1264f372323c8c0f19875b5fc3b3fd3afcc1e5bec527aa94bfa85bffc157e4245aebda05389a5357b75115ac94f074aefcd
self.assertEqual(get_element_by_attribute('class', 'foo', html), None)
self.assertEqual(get_element_by_attribute('class', 'no-such-foo', html), None)
+ html = '''
+ <div itemprop="author" itemscope>foo</div>
+ '''
+
+ self.assertEqual(get_element_by_attribute('itemprop', 'author', html), 'foo')
+
def test_get_elements_by_class(self):
html = '''
<span class="foo bar">nice</span><span class="foo bar">also nice</span>