X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/b8d8e13c1f9e4d3cdd7d41c5c9d711a36dd5f9c3..fb7740590fb6631cf8e5ae3ba4e7a81b0623cba9:/youtube_dl/utils.py diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 67a847e..7987572 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -11,6 +11,7 @@ import contextlib import ctypes import datetime import email.utils +import email.header import errno import functools import gzip @@ -21,8 +22,8 @@ import locale import math import operator import os -import pipes import platform +import random import re import socket import ssl @@ -34,10 +35,14 @@ import xml.etree.ElementTree import zlib from .compat import ( + compat_HTMLParseError, compat_HTMLParser, compat_basestring, compat_chr, + compat_cookiejar, + compat_ctypes_WINFUNCTYPE, compat_etree_fromstring, + compat_expanduser, compat_html_entities, compat_html_entities_html5, compat_http_client, @@ -45,7 +50,6 @@ from .compat import ( compat_os_name, compat_parse_qs, compat_shlex_quote, - compat_socket_create_connection, compat_str, compat_struct_pack, compat_struct_unpack, @@ -77,8 +81,1592 @@ def register_socks_protocols(): # This is not clearly defined otherwise compiled_regex_type = type(re.compile('')) + +def random_user_agent(): + _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36' + _CHROME_VERSIONS = ( + '74.0.3729.129', + '76.0.3780.3', + '76.0.3780.2', + '74.0.3729.128', + '76.0.3780.1', + '76.0.3780.0', + '75.0.3770.15', + '74.0.3729.127', + '74.0.3729.126', + '76.0.3779.1', + '76.0.3779.0', + '75.0.3770.14', + '74.0.3729.125', + '76.0.3778.1', + '76.0.3778.0', + '75.0.3770.13', + '74.0.3729.124', + '74.0.3729.123', + '73.0.3683.121', + '76.0.3777.1', + '76.0.3777.0', + '75.0.3770.12', + '74.0.3729.122', + '76.0.3776.4', + '75.0.3770.11', + '74.0.3729.121', + '76.0.3776.3', + '76.0.3776.2', + '73.0.3683.120', + '74.0.3729.120', + '74.0.3729.119', + '74.0.3729.118', + '76.0.3776.1', + '76.0.3776.0', + '76.0.3775.5', + '75.0.3770.10', + '74.0.3729.117', + '76.0.3775.4', + '76.0.3775.3', + '74.0.3729.116', + '75.0.3770.9', + '76.0.3775.2', + '76.0.3775.1', + '76.0.3775.0', + '75.0.3770.8', + '74.0.3729.115', + '74.0.3729.114', + '76.0.3774.1', + '76.0.3774.0', + '75.0.3770.7', + '74.0.3729.113', + '74.0.3729.112', + '74.0.3729.111', + '76.0.3773.1', + '76.0.3773.0', + '75.0.3770.6', + '74.0.3729.110', + '74.0.3729.109', + '76.0.3772.1', + '76.0.3772.0', + '75.0.3770.5', + '74.0.3729.108', + '74.0.3729.107', + '76.0.3771.1', + '76.0.3771.0', + '75.0.3770.4', + '74.0.3729.106', + '74.0.3729.105', + '75.0.3770.3', + '74.0.3729.104', + '74.0.3729.103', + '74.0.3729.102', + '75.0.3770.2', + '74.0.3729.101', + '75.0.3770.1', + '75.0.3770.0', + '74.0.3729.100', + '75.0.3769.5', + '75.0.3769.4', + '74.0.3729.99', + '75.0.3769.3', + '75.0.3769.2', + '75.0.3768.6', + '74.0.3729.98', + '75.0.3769.1', + '75.0.3769.0', + '74.0.3729.97', + '73.0.3683.119', + '73.0.3683.118', + '74.0.3729.96', + '75.0.3768.5', + '75.0.3768.4', + '75.0.3768.3', + '75.0.3768.2', + '74.0.3729.95', + '74.0.3729.94', + '75.0.3768.1', + '75.0.3768.0', + '74.0.3729.93', + '74.0.3729.92', + '73.0.3683.117', + '74.0.3729.91', + '75.0.3766.3', + '74.0.3729.90', + '75.0.3767.2', + '75.0.3767.1', + '75.0.3767.0', + '74.0.3729.89', + '73.0.3683.116', + '75.0.3766.2', + '74.0.3729.88', + '75.0.3766.1', + '75.0.3766.0', + '74.0.3729.87', + '73.0.3683.115', + '74.0.3729.86', + '75.0.3765.1', + '75.0.3765.0', + '74.0.3729.85', + '73.0.3683.114', + '74.0.3729.84', + '75.0.3764.1', + '75.0.3764.0', + '74.0.3729.83', + '73.0.3683.113', + '75.0.3763.2', + '75.0.3761.4', + '74.0.3729.82', + '75.0.3763.1', + '75.0.3763.0', + '74.0.3729.81', + '73.0.3683.112', + '75.0.3762.1', + '75.0.3762.0', + '74.0.3729.80', + '75.0.3761.3', + '74.0.3729.79', + '73.0.3683.111', + '75.0.3761.2', + '74.0.3729.78', + '74.0.3729.77', + '75.0.3761.1', + '75.0.3761.0', + '73.0.3683.110', + '74.0.3729.76', + '74.0.3729.75', + '75.0.3760.0', + '74.0.3729.74', + '75.0.3759.8', + '75.0.3759.7', + '75.0.3759.6', + '74.0.3729.73', + '75.0.3759.5', + '74.0.3729.72', + '73.0.3683.109', + '75.0.3759.4', + '75.0.3759.3', + '74.0.3729.71', + '75.0.3759.2', + '74.0.3729.70', + '73.0.3683.108', + '74.0.3729.69', + '75.0.3759.1', + '75.0.3759.0', + '74.0.3729.68', + '73.0.3683.107', + '74.0.3729.67', + '75.0.3758.1', + '75.0.3758.0', + '74.0.3729.66', + '73.0.3683.106', + '74.0.3729.65', + '75.0.3757.1', + '75.0.3757.0', + '74.0.3729.64', + '73.0.3683.105', + '74.0.3729.63', + '75.0.3756.1', + '75.0.3756.0', + '74.0.3729.62', + '73.0.3683.104', + '75.0.3755.3', + '75.0.3755.2', + '73.0.3683.103', + '75.0.3755.1', + '75.0.3755.0', + '74.0.3729.61', + '73.0.3683.102', + '74.0.3729.60', + '75.0.3754.2', + '74.0.3729.59', + '75.0.3753.4', + '74.0.3729.58', + '75.0.3754.1', + '75.0.3754.0', + '74.0.3729.57', + '73.0.3683.101', + '75.0.3753.3', + '75.0.3752.2', + '75.0.3753.2', + '74.0.3729.56', + '75.0.3753.1', + '75.0.3753.0', + '74.0.3729.55', + '73.0.3683.100', + '74.0.3729.54', + '75.0.3752.1', + '75.0.3752.0', + '74.0.3729.53', + '73.0.3683.99', + '74.0.3729.52', + '75.0.3751.1', + '75.0.3751.0', + '74.0.3729.51', + '73.0.3683.98', + '74.0.3729.50', + '75.0.3750.0', + '74.0.3729.49', + '74.0.3729.48', + '74.0.3729.47', + '75.0.3749.3', + '74.0.3729.46', + '73.0.3683.97', + '75.0.3749.2', + '74.0.3729.45', + '75.0.3749.1', + '75.0.3749.0', + '74.0.3729.44', + '73.0.3683.96', + '74.0.3729.43', + '74.0.3729.42', + '75.0.3748.1', + '75.0.3748.0', + '74.0.3729.41', + '75.0.3747.1', + '73.0.3683.95', + '75.0.3746.4', + '74.0.3729.40', + '74.0.3729.39', + '75.0.3747.0', + '75.0.3746.3', + '75.0.3746.2', + '74.0.3729.38', + '75.0.3746.1', + '75.0.3746.0', + '74.0.3729.37', + '73.0.3683.94', + '75.0.3745.5', + '75.0.3745.4', + '75.0.3745.3', + '75.0.3745.2', + '74.0.3729.36', + '75.0.3745.1', + '75.0.3745.0', + '75.0.3744.2', + '74.0.3729.35', + '73.0.3683.93', + '74.0.3729.34', + '75.0.3744.1', + '75.0.3744.0', + '74.0.3729.33', + '73.0.3683.92', + '74.0.3729.32', + '74.0.3729.31', + '73.0.3683.91', + '75.0.3741.2', + '75.0.3740.5', + '74.0.3729.30', + '75.0.3741.1', + '75.0.3741.0', + '74.0.3729.29', + '75.0.3740.4', + '73.0.3683.90', + '74.0.3729.28', + '75.0.3740.3', + '73.0.3683.89', + '75.0.3740.2', + '74.0.3729.27', + '75.0.3740.1', + '75.0.3740.0', + '74.0.3729.26', + '73.0.3683.88', + '73.0.3683.87', + '74.0.3729.25', + '75.0.3739.1', + '75.0.3739.0', + '73.0.3683.86', + '74.0.3729.24', + '73.0.3683.85', + '75.0.3738.4', + '75.0.3738.3', + '75.0.3738.2', + '75.0.3738.1', + '75.0.3738.0', + '74.0.3729.23', + '73.0.3683.84', + '74.0.3729.22', + '74.0.3729.21', + '75.0.3737.1', + '75.0.3737.0', + '74.0.3729.20', + '73.0.3683.83', + '74.0.3729.19', + '75.0.3736.1', + '75.0.3736.0', + '74.0.3729.18', + '73.0.3683.82', + '74.0.3729.17', + '75.0.3735.1', + '75.0.3735.0', + '74.0.3729.16', + '73.0.3683.81', + '75.0.3734.1', + '75.0.3734.0', + '74.0.3729.15', + '73.0.3683.80', + '74.0.3729.14', + '75.0.3733.1', + '75.0.3733.0', + '75.0.3732.1', + '74.0.3729.13', + '74.0.3729.12', + '73.0.3683.79', + '74.0.3729.11', + '75.0.3732.0', + '74.0.3729.10', + '73.0.3683.78', + '74.0.3729.9', + '74.0.3729.8', + '74.0.3729.7', + '75.0.3731.3', + '75.0.3731.2', + '75.0.3731.0', + '74.0.3729.6', + '73.0.3683.77', + '73.0.3683.76', + '75.0.3730.5', + '75.0.3730.4', + '73.0.3683.75', + '74.0.3729.5', + '73.0.3683.74', + '75.0.3730.3', + '75.0.3730.2', + '74.0.3729.4', + '73.0.3683.73', + '73.0.3683.72', + '75.0.3730.1', + '75.0.3730.0', + '74.0.3729.3', + '73.0.3683.71', + '74.0.3729.2', + '73.0.3683.70', + '74.0.3729.1', + '74.0.3729.0', + '74.0.3726.4', + '73.0.3683.69', + '74.0.3726.3', + '74.0.3728.0', + '74.0.3726.2', + '73.0.3683.68', + '74.0.3726.1', + '74.0.3726.0', + '74.0.3725.4', + '73.0.3683.67', + '73.0.3683.66', + '74.0.3725.3', + '74.0.3725.2', + '74.0.3725.1', + '74.0.3724.8', + '74.0.3725.0', + '73.0.3683.65', + '74.0.3724.7', + '74.0.3724.6', + '74.0.3724.5', + '74.0.3724.4', + '74.0.3724.3', + '74.0.3724.2', + '74.0.3724.1', + '74.0.3724.0', + '73.0.3683.64', + '74.0.3723.1', + '74.0.3723.0', + '73.0.3683.63', + '74.0.3722.1', + '74.0.3722.0', + '73.0.3683.62', + '74.0.3718.9', + '74.0.3702.3', + '74.0.3721.3', + '74.0.3721.2', + '74.0.3721.1', + '74.0.3721.0', + '74.0.3720.6', + '73.0.3683.61', + '72.0.3626.122', + '73.0.3683.60', + '74.0.3720.5', + '72.0.3626.121', + '74.0.3718.8', + '74.0.3720.4', + '74.0.3720.3', + '74.0.3718.7', + '74.0.3720.2', + '74.0.3720.1', + '74.0.3720.0', + '74.0.3718.6', + '74.0.3719.5', + '73.0.3683.59', + '74.0.3718.5', + '74.0.3718.4', + '74.0.3719.4', + '74.0.3719.3', + '74.0.3719.2', + '74.0.3719.1', + '73.0.3683.58', + '74.0.3719.0', + '73.0.3683.57', + '73.0.3683.56', + '74.0.3718.3', + '73.0.3683.55', + '74.0.3718.2', + '74.0.3718.1', + '74.0.3718.0', + '73.0.3683.54', + '74.0.3717.2', + '73.0.3683.53', + '74.0.3717.1', + '74.0.3717.0', + '73.0.3683.52', + '74.0.3716.1', + '74.0.3716.0', + '73.0.3683.51', + '74.0.3715.1', + '74.0.3715.0', + '73.0.3683.50', + '74.0.3711.2', + '74.0.3714.2', + '74.0.3713.3', + '74.0.3714.1', + '74.0.3714.0', + '73.0.3683.49', + '74.0.3713.1', + '74.0.3713.0', + '72.0.3626.120', + '73.0.3683.48', + '74.0.3712.2', + '74.0.3712.1', + '74.0.3712.0', + '73.0.3683.47', + '72.0.3626.119', + '73.0.3683.46', + '74.0.3710.2', + '72.0.3626.118', + '74.0.3711.1', + '74.0.3711.0', + '73.0.3683.45', + '72.0.3626.117', + '74.0.3710.1', + '74.0.3710.0', + '73.0.3683.44', + '72.0.3626.116', + '74.0.3709.1', + '74.0.3709.0', + '74.0.3704.9', + '73.0.3683.43', + '72.0.3626.115', + '74.0.3704.8', + '74.0.3704.7', + '74.0.3708.0', + '74.0.3706.7', + '74.0.3704.6', + '73.0.3683.42', + '72.0.3626.114', + '74.0.3706.6', + '72.0.3626.113', + '74.0.3704.5', + '74.0.3706.5', + '74.0.3706.4', + '74.0.3706.3', + '74.0.3706.2', + '74.0.3706.1', + '74.0.3706.0', + '73.0.3683.41', + '72.0.3626.112', + '74.0.3705.1', + '74.0.3705.0', + '73.0.3683.40', + '72.0.3626.111', + '73.0.3683.39', + '74.0.3704.4', + '73.0.3683.38', + '74.0.3704.3', + '74.0.3704.2', + '74.0.3704.1', + '74.0.3704.0', + '73.0.3683.37', + '72.0.3626.110', + '72.0.3626.109', + '74.0.3703.3', + '74.0.3703.2', + '73.0.3683.36', + '74.0.3703.1', + '74.0.3703.0', + '73.0.3683.35', + '72.0.3626.108', + '74.0.3702.2', + '74.0.3699.3', + '74.0.3702.1', + '74.0.3702.0', + '73.0.3683.34', + '72.0.3626.107', + '73.0.3683.33', + '74.0.3701.1', + '74.0.3701.0', + '73.0.3683.32', + '73.0.3683.31', + '72.0.3626.105', + '74.0.3700.1', + '74.0.3700.0', + '73.0.3683.29', + '72.0.3626.103', + '74.0.3699.2', + '74.0.3699.1', + '74.0.3699.0', + '73.0.3683.28', + '72.0.3626.102', + '73.0.3683.27', + '73.0.3683.26', + '74.0.3698.0', + '74.0.3696.2', + '72.0.3626.101', + '73.0.3683.25', + '74.0.3696.1', + '74.0.3696.0', + '74.0.3694.8', + '72.0.3626.100', + '74.0.3694.7', + '74.0.3694.6', + '74.0.3694.5', + '74.0.3694.4', + '72.0.3626.99', + '72.0.3626.98', + '74.0.3694.3', + '73.0.3683.24', + '72.0.3626.97', + '72.0.3626.96', + '72.0.3626.95', + '73.0.3683.23', + '72.0.3626.94', + '73.0.3683.22', + '73.0.3683.21', + '72.0.3626.93', + '74.0.3694.2', + '72.0.3626.92', + '74.0.3694.1', + '74.0.3694.0', + '74.0.3693.6', + '73.0.3683.20', + '72.0.3626.91', + '74.0.3693.5', + '74.0.3693.4', + '74.0.3693.3', + '74.0.3693.2', + '73.0.3683.19', + '74.0.3693.1', + '74.0.3693.0', + '73.0.3683.18', + '72.0.3626.90', + '74.0.3692.1', + '74.0.3692.0', + '73.0.3683.17', + '72.0.3626.89', + '74.0.3687.3', + '74.0.3691.1', + '74.0.3691.0', + '73.0.3683.16', + '72.0.3626.88', + '72.0.3626.87', + '73.0.3683.15', + '74.0.3690.1', + '74.0.3690.0', + '73.0.3683.14', + '72.0.3626.86', + '73.0.3683.13', + '73.0.3683.12', + '74.0.3689.1', + '74.0.3689.0', + '73.0.3683.11', + '72.0.3626.85', + '73.0.3683.10', + '72.0.3626.84', + '73.0.3683.9', + '74.0.3688.1', + '74.0.3688.0', + '73.0.3683.8', + '72.0.3626.83', + '74.0.3687.2', + '74.0.3687.1', + '74.0.3687.0', + '73.0.3683.7', + '72.0.3626.82', + '74.0.3686.4', + '72.0.3626.81', + '74.0.3686.3', + '74.0.3686.2', + '74.0.3686.1', + '74.0.3686.0', + '73.0.3683.6', + '72.0.3626.80', + '74.0.3685.1', + '74.0.3685.0', + '73.0.3683.5', + '72.0.3626.79', + '74.0.3684.1', + '74.0.3684.0', + '73.0.3683.4', + '72.0.3626.78', + '72.0.3626.77', + '73.0.3683.3', + '73.0.3683.2', + '72.0.3626.76', + '73.0.3683.1', + '73.0.3683.0', + '72.0.3626.75', + '71.0.3578.141', + '73.0.3682.1', + '73.0.3682.0', + '72.0.3626.74', + '71.0.3578.140', + '73.0.3681.4', + '73.0.3681.3', + '73.0.3681.2', + '73.0.3681.1', + '73.0.3681.0', + '72.0.3626.73', + '71.0.3578.139', + '72.0.3626.72', + '72.0.3626.71', + '73.0.3680.1', + '73.0.3680.0', + '72.0.3626.70', + '71.0.3578.138', + '73.0.3678.2', + '73.0.3679.1', + '73.0.3679.0', + '72.0.3626.69', + '71.0.3578.137', + '73.0.3678.1', + '73.0.3678.0', + '71.0.3578.136', + '73.0.3677.1', + '73.0.3677.0', + '72.0.3626.68', + '72.0.3626.67', + '71.0.3578.135', + '73.0.3676.1', + '73.0.3676.0', + '73.0.3674.2', + '72.0.3626.66', + '71.0.3578.134', + '73.0.3674.1', + '73.0.3674.0', + '72.0.3626.65', + '71.0.3578.133', + '73.0.3673.2', + '73.0.3673.1', + '73.0.3673.0', + '72.0.3626.64', + '71.0.3578.132', + '72.0.3626.63', + '72.0.3626.62', + '72.0.3626.61', + '72.0.3626.60', + '73.0.3672.1', + '73.0.3672.0', + '72.0.3626.59', + '71.0.3578.131', + '73.0.3671.3', + '73.0.3671.2', + '73.0.3671.1', + '73.0.3671.0', + '72.0.3626.58', + '71.0.3578.130', + '73.0.3670.1', + '73.0.3670.0', + '72.0.3626.57', + '71.0.3578.129', + '73.0.3669.1', + '73.0.3669.0', + '72.0.3626.56', + '71.0.3578.128', + '73.0.3668.2', + '73.0.3668.1', + '73.0.3668.0', + '72.0.3626.55', + '71.0.3578.127', + '73.0.3667.2', + '73.0.3667.1', + '73.0.3667.0', + '72.0.3626.54', + '71.0.3578.126', + '73.0.3666.1', + '73.0.3666.0', + '72.0.3626.53', + '71.0.3578.125', + '73.0.3665.4', + '73.0.3665.3', + '72.0.3626.52', + '73.0.3665.2', + '73.0.3664.4', + '73.0.3665.1', + '73.0.3665.0', + '72.0.3626.51', + '71.0.3578.124', + '72.0.3626.50', + '73.0.3664.3', + '73.0.3664.2', + '73.0.3664.1', + '73.0.3664.0', + '73.0.3663.2', + '72.0.3626.49', + '71.0.3578.123', + '73.0.3663.1', + '73.0.3663.0', + '72.0.3626.48', + '71.0.3578.122', + '73.0.3662.1', + '73.0.3662.0', + '72.0.3626.47', + '71.0.3578.121', + '73.0.3661.1', + '72.0.3626.46', + '73.0.3661.0', + '72.0.3626.45', + '71.0.3578.120', + '73.0.3660.2', + '73.0.3660.1', + '73.0.3660.0', + '72.0.3626.44', + '71.0.3578.119', + '73.0.3659.1', + '73.0.3659.0', + '72.0.3626.43', + '71.0.3578.118', + '73.0.3658.1', + '73.0.3658.0', + '72.0.3626.42', + '71.0.3578.117', + '73.0.3657.1', + '73.0.3657.0', + '72.0.3626.41', + '71.0.3578.116', + '73.0.3656.1', + '73.0.3656.0', + '72.0.3626.40', + '71.0.3578.115', + '73.0.3655.1', + '73.0.3655.0', + '72.0.3626.39', + '71.0.3578.114', + '73.0.3654.1', + '73.0.3654.0', + '72.0.3626.38', + '71.0.3578.113', + '73.0.3653.1', + '73.0.3653.0', + '72.0.3626.37', + '71.0.3578.112', + '73.0.3652.1', + '73.0.3652.0', + '72.0.3626.36', + '71.0.3578.111', + '73.0.3651.1', + '73.0.3651.0', + '72.0.3626.35', + '71.0.3578.110', + '73.0.3650.1', + '73.0.3650.0', + '72.0.3626.34', + '71.0.3578.109', + '73.0.3649.1', + '73.0.3649.0', + '72.0.3626.33', + '71.0.3578.108', + '73.0.3648.2', + '73.0.3648.1', + '73.0.3648.0', + '72.0.3626.32', + '71.0.3578.107', + '73.0.3647.2', + '73.0.3647.1', + '73.0.3647.0', + '72.0.3626.31', + '71.0.3578.106', + '73.0.3635.3', + '73.0.3646.2', + '73.0.3646.1', + '73.0.3646.0', + '72.0.3626.30', + '71.0.3578.105', + '72.0.3626.29', + '73.0.3645.2', + '73.0.3645.1', + '73.0.3645.0', + '72.0.3626.28', + '71.0.3578.104', + '72.0.3626.27', + '72.0.3626.26', + '72.0.3626.25', + '72.0.3626.24', + '73.0.3644.0', + '73.0.3643.2', + '72.0.3626.23', + '71.0.3578.103', + '73.0.3643.1', + '73.0.3643.0', + '72.0.3626.22', + '71.0.3578.102', + '73.0.3642.1', + '73.0.3642.0', + '72.0.3626.21', + '71.0.3578.101', + '73.0.3641.1', + '73.0.3641.0', + '72.0.3626.20', + '71.0.3578.100', + '72.0.3626.19', + '73.0.3640.1', + '73.0.3640.0', + '72.0.3626.18', + '73.0.3639.1', + '71.0.3578.99', + '73.0.3639.0', + '72.0.3626.17', + '73.0.3638.2', + '72.0.3626.16', + '73.0.3638.1', + '73.0.3638.0', + '72.0.3626.15', + '71.0.3578.98', + '73.0.3635.2', + '71.0.3578.97', + '73.0.3637.1', + '73.0.3637.0', + '72.0.3626.14', + '71.0.3578.96', + '71.0.3578.95', + '72.0.3626.13', + '71.0.3578.94', + '73.0.3636.2', + '71.0.3578.93', + '73.0.3636.1', + '73.0.3636.0', + '72.0.3626.12', + '71.0.3578.92', + '73.0.3635.1', + '73.0.3635.0', + '72.0.3626.11', + '71.0.3578.91', + '73.0.3634.2', + '73.0.3634.1', + '73.0.3634.0', + '72.0.3626.10', + '71.0.3578.90', + '71.0.3578.89', + '73.0.3633.2', + '73.0.3633.1', + '73.0.3633.0', + '72.0.3610.4', + '72.0.3626.9', + '71.0.3578.88', + '73.0.3632.5', + '73.0.3632.4', + '73.0.3632.3', + '73.0.3632.2', + '73.0.3632.1', + '73.0.3632.0', + '72.0.3626.8', + '71.0.3578.87', + '73.0.3631.2', + '73.0.3631.1', + '73.0.3631.0', + '72.0.3626.7', + '71.0.3578.86', + '72.0.3626.6', + '73.0.3630.1', + '73.0.3630.0', + '72.0.3626.5', + '71.0.3578.85', + '72.0.3626.4', + '73.0.3628.3', + '73.0.3628.2', + '73.0.3629.1', + '73.0.3629.0', + '72.0.3626.3', + '71.0.3578.84', + '73.0.3628.1', + '73.0.3628.0', + '71.0.3578.83', + '73.0.3627.1', + '73.0.3627.0', + '72.0.3626.2', + '71.0.3578.82', + '71.0.3578.81', + '71.0.3578.80', + '72.0.3626.1', + '72.0.3626.0', + '71.0.3578.79', + '70.0.3538.124', + '71.0.3578.78', + '72.0.3623.4', + '72.0.3625.2', + '72.0.3625.1', + '72.0.3625.0', + '71.0.3578.77', + '70.0.3538.123', + '72.0.3624.4', + '72.0.3624.3', + '72.0.3624.2', + '71.0.3578.76', + '72.0.3624.1', + '72.0.3624.0', + '72.0.3623.3', + '71.0.3578.75', + '70.0.3538.122', + '71.0.3578.74', + '72.0.3623.2', + '72.0.3610.3', + '72.0.3623.1', + '72.0.3623.0', + '72.0.3622.3', + '72.0.3622.2', + '71.0.3578.73', + '70.0.3538.121', + '72.0.3622.1', + '72.0.3622.0', + '71.0.3578.72', + '70.0.3538.120', + '72.0.3621.1', + '72.0.3621.0', + '71.0.3578.71', + '70.0.3538.119', + '72.0.3620.1', + '72.0.3620.0', + '71.0.3578.70', + '70.0.3538.118', + '71.0.3578.69', + '72.0.3619.1', + '72.0.3619.0', + '71.0.3578.68', + '70.0.3538.117', + '71.0.3578.67', + '72.0.3618.1', + '72.0.3618.0', + '71.0.3578.66', + '70.0.3538.116', + '72.0.3617.1', + '72.0.3617.0', + '71.0.3578.65', + '70.0.3538.115', + '72.0.3602.3', + '71.0.3578.64', + '72.0.3616.1', + '72.0.3616.0', + '71.0.3578.63', + '70.0.3538.114', + '71.0.3578.62', + '72.0.3615.1', + '72.0.3615.0', + '71.0.3578.61', + '70.0.3538.113', + '72.0.3614.1', + '72.0.3614.0', + '71.0.3578.60', + '70.0.3538.112', + '72.0.3613.1', + '72.0.3613.0', + '71.0.3578.59', + '70.0.3538.111', + '72.0.3612.2', + '72.0.3612.1', + '72.0.3612.0', + '70.0.3538.110', + '71.0.3578.58', + '70.0.3538.109', + '72.0.3611.2', + '72.0.3611.1', + '72.0.3611.0', + '71.0.3578.57', + '70.0.3538.108', + '72.0.3610.2', + '71.0.3578.56', + '71.0.3578.55', + '72.0.3610.1', + '72.0.3610.0', + '71.0.3578.54', + '70.0.3538.107', + '71.0.3578.53', + '72.0.3609.3', + '71.0.3578.52', + '72.0.3609.2', + '71.0.3578.51', + '72.0.3608.5', + '72.0.3609.1', + '72.0.3609.0', + '71.0.3578.50', + '70.0.3538.106', + '72.0.3608.4', + '72.0.3608.3', + '72.0.3608.2', + '71.0.3578.49', + '72.0.3608.1', + '72.0.3608.0', + '70.0.3538.105', + '71.0.3578.48', + '72.0.3607.1', + '72.0.3607.0', + '71.0.3578.47', + '70.0.3538.104', + '72.0.3606.2', + '72.0.3606.1', + '72.0.3606.0', + '71.0.3578.46', + '70.0.3538.103', + '70.0.3538.102', + '72.0.3605.3', + '72.0.3605.2', + '72.0.3605.1', + '72.0.3605.0', + '71.0.3578.45', + '70.0.3538.101', + '71.0.3578.44', + '71.0.3578.43', + '70.0.3538.100', + '70.0.3538.99', + '71.0.3578.42', + '72.0.3604.1', + '72.0.3604.0', + '71.0.3578.41', + '70.0.3538.98', + '71.0.3578.40', + '72.0.3603.2', + '72.0.3603.1', + '72.0.3603.0', + '71.0.3578.39', + '70.0.3538.97', + '72.0.3602.2', + '71.0.3578.38', + '71.0.3578.37', + '72.0.3602.1', + '72.0.3602.0', + '71.0.3578.36', + '70.0.3538.96', + '72.0.3601.1', + '72.0.3601.0', + '71.0.3578.35', + '70.0.3538.95', + '72.0.3600.1', + '72.0.3600.0', + '71.0.3578.34', + '70.0.3538.94', + '72.0.3599.3', + '72.0.3599.2', + '72.0.3599.1', + '72.0.3599.0', + '71.0.3578.33', + '70.0.3538.93', + '72.0.3598.1', + '72.0.3598.0', + '71.0.3578.32', + '70.0.3538.87', + '72.0.3597.1', + '72.0.3597.0', + '72.0.3596.2', + '71.0.3578.31', + '70.0.3538.86', + '71.0.3578.30', + '71.0.3578.29', + '72.0.3596.1', + '72.0.3596.0', + '71.0.3578.28', + '70.0.3538.85', + '72.0.3595.2', + '72.0.3591.3', + '72.0.3595.1', + '72.0.3595.0', + '71.0.3578.27', + '70.0.3538.84', + '72.0.3594.1', + '72.0.3594.0', + '71.0.3578.26', + '70.0.3538.83', + '72.0.3593.2', + '72.0.3593.1', + '72.0.3593.0', + '71.0.3578.25', + '70.0.3538.82', + '72.0.3589.3', + '72.0.3592.2', + '72.0.3592.1', + '72.0.3592.0', + '71.0.3578.24', + '72.0.3589.2', + '70.0.3538.81', + '70.0.3538.80', + '72.0.3591.2', + '72.0.3591.1', + '72.0.3591.0', + '71.0.3578.23', + '70.0.3538.79', + '71.0.3578.22', + '72.0.3590.1', + '72.0.3590.0', + '71.0.3578.21', + '70.0.3538.78', + '70.0.3538.77', + '72.0.3589.1', + '72.0.3589.0', + '71.0.3578.20', + '70.0.3538.76', + '71.0.3578.19', + '70.0.3538.75', + '72.0.3588.1', + '72.0.3588.0', + '71.0.3578.18', + '70.0.3538.74', + '72.0.3586.2', + '72.0.3587.0', + '71.0.3578.17', + '70.0.3538.73', + '72.0.3586.1', + '72.0.3586.0', + '71.0.3578.16', + '70.0.3538.72', + '72.0.3585.1', + '72.0.3585.0', + '71.0.3578.15', + '70.0.3538.71', + '71.0.3578.14', + '72.0.3584.1', + '72.0.3584.0', + '71.0.3578.13', + '70.0.3538.70', + '72.0.3583.2', + '71.0.3578.12', + '72.0.3583.1', + '72.0.3583.0', + '71.0.3578.11', + '70.0.3538.69', + '71.0.3578.10', + '72.0.3582.0', + '72.0.3581.4', + '71.0.3578.9', + '70.0.3538.67', + '72.0.3581.3', + '72.0.3581.2', + '72.0.3581.1', + '72.0.3581.0', + '71.0.3578.8', + '70.0.3538.66', + '72.0.3580.1', + '72.0.3580.0', + '71.0.3578.7', + '70.0.3538.65', + '71.0.3578.6', + '72.0.3579.1', + '72.0.3579.0', + '71.0.3578.5', + '70.0.3538.64', + '71.0.3578.4', + '71.0.3578.3', + '71.0.3578.2', + '71.0.3578.1', + '71.0.3578.0', + '70.0.3538.63', + '69.0.3497.128', + '70.0.3538.62', + '70.0.3538.61', + '70.0.3538.60', + '70.0.3538.59', + '71.0.3577.1', + '71.0.3577.0', + '70.0.3538.58', + '69.0.3497.127', + '71.0.3576.2', + '71.0.3576.1', + '71.0.3576.0', + '70.0.3538.57', + '70.0.3538.56', + '71.0.3575.2', + '70.0.3538.55', + '69.0.3497.126', + '70.0.3538.54', + '71.0.3575.1', + '71.0.3575.0', + '71.0.3574.1', + '71.0.3574.0', + '70.0.3538.53', + '69.0.3497.125', + '70.0.3538.52', + '71.0.3573.1', + '71.0.3573.0', + '70.0.3538.51', + '69.0.3497.124', + '71.0.3572.1', + '71.0.3572.0', + '70.0.3538.50', + '69.0.3497.123', + '71.0.3571.2', + '70.0.3538.49', + '69.0.3497.122', + '71.0.3571.1', + '71.0.3571.0', + '70.0.3538.48', + '69.0.3497.121', + '71.0.3570.1', + '71.0.3570.0', + '70.0.3538.47', + '69.0.3497.120', + '71.0.3568.2', + '71.0.3569.1', + '71.0.3569.0', + '70.0.3538.46', + '69.0.3497.119', + '70.0.3538.45', + '71.0.3568.1', + '71.0.3568.0', + '70.0.3538.44', + '69.0.3497.118', + '70.0.3538.43', + '70.0.3538.42', + '71.0.3567.1', + '71.0.3567.0', + '70.0.3538.41', + '69.0.3497.117', + '71.0.3566.1', + '71.0.3566.0', + '70.0.3538.40', + '69.0.3497.116', + '71.0.3565.1', + '71.0.3565.0', + '70.0.3538.39', + '69.0.3497.115', + '71.0.3564.1', + '71.0.3564.0', + '70.0.3538.38', + '69.0.3497.114', + '71.0.3563.0', + '71.0.3562.2', + '70.0.3538.37', + '69.0.3497.113', + '70.0.3538.36', + '70.0.3538.35', + '71.0.3562.1', + '71.0.3562.0', + '70.0.3538.34', + '69.0.3497.112', + '70.0.3538.33', + '71.0.3561.1', + '71.0.3561.0', + '70.0.3538.32', + '69.0.3497.111', + '71.0.3559.6', + '71.0.3560.1', + '71.0.3560.0', + '71.0.3559.5', + '71.0.3559.4', + '70.0.3538.31', + '69.0.3497.110', + '71.0.3559.3', + '70.0.3538.30', + '69.0.3497.109', + '71.0.3559.2', + '71.0.3559.1', + '71.0.3559.0', + '70.0.3538.29', + '69.0.3497.108', + '71.0.3558.2', + '71.0.3558.1', + '71.0.3558.0', + '70.0.3538.28', + '69.0.3497.107', + '71.0.3557.2', + '71.0.3557.1', + '71.0.3557.0', + '70.0.3538.27', + '69.0.3497.106', + '71.0.3554.4', + '70.0.3538.26', + '71.0.3556.1', + '71.0.3556.0', + '70.0.3538.25', + '71.0.3554.3', + '69.0.3497.105', + '71.0.3554.2', + '70.0.3538.24', + '69.0.3497.104', + '71.0.3555.2', + '70.0.3538.23', + '71.0.3555.1', + '71.0.3555.0', + '70.0.3538.22', + '69.0.3497.103', + '71.0.3554.1', + '71.0.3554.0', + '70.0.3538.21', + '69.0.3497.102', + '71.0.3553.3', + '70.0.3538.20', + '69.0.3497.101', + '71.0.3553.2', + '69.0.3497.100', + '71.0.3553.1', + '71.0.3553.0', + '70.0.3538.19', + '69.0.3497.99', + '69.0.3497.98', + '69.0.3497.97', + '71.0.3552.6', + '71.0.3552.5', + '71.0.3552.4', + '71.0.3552.3', + '71.0.3552.2', + '71.0.3552.1', + '71.0.3552.0', + '70.0.3538.18', + '69.0.3497.96', + '71.0.3551.3', + '71.0.3551.2', + '71.0.3551.1', + '71.0.3551.0', + '70.0.3538.17', + '69.0.3497.95', + '71.0.3550.3', + '71.0.3550.2', + '71.0.3550.1', + '71.0.3550.0', + '70.0.3538.16', + '69.0.3497.94', + '71.0.3549.1', + '71.0.3549.0', + '70.0.3538.15', + '69.0.3497.93', + '69.0.3497.92', + '71.0.3548.1', + '71.0.3548.0', + '70.0.3538.14', + '69.0.3497.91', + '71.0.3547.1', + '71.0.3547.0', + '70.0.3538.13', + '69.0.3497.90', + '71.0.3546.2', + '69.0.3497.89', + '71.0.3546.1', + '71.0.3546.0', + '70.0.3538.12', + '69.0.3497.88', + '71.0.3545.4', + '71.0.3545.3', + '71.0.3545.2', + '71.0.3545.1', + '71.0.3545.0', + '70.0.3538.11', + '69.0.3497.87', + '71.0.3544.5', + '71.0.3544.4', + '71.0.3544.3', + '71.0.3544.2', + '71.0.3544.1', + '71.0.3544.0', + '69.0.3497.86', + '70.0.3538.10', + '69.0.3497.85', + '70.0.3538.9', + '69.0.3497.84', + '71.0.3543.4', + '70.0.3538.8', + '71.0.3543.3', + '71.0.3543.2', + '71.0.3543.1', + '71.0.3543.0', + '70.0.3538.7', + '69.0.3497.83', + '71.0.3542.2', + '71.0.3542.1', + '71.0.3542.0', + '70.0.3538.6', + '69.0.3497.82', + '69.0.3497.81', + '71.0.3541.1', + '71.0.3541.0', + '70.0.3538.5', + '69.0.3497.80', + '71.0.3540.1', + '71.0.3540.0', + '70.0.3538.4', + '69.0.3497.79', + '70.0.3538.3', + '71.0.3539.1', + '71.0.3539.0', + '69.0.3497.78', + '68.0.3440.134', + '69.0.3497.77', + '70.0.3538.2', + '70.0.3538.1', + '70.0.3538.0', + '69.0.3497.76', + '68.0.3440.133', + '69.0.3497.75', + '70.0.3537.2', + '70.0.3537.1', + '70.0.3537.0', + '69.0.3497.74', + '68.0.3440.132', + '70.0.3536.0', + '70.0.3535.5', + '70.0.3535.4', + '70.0.3535.3', + '69.0.3497.73', + '68.0.3440.131', + '70.0.3532.8', + '70.0.3532.7', + '69.0.3497.72', + '69.0.3497.71', + '70.0.3535.2', + '70.0.3535.1', + '70.0.3535.0', + '69.0.3497.70', + '68.0.3440.130', + '69.0.3497.69', + '68.0.3440.129', + '70.0.3534.4', + '70.0.3534.3', + '70.0.3534.2', + '70.0.3534.1', + '70.0.3534.0', + '69.0.3497.68', + '68.0.3440.128', + '70.0.3533.2', + '70.0.3533.1', + '70.0.3533.0', + '69.0.3497.67', + '68.0.3440.127', + '70.0.3532.6', + '70.0.3532.5', + '70.0.3532.4', + '69.0.3497.66', + '68.0.3440.126', + '70.0.3532.3', + '70.0.3532.2', + '70.0.3532.1', + '69.0.3497.60', + '69.0.3497.65', + '69.0.3497.64', + '70.0.3532.0', + '70.0.3531.0', + '70.0.3530.4', + '70.0.3530.3', + '70.0.3530.2', + '69.0.3497.58', + '68.0.3440.125', + '69.0.3497.57', + '69.0.3497.56', + '69.0.3497.55', + '69.0.3497.54', + '70.0.3530.1', + '70.0.3530.0', + '69.0.3497.53', + '68.0.3440.124', + '69.0.3497.52', + '70.0.3529.3', + '70.0.3529.2', + '70.0.3529.1', + '70.0.3529.0', + '69.0.3497.51', + '70.0.3528.4', + '68.0.3440.123', + '70.0.3528.3', + '70.0.3528.2', + '70.0.3528.1', + '70.0.3528.0', + '69.0.3497.50', + '68.0.3440.122', + '70.0.3527.1', + '70.0.3527.0', + '69.0.3497.49', + '68.0.3440.121', + '70.0.3526.1', + '70.0.3526.0', + '68.0.3440.120', + '69.0.3497.48', + '69.0.3497.47', + '68.0.3440.119', + '68.0.3440.118', + '70.0.3525.5', + '70.0.3525.4', + '70.0.3525.3', + '68.0.3440.117', + '69.0.3497.46', + '70.0.3525.2', + '70.0.3525.1', + '70.0.3525.0', + '69.0.3497.45', + '68.0.3440.116', + '70.0.3524.4', + '70.0.3524.3', + '69.0.3497.44', + '70.0.3524.2', + '70.0.3524.1', + '70.0.3524.0', + '70.0.3523.2', + '69.0.3497.43', + '68.0.3440.115', + '70.0.3505.9', + '69.0.3497.42', + '70.0.3505.8', + '70.0.3523.1', + '70.0.3523.0', + '69.0.3497.41', + '68.0.3440.114', + '70.0.3505.7', + '69.0.3497.40', + '70.0.3522.1', + '70.0.3522.0', + '70.0.3521.2', + '69.0.3497.39', + '68.0.3440.113', + '70.0.3505.6', + '70.0.3521.1', + '70.0.3521.0', + '69.0.3497.38', + '68.0.3440.112', + '70.0.3520.1', + '70.0.3520.0', + '69.0.3497.37', + '68.0.3440.111', + '70.0.3519.3', + '70.0.3519.2', + '70.0.3519.1', + '70.0.3519.0', + '69.0.3497.36', + '68.0.3440.110', + '70.0.3518.1', + '70.0.3518.0', + '69.0.3497.35', + '69.0.3497.34', + '68.0.3440.109', + '70.0.3517.1', + '70.0.3517.0', + '69.0.3497.33', + '68.0.3440.108', + '69.0.3497.32', + '70.0.3516.3', + '70.0.3516.2', + '70.0.3516.1', + '70.0.3516.0', + '69.0.3497.31', + '68.0.3440.107', + '70.0.3515.4', + '68.0.3440.106', + '70.0.3515.3', + '70.0.3515.2', + '70.0.3515.1', + '70.0.3515.0', + '69.0.3497.30', + '68.0.3440.105', + '68.0.3440.104', + '70.0.3514.2', + '70.0.3514.1', + '70.0.3514.0', + '69.0.3497.29', + '68.0.3440.103', + '70.0.3513.1', + '70.0.3513.0', + '69.0.3497.28', + ) + return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS) + + std_headers = { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)', + 'User-Agent': random_user_agent(), 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', @@ -121,8 +1709,8 @@ KNOWN_EXTENSIONS = ( # needed for sanitizing filenames in restricted mode ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ', - itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'], - 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy'))) + itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'], + 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y'))) DATE_FORMATS = ( '%d %B %Y', @@ -156,6 +1744,8 @@ DATE_FORMATS = ( '%Y-%m-%dT%H:%M', '%b %d %Y at %H:%M', '%b %d %Y at %H:%M:%S', + '%B %d %Y at %H:%M', + '%B %d %Y at %H:%M:%S', ) DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS) @@ -178,6 +1768,7 @@ DATE_FORMATS_MONTH_FIRST.extend([ ]) PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)" +JSON_LD_RE = r'(?is)]+type=(["\']?)application/ld\+json\1[^>]*>(?P.+?)' def preferredencoding(): @@ -337,34 +1928,46 @@ def get_element_by_id(id, html): def get_element_by_class(class_name, html): - return get_element_by_attribute( + """Return the content of the first tag with the specified class in the passed HTML document""" + retval = get_elements_by_class(class_name, html) + return retval[0] if retval else None + + +def get_element_by_attribute(attribute, value, html, escape_value=True): + retval = get_elements_by_attribute(attribute, value, html, escape_value) + return retval[0] if retval else None + + +def get_elements_by_class(class_name, html): + """Return the content of all tags with the specified class in the passed HTML document as a list""" + return get_elements_by_attribute( 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name), html, escape_value=False) -def get_element_by_attribute(attribute, value, html, escape_value=True): +def get_elements_by_attribute(attribute, value, html, escape_value=True): """Return the content of the tag with the specified attribute in the passed HTML document""" value = re.escape(value) if escape_value else value - m = re.search(r'''(?xs) + retlist = [] + for m in re.finditer(r'''(?xs) <([a-zA-Z0-9:._-]+) - (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*? + (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*? \s+%s=['"]?%s['"]? - (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*? + (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*? \s*> (?P.*?) - ''' % (re.escape(attribute), value), html) + ''' % (re.escape(attribute), value), html): + res = m.group('content') - if not m: - return None - res = m.group('content') + if res.startswith('"') or res.startswith("'"): + res = res[1:-1] - if res.startswith('"') or res.startswith("'"): - res = res[1:-1] + retlist.append(unescapeHTML(res)) - return unescapeHTML(res) + return retlist class HTMLAttributeParser(compat_HTMLParser): @@ -394,8 +1997,12 @@ def extract_attributes(html_element): but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5. """ parser = HTMLAttributeParser() - parser.feed(html_element) - parser.close() + try: + parser.feed(html_element) + parser.close() + # Older Python may throw HTMLParseError in case of malformed HTML + except compat_HTMLParseError: + pass return parser.attrs @@ -407,8 +2014,8 @@ def clean_html(html): # Newline vs
html = html.replace('\n', ' ') - html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html) - html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html) + html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html) + html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html) # Strip html tags html = re.sub('<.*?>', '', html) # Replace html entities @@ -460,7 +2067,8 @@ def timeconvert(timestr): def sanitize_filename(s, restricted=False, is_id=False): """Sanitizes a string so it could be used as part of a filename. If restricted is set, use a stricter subset of allowed characters. - Set is_id if this is not an arbitrary string, but an ID that should be kept if possible + Set is_id if this is not an arbitrary string, but an ID that should be kept + if possible. """ def replace_insane(char): if restricted and char in ACCENT_CHARS: @@ -515,16 +2123,33 @@ def sanitize_path(s): return os.path.join(*sanitized_path) -# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of -# unwanted failures due to missing protocol def sanitize_url(url): - return 'http:%s' % url if url.startswith('//') else url + # Prepend protocol-less URLs with `http:` scheme in order to mitigate + # the number of unwanted failures due to missing protocol + if url.startswith('//'): + return 'http:%s' % url + # Fix some common typos seen so far + COMMON_TYPOS = ( + # https://github.com/ytdl-org/youtube-dl/issues/15649 + (r'^httpss://', r'https://'), + # https://bx1.be/lives/direct-tv/ + (r'^rmtp([es]?)://', r'rtmp\1://'), + ) + for mistake, fixup in COMMON_TYPOS: + if re.match(mistake, url): + return re.sub(mistake, fixup, url) + return url def sanitized_Request(url, *args, **kwargs): return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs) +def expand_path(s): + """Expand shell variables and ~""" + return os.path.expandvars(compat_expanduser(s)) + + def orderedSet(iterable): """ Remove all duplicates from the input iterable """ res = [] @@ -555,7 +2180,7 @@ def _htmlentity_transform(entity_with_semicolon): numstr = '0%s' % numstr else: base = 10 - # See https://github.com/rg3/youtube-dl/issues/7518 + # See https://github.com/ytdl-org/youtube-dl/issues/7518 try: return compat_chr(int(numstr, base)) except ValueError: @@ -571,7 +2196,7 @@ def unescapeHTML(s): assert type(s) == compat_str return re.sub( - r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s) + r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s) def get_subprocess_encoding(): @@ -689,7 +2314,12 @@ def bug_reports_message(): return msg -class ExtractorError(Exception): +class YoutubeDLError(Exception): + """Base exception for YoutubeDL errors.""" + pass + + +class ExtractorError(YoutubeDLError): """Error during info extraction.""" def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None): @@ -730,7 +2360,19 @@ class RegexNotFoundError(ExtractorError): pass -class DownloadError(Exception): +class GeoRestrictedError(ExtractorError): + """Geographic restriction Error exception. + + This exception may be thrown when a video is not available from your + geographic location due to geographic restrictions imposed by a website. + """ + def __init__(self, msg, countries=None): + super(GeoRestrictedError, self).__init__(msg, expected=True) + self.msg = msg + self.countries = countries + + +class DownloadError(YoutubeDLError): """Download Error exception. This exception may be thrown by FileDownloader objects if they are not @@ -744,7 +2386,7 @@ class DownloadError(Exception): self.exc_info = exc_info -class SameFileError(Exception): +class SameFileError(YoutubeDLError): """Same File exception. This exception will be thrown by FileDownloader objects if they detect @@ -753,7 +2395,7 @@ class SameFileError(Exception): pass -class PostProcessingError(Exception): +class PostProcessingError(YoutubeDLError): """Post Processing exception. This exception may be raised by PostProcessor's .run() method to @@ -761,15 +2403,16 @@ class PostProcessingError(Exception): """ def __init__(self, msg): + super(PostProcessingError, self).__init__(msg) self.msg = msg -class MaxDownloadsReached(Exception): +class MaxDownloadsReached(YoutubeDLError): """ --max-downloads limit has been reached. """ pass -class UnavailableVideoError(Exception): +class UnavailableVideoError(YoutubeDLError): """Unavailable Format exception. This exception will be thrown when a video is requested @@ -778,7 +2421,7 @@ class UnavailableVideoError(Exception): pass -class ContentTooShortError(Exception): +class ContentTooShortError(YoutubeDLError): """Content Too Short exception. This exception may be raised by FileDownloader objects when a file they @@ -787,20 +2430,23 @@ class ContentTooShortError(Exception): """ def __init__(self, downloaded, expected): + super(ContentTooShortError, self).__init__( + 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected) + ) # Both in bytes self.downloaded = downloaded self.expected = expected -class XAttrMetadataError(Exception): +class XAttrMetadataError(YoutubeDLError): def __init__(self, code=None, msg='Unknown error'): super(XAttrMetadataError, self).__init__(msg) self.code = code self.msg = msg # Parsing code and msg - if (self.code in (errno.ENOSPC, errno.EDQUOT) or - 'No space left' in self.msg or 'Disk quota excedded' in self.msg): + if (self.code in (errno.ENOSPC, errno.EDQUOT) + or 'No space left' in self.msg or 'Disk quota excedded' in self.msg): self.reason = 'NO_SPACE' elif self.code == errno.E2BIG or 'Argument list too long' in self.msg: self.reason = 'VALUE_TOO_LONG' @@ -808,25 +2454,63 @@ class XAttrMetadataError(Exception): self.reason = 'NOT_SUPPORTED' -class XAttrUnavailableError(Exception): +class XAttrUnavailableError(YoutubeDLError): pass def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs): # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting # expected HTTP responses to meet HTTP/1.0 or later (see also - # https://github.com/rg3/youtube-dl/issues/6727) + # https://github.com/ytdl-org/youtube-dl/issues/6727) if sys.version_info < (3, 0): - kwargs[b'strict'] = True - hc = http_class(*args, **kwargs) + kwargs['strict'] = True + hc = http_class(*args, **compat_kwargs(kwargs)) source_address = ydl_handler._params.get('source_address') + if source_address is not None: + # This is to workaround _create_connection() from socket where it will try all + # address data from getaddrinfo() including IPv6. This filters the result from + # getaddrinfo() based on the source_address value. + # This is based on the cpython socket.create_connection() function. + # https://github.com/python/cpython/blob/master/Lib/socket.py#L691 + def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None): + host, port = address + err = None + addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM) + af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6 + ip_addrs = [addr for addr in addrs if addr[0] == af] + if addrs and not ip_addrs: + ip_version = 'v4' if af == socket.AF_INET else 'v6' + raise socket.error( + "No remote IP%s addresses available for connect, can't use '%s' as source address" + % (ip_version, source_address[0])) + for res in ip_addrs: + af, socktype, proto, canonname, sa = res + sock = None + try: + sock = socket.socket(af, socktype, proto) + if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT: + sock.settimeout(timeout) + sock.bind(source_address) + sock.connect(sa) + err = None # Explicitly break reference cycle + return sock + except socket.error as _: + err = _ + if sock is not None: + sock.close() + if err is not None: + raise err + else: + raise socket.error('getaddrinfo returns an empty list') + if hasattr(hc, '_create_connection'): + hc._create_connection = _create_connection sa = (source_address, 0) if hasattr(hc, 'source_address'): # Python 2.7+ hc.source_address = sa else: # Python 2.6 def _hc_connect(self, *args, **kwargs): - sock = compat_socket_create_connection( + sock = _create_connection( (self.host, self.port), self.timeout, sa) if is_https: self.sock = ssl.wrap_socket( @@ -890,14 +2574,6 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): except zlib.error: return zlib.decompress(data) - @staticmethod - def addinfourl_wrapper(stream, headers, url, code): - if hasattr(compat_urllib_request.addinfourl, 'getcode'): - return compat_urllib_request.addinfourl(stream, headers, url, code) - ret = compat_urllib_request.addinfourl(stream, headers, url) - ret.code = code - return ret - def http_request(self, req): # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not # always respected by websites, some tend to give out URLs with non percent-encoded @@ -949,17 +2625,17 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): break else: raise original_ioerror - resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code) + resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg del resp.headers['Content-encoding'] # deflate if resp.headers.get('Content-encoding', '') == 'deflate': gz = io.BytesIO(self.deflate(resp.read())) - resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) + resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg del resp.headers['Content-encoding'] # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see - # https://github.com/rg3/youtube-dl/issues/6457). + # https://github.com/ytdl-org/youtube-dl/issues/6457). if 300 <= resp.code < 400: location = resp.headers.get('Location') if location: @@ -1048,6 +2724,49 @@ class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler): req, **kwargs) +class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar): + _HTTPONLY_PREFIX = '#HttpOnly_' + + def save(self, filename=None, ignore_discard=False, ignore_expires=False): + # Store session cookies with `expires` set to 0 instead of an empty + # string + for cookie in self: + if cookie.expires is None: + cookie.expires = 0 + compat_cookiejar.MozillaCookieJar.save(self, filename, ignore_discard, ignore_expires) + + def load(self, filename=None, ignore_discard=False, ignore_expires=False): + """Load cookies from a file.""" + if filename is None: + if self.filename is not None: + filename = self.filename + else: + raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT) + + cf = io.StringIO() + with open(filename) as f: + for line in f: + if line.startswith(self._HTTPONLY_PREFIX): + line = line[len(self._HTTPONLY_PREFIX):] + cf.write(compat_str(line)) + cf.seek(0) + self._really_load(cf, filename, ignore_discard, ignore_expires) + # Session cookies are denoted by either `expires` field set to + # an empty string or 0. MozillaCookieJar only recognizes the former + # (see [1]). So we need force the latter to be recognized as session + # cookies on our own. + # Session cookies may be important for cookies-based authentication, + # e.g. usually, when user does not check 'Remember me' check box while + # logging in on a site, some important cookies are stored as session + # cookies so that not recognizing them will result in failed login. + # 1. https://bugs.python.org/issue17164 + for cookie in self: + # Treat `expires=0` cookies as session cookies + if cookie.expires == 0: + cookie.expires = None + cookie.discard = True + + class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor): def __init__(self, cookiejar=None): compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar) @@ -1055,7 +2774,7 @@ class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor): def http_response(self, request, response): # Python 2 will choke on next HTTP request in row if there are non-ASCII # characters in Set-Cookie HTTP header of last response (see - # https://github.com/rg3/youtube-dl/issues/6769). + # https://github.com/ytdl-org/youtube-dl/issues/6769). # In order to at least prevent crashing we will percent encode Set-Cookie # header before HTTPCookieProcessor starts processing it. # if sys.version_info < (3, 0) and response.headers: @@ -1145,7 +2864,7 @@ def unified_timestamp(date_str, day_first=True): if date_str is None: return None - date_str = date_str.replace(',', ' ') + date_str = re.sub(r'[,|]', '', date_str) pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0 timezone, date_str = extract_timezone(date_str) @@ -1153,6 +2872,16 @@ def unified_timestamp(date_str, day_first=True): # Remove AM/PM + timezone date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str) + # Remove unrecognized timezones from ISO 8601 alike timestamps + m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P\s*[A-Z]+)$', date_str) + if m: + date_str = date_str[:-len(m.group('tz'))] + + # Python only supports microseconds, so remove nanoseconds + m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str) + if m: + date_str = m.group(1) + for expression in date_formats(day_first): try: dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta) @@ -1165,7 +2894,7 @@ def unified_timestamp(date_str, day_first=True): def determine_ext(url, default_ext='unknown_video'): - if url is None: + if url is None or '.' not in url: return default_ext guess = url.partition('?')[0].rpartition('.')[2] if re.match(r'^[A-Za-z0-9]+$', guess): @@ -1285,31 +3014,31 @@ def _windows_write_string(s, out): if fileno not in WIN_OUTPUT_IDS: return False - GetStdHandle = ctypes.WINFUNCTYPE( + GetStdHandle = compat_ctypes_WINFUNCTYPE( ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)( - (b'GetStdHandle', ctypes.windll.kernel32)) + ('GetStdHandle', ctypes.windll.kernel32)) h = GetStdHandle(WIN_OUTPUT_IDS[fileno]) - WriteConsoleW = ctypes.WINFUNCTYPE( + WriteConsoleW = compat_ctypes_WINFUNCTYPE( ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR, ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD), - ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32)) + ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32)) written = ctypes.wintypes.DWORD(0) - GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32)) + GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32)) FILE_TYPE_CHAR = 0x0002 FILE_TYPE_REMOTE = 0x8000 - GetConsoleMode = ctypes.WINFUNCTYPE( + GetConsoleMode = compat_ctypes_WINFUNCTYPE( ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.POINTER(ctypes.wintypes.DWORD))( - (b'GetConsoleMode', ctypes.windll.kernel32)) + ('GetConsoleMode', ctypes.windll.kernel32)) INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value def not_a_console(handle): if handle == INVALID_HANDLE_VALUE or handle is None: return True - return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or - GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0) + return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR + or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0) if not_a_console(h): return False @@ -1345,8 +3074,8 @@ def write_string(s, out=None, encoding=None): if _windows_write_string(s, out): return - if ('b' in getattr(out, 'mode', '') or - sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr + if ('b' in getattr(out, 'mode', '') + or sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr byt = s.encode(encoding or preferredencoding(), 'ignore') out.write(byt) elif hasattr(out, 'buffer'): @@ -1491,7 +3220,7 @@ def shell_quote(args): if isinstance(a, bytes): # We may get a filename encoded with 'encodeFilename' a = a.decode(encoding) - quoted_args.append(pipes.quote(a)) + quoted_args.append(compat_shlex_quote(a)) return ' '.join(quoted_args) @@ -1631,6 +3360,36 @@ def parse_count(s): return lookup_unit_table(_UNIT_TABLE, s) +def parse_resolution(s): + if s is None: + return {} + + mobj = re.search(r'\b(?P\d+)\s*[xX×]\s*(?P\d+)\b', s) + if mobj: + return { + 'width': int(mobj.group('w')), + 'height': int(mobj.group('h')), + } + + mobj = re.search(r'\b(\d+)[pPiI]\b', s) + if mobj: + return {'height': int(mobj.group(1))} + + mobj = re.search(r'\b([48])[kK]\b', s) + if mobj: + return {'height': int(mobj.group(1)) * 540} + + return {} + + +def parse_bitrate(s): + if not isinstance(s, compat_str): + return + mobj = re.search(r'\b(\d+)\s*kbps', s) + if mobj: + return int(mobj.group(1)) + + def month_by_name(name, lang='en'): """ Return the number of a month by (locale-independently) English name """ @@ -1672,6 +3431,11 @@ def setproctitle(title): libc = ctypes.cdll.LoadLibrary('libc.so.6') except OSError: return + except TypeError: + # LoadLibrary in Windows Python 2.7.13 only expects + # a bytestring, but since unicode_literals turns + # every string into a unicode string, it fails. + return title_bytes = title.encode('utf-8') buf = ctypes.create_string_buffer(len(title_bytes)) buf.value = title_bytes @@ -1708,11 +3472,16 @@ def base_url(url): def urljoin(base, path): + if isinstance(path, bytes): + path = path.decode('utf-8') if not isinstance(path, compat_str) or not path: return None - if re.match(r'^(?:https?:)?//', path): + if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path): return path - if not isinstance(base, compat_str) or not re.match(r'^(?:https?:)?//', base): + if isinstance(base, bytes): + base = base.decode('utf-8') + if not isinstance(base, compat_str) or not re.match( + r'^(?:https?:)?//', base): return None return compat_urlparse.urljoin(base, path) @@ -1737,7 +3506,7 @@ def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1): return default try: return int(v) * invscale // scale - except ValueError: + except (ValueError, TypeError): return default @@ -1758,12 +3527,23 @@ def float_or_none(v, scale=1, invscale=1, default=None): return default try: return float(v) * invscale / scale - except ValueError: + except (ValueError, TypeError): return default -def strip_or_none(v): - return None if v is None else v.strip() +def bool_or_none(v, default=None): + return v if isinstance(v, bool) else default + + +def strip_or_none(v, default=None): + return v.strip() if isinstance(v, compat_str) else default + + +def url_or_none(url): + if not url or not isinstance(url, compat_str): + return None + url = url.strip() + return url if re.match(r'^(?:[a-zA-Z][\da-zA-Z.+-]*:)?//', url) else None def parse_duration(s): @@ -1778,10 +3558,20 @@ def parse_duration(s): days, hours, mins, secs, ms = m.groups() else: m = re.match( - r'''(?ix)(?:P?T)? + r'''(?ix)(?:P? + (?: + [0-9]+\s*y(?:ears?)?\s* + )? + (?: + [0-9]+\s*m(?:onths?)?\s* + )? + (?: + [0-9]+\s*w(?:eeks?)?\s* + )? (?: (?P[0-9]+)\s*d(?:ays?)?\s* )? + T)? (?: (?P[0-9]+)\s*h(?:ours?)?\s* )? @@ -1846,7 +3636,7 @@ def get_exe_version(exe, args=['--version'], try: # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers # SIGTTOU if youtube-dl is run in the background. - # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656 + # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656 out, _ = subprocess.Popen( [encodeArgument(exe)] + args, stdin=subprocess.PIPE, @@ -1876,7 +3666,7 @@ class PagedList(object): class OnDemandPagedList(PagedList): - def __init__(self, pagefunc, pagesize, use_cache=False): + def __init__(self, pagefunc, pagesize, use_cache=True): self._pagefunc = pagefunc self._pagesize = pagesize self._use_cache = use_cache @@ -2041,6 +3831,58 @@ def update_Request(req, url=None, data=None, headers={}, query={}): return new_req +def _multipart_encode_impl(data, boundary): + content_type = 'multipart/form-data; boundary=%s' % boundary + + out = b'' + for k, v in data.items(): + out += b'--' + boundary.encode('ascii') + b'\r\n' + if isinstance(k, compat_str): + k = k.encode('utf-8') + if isinstance(v, compat_str): + v = v.encode('utf-8') + # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578 + # suggests sending UTF-8 directly. Firefox sends UTF-8, too + content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n' + if boundary.encode('ascii') in content: + raise ValueError('Boundary overlaps with data') + out += content + + out += b'--' + boundary.encode('ascii') + b'--\r\n' + + return out, content_type + + +def multipart_encode(data, boundary=None): + ''' + Encode a dict to RFC 7578-compliant form-data + + data: + A dict where keys and values can be either Unicode or bytes-like + objects. + boundary: + If specified a Unicode object, it's used as the boundary. Otherwise + a random boundary is generated. + + Reference: https://tools.ietf.org/html/rfc7578 + ''' + has_specified_boundary = boundary is not None + + while True: + if boundary is None: + boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff)) + + try: + out, content_type = _multipart_encode_impl(data, boundary) + break + except ValueError: + if has_specified_boundary: + raise + boundary = None + + return out, content_type + + def dict_get(d, key_or_keys, default=None, skip_false_values=True): if isinstance(key_or_keys, (list, tuple)): for key in key_or_keys: @@ -2052,13 +3894,30 @@ def dict_get(d, key_or_keys, default=None, skip_false_values=True): def try_get(src, getter, expected_type=None): - try: - v = getter(src) - except (AttributeError, KeyError, TypeError, IndexError): - pass - else: - if expected_type is None or isinstance(v, expected_type): - return v + if not isinstance(getter, (list, tuple)): + getter = [getter] + for get in getter: + try: + v = get(src) + except (AttributeError, KeyError, TypeError, IndexError): + pass + else: + if expected_type is None or isinstance(v, expected_type): + return v + + +def merge_dicts(*dicts): + merged = {} + for a_dict in dicts: + for k, v in a_dict.items(): + if v is None: + continue + if (k not in merged + or (isinstance(v, compat_str) and v + and isinstance(merged[k], compat_str) + and not merged[k])): + merged[k] = v + return merged def encode_compat_str(string, encoding=preferredencoding(), errors='strict'): @@ -2094,12 +3953,20 @@ def parse_age_limit(s): return int(m.group('age')) if s in US_RATINGS: return US_RATINGS[s] - return TV_PARENTAL_GUIDELINES.get(s) + m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s) + if m: + return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)] + return None def strip_jsonp(code): return re.sub( - r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code) + r'''(?sx)^ + (?:window\.)?(?P[a-zA-Z0-9_.$]*) + (?:\s*&&\s*(?P=func_name))? + \s*\(\s*(?P.*)\);? + \s*?(?://[^\n]*)*$''', + r'\g', code) def js_to_json(code): @@ -2137,7 +4004,7 @@ def js_to_json(code): "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"| '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'| {comment}|,(?={skip}[\]}}])| - [a-zA-Z_][.a-zA-Z_0-9]*| + (?:(?%s)(?P\s*\?)?\s* (?: (?P[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)| + (?P["\'])(?P(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)| (?P(?![0-9.])[a-z0-9A-Z]*) ) \s*$ @@ -2374,17 +4236,21 @@ def _match_one(filter_part, dct): if m: op = COMPARISON_OPERATORS[m.group('op')] actual_value = dct.get(m.group('key')) - if (m.group('strval') is not None or + if (m.group('quotedstrval') is not None + or m.group('strval') is not None # If the original field is a string and matching comparisonvalue is # a number we should respect the origin of the original field # and process comparison value as a string (see - # https://github.com/rg3/youtube-dl/issues/11082). - actual_value is not None and m.group('intval') is not None and - isinstance(actual_value, compat_str)): + # https://github.com/ytdl-org/youtube-dl/issues/11082). + or actual_value is not None and m.group('intval') is not None + and isinstance(actual_value, compat_str)): if m.group('op') not in ('=', '!='): raise ValueError( 'Operator %s does not support string values!' % m.group('op')) - comparison_value = m.group('strval') or m.group('intval') + comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval') + quote = m.group('quote') + if quote is not None: + comparison_value = comparison_value.replace(r'\%s' % quote, quote) else: try: comparison_value = int(m.group('intval')) @@ -2401,8 +4267,8 @@ def _match_one(filter_part, dct): return op(actual_value, comparison_value) UNARY_OPERATORS = { - '': lambda v: v is not None, - '!': lambda v: v is None, + '': lambda v: (v is True) if isinstance(v, bool) else (v is not None), + '!': lambda v: (v is False) if isinstance(v, bool) else (v is None), } operator_rex = re.compile(r'''(?x)\s* (?P%s)\s*(?P[a-z_]+) @@ -2452,27 +4318,102 @@ def srt_subtitles_timecode(seconds): def dfxp2srt(dfxp_data): + ''' + @param dfxp_data A bytes-like object containing DFXP data + @returns A unicode object containing converted SRT data + ''' + LEGACY_NAMESPACES = ( + (b'http://www.w3.org/ns/ttml', [ + b'http://www.w3.org/2004/11/ttaf1', + b'http://www.w3.org/2006/04/ttaf1', + b'http://www.w3.org/2006/10/ttaf1', + ]), + (b'http://www.w3.org/ns/ttml#styling', [ + b'http://www.w3.org/ns/ttml#style', + ]), + ) + + SUPPORTED_STYLING = [ + 'color', + 'fontFamily', + 'fontSize', + 'fontStyle', + 'fontWeight', + 'textDecoration' + ] + _x = functools.partial(xpath_with_ns, ns_map={ + 'xml': 'http://www.w3.org/XML/1998/namespace', 'ttml': 'http://www.w3.org/ns/ttml', - 'ttaf1': 'http://www.w3.org/2006/10/ttaf1', - 'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1', + 'tts': 'http://www.w3.org/ns/ttml#styling', }) + styles = {} + default_style = {} + class TTMLPElementParser(object): - out = '' + _out = '' + _unclosed_elements = [] + _applied_styles = [] def start(self, tag, attrib): - if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'): - self.out += '\n' + if tag in (_x('ttml:br'), 'br'): + self._out += '\n' + else: + unclosed_elements = [] + style = {} + element_style_id = attrib.get('style') + if default_style: + style.update(default_style) + if element_style_id: + style.update(styles.get(element_style_id, {})) + for prop in SUPPORTED_STYLING: + prop_val = attrib.get(_x('tts:' + prop)) + if prop_val: + style[prop] = prop_val + if style: + font = '' + for k, v in sorted(style.items()): + if self._applied_styles and self._applied_styles[-1].get(k) == v: + continue + if k == 'color': + font += ' color="%s"' % v + elif k == 'fontSize': + font += ' size="%s"' % v + elif k == 'fontFamily': + font += ' face="%s"' % v + elif k == 'fontWeight' and v == 'bold': + self._out += '' + unclosed_elements.append('b') + elif k == 'fontStyle' and v == 'italic': + self._out += '' + unclosed_elements.append('i') + elif k == 'textDecoration' and v == 'underline': + self._out += '' + unclosed_elements.append('u') + if font: + self._out += '' + unclosed_elements.append('font') + applied_style = {} + if self._applied_styles: + applied_style.update(self._applied_styles[-1]) + applied_style.update(style) + self._applied_styles.append(applied_style) + self._unclosed_elements.append(unclosed_elements) def end(self, tag): - pass + if tag not in (_x('ttml:br'), 'br'): + unclosed_elements = self._unclosed_elements.pop() + for element in reversed(unclosed_elements): + self._out += '' % element + if unclosed_elements and self._applied_styles: + self._applied_styles.pop() def data(self, data): - self.out += data + self._out += data def close(self): - return self.out.strip() + return self._out.strip() def parse_node(node): target = TTMLPElementParser() @@ -2480,13 +4421,47 @@ def dfxp2srt(dfxp_data): parser.feed(xml.etree.ElementTree.tostring(node)) return parser.close() - dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8')) + for k, v in LEGACY_NAMESPACES: + for ns in v: + dfxp_data = dfxp_data.replace(ns, k) + + dfxp = compat_etree_fromstring(dfxp_data) out = [] - paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p') + paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p') if not paras: raise ValueError('Invalid dfxp/TTML subtitle') + repeat = False + while True: + for style in dfxp.findall(_x('.//ttml:style')): + style_id = style.get('id') or style.get(_x('xml:id')) + if not style_id: + continue + parent_style_id = style.get('style') + if parent_style_id: + if parent_style_id not in styles: + repeat = True + continue + styles[style_id] = styles[parent_style_id].copy() + for prop in SUPPORTED_STYLING: + prop_val = style.get(_x('tts:' + prop)) + if prop_val: + styles.setdefault(style_id, {})[prop] = prop_val + if repeat: + repeat = False + else: + break + + for p in ('body', 'div'): + ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p]) + if ele is None: + continue + style = styles.get(ele.get('style')) + if not style: + continue + default_style.update(style) + for para, index in zip(paras, itertools.count(1)): begin_time = parse_dfxp_time_expr(para.attrib.get('begin')) end_time = parse_dfxp_time_expr(para.attrib.get('end')) @@ -2515,6 +4490,8 @@ def cli_option(params, command_option, param): def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None): param = params.get(param) + if param is None: + return [] assert isinstance(param, bool) if separator: return [command_option + separator + (true_value if param else false_value)] @@ -2594,6 +4571,7 @@ class ISO639Utils(object): 'gv': 'glv', 'ha': 'hau', 'he': 'heb', + 'iw': 'heb', # Replaced by he in 1989 revision 'hi': 'hin', 'ho': 'hmo', 'hr': 'hrv', @@ -2603,6 +4581,7 @@ class ISO639Utils(object): 'hz': 'her', 'ia': 'ina', 'id': 'ind', + 'in': 'ind', # Replaced by id in 1989 revision 'ie': 'ile', 'ig': 'ibo', 'ii': 'iii', @@ -2717,6 +4696,7 @@ class ISO639Utils(object): 'wo': 'wol', 'xh': 'xho', 'yi': 'yid', + 'ji': 'yid', # Replaced by yi in 1989 revision 'yo': 'yor', 'za': 'zha', 'zh': 'zho', @@ -2996,6 +4976,263 @@ class ISO3166Utils(object): return cls._country_map.get(code.upper()) +class GeoUtils(object): + # Major IPv4 address blocks per country + _country_ip_map = { + 'AD': '85.94.160.0/19', + 'AE': '94.200.0.0/13', + 'AF': '149.54.0.0/17', + 'AG': '209.59.64.0/18', + 'AI': '204.14.248.0/21', + 'AL': '46.99.0.0/16', + 'AM': '46.70.0.0/15', + 'AO': '105.168.0.0/13', + 'AP': '159.117.192.0/21', + 'AR': '181.0.0.0/12', + 'AS': '202.70.112.0/20', + 'AT': '84.112.0.0/13', + 'AU': '1.128.0.0/11', + 'AW': '181.41.0.0/18', + 'AZ': '5.191.0.0/16', + 'BA': '31.176.128.0/17', + 'BB': '65.48.128.0/17', + 'BD': '114.130.0.0/16', + 'BE': '57.0.0.0/8', + 'BF': '129.45.128.0/17', + 'BG': '95.42.0.0/15', + 'BH': '37.131.0.0/17', + 'BI': '154.117.192.0/18', + 'BJ': '137.255.0.0/16', + 'BL': '192.131.134.0/24', + 'BM': '196.12.64.0/18', + 'BN': '156.31.0.0/16', + 'BO': '161.56.0.0/16', + 'BQ': '161.0.80.0/20', + 'BR': '152.240.0.0/12', + 'BS': '24.51.64.0/18', + 'BT': '119.2.96.0/19', + 'BW': '168.167.0.0/16', + 'BY': '178.120.0.0/13', + 'BZ': '179.42.192.0/18', + 'CA': '99.224.0.0/11', + 'CD': '41.243.0.0/16', + 'CF': '196.32.200.0/21', + 'CG': '197.214.128.0/17', + 'CH': '85.0.0.0/13', + 'CI': '154.232.0.0/14', + 'CK': '202.65.32.0/19', + 'CL': '152.172.0.0/14', + 'CM': '165.210.0.0/15', + 'CN': '36.128.0.0/10', + 'CO': '181.240.0.0/12', + 'CR': '201.192.0.0/12', + 'CU': '152.206.0.0/15', + 'CV': '165.90.96.0/19', + 'CW': '190.88.128.0/17', + 'CY': '46.198.0.0/15', + 'CZ': '88.100.0.0/14', + 'DE': '53.0.0.0/8', + 'DJ': '197.241.0.0/17', + 'DK': '87.48.0.0/12', + 'DM': '192.243.48.0/20', + 'DO': '152.166.0.0/15', + 'DZ': '41.96.0.0/12', + 'EC': '186.68.0.0/15', + 'EE': '90.190.0.0/15', + 'EG': '156.160.0.0/11', + 'ER': '196.200.96.0/20', + 'ES': '88.0.0.0/11', + 'ET': '196.188.0.0/14', + 'EU': '2.16.0.0/13', + 'FI': '91.152.0.0/13', + 'FJ': '144.120.0.0/16', + 'FM': '119.252.112.0/20', + 'FO': '88.85.32.0/19', + 'FR': '90.0.0.0/9', + 'GA': '41.158.0.0/15', + 'GB': '25.0.0.0/8', + 'GD': '74.122.88.0/21', + 'GE': '31.146.0.0/16', + 'GF': '161.22.64.0/18', + 'GG': '62.68.160.0/19', + 'GH': '45.208.0.0/14', + 'GI': '85.115.128.0/19', + 'GL': '88.83.0.0/19', + 'GM': '160.182.0.0/15', + 'GN': '197.149.192.0/18', + 'GP': '104.250.0.0/19', + 'GQ': '105.235.224.0/20', + 'GR': '94.64.0.0/13', + 'GT': '168.234.0.0/16', + 'GU': '168.123.0.0/16', + 'GW': '197.214.80.0/20', + 'GY': '181.41.64.0/18', + 'HK': '113.252.0.0/14', + 'HN': '181.210.0.0/16', + 'HR': '93.136.0.0/13', + 'HT': '148.102.128.0/17', + 'HU': '84.0.0.0/14', + 'ID': '39.192.0.0/10', + 'IE': '87.32.0.0/12', + 'IL': '79.176.0.0/13', + 'IM': '5.62.80.0/20', + 'IN': '117.192.0.0/10', + 'IO': '203.83.48.0/21', + 'IQ': '37.236.0.0/14', + 'IR': '2.176.0.0/12', + 'IS': '82.221.0.0/16', + 'IT': '79.0.0.0/10', + 'JE': '87.244.64.0/18', + 'JM': '72.27.0.0/17', + 'JO': '176.29.0.0/16', + 'JP': '126.0.0.0/8', + 'KE': '105.48.0.0/12', + 'KG': '158.181.128.0/17', + 'KH': '36.37.128.0/17', + 'KI': '103.25.140.0/22', + 'KM': '197.255.224.0/20', + 'KN': '198.32.32.0/19', + 'KP': '175.45.176.0/22', + 'KR': '175.192.0.0/10', + 'KW': '37.36.0.0/14', + 'KY': '64.96.0.0/15', + 'KZ': '2.72.0.0/13', + 'LA': '115.84.64.0/18', + 'LB': '178.135.0.0/16', + 'LC': '192.147.231.0/24', + 'LI': '82.117.0.0/19', + 'LK': '112.134.0.0/15', + 'LR': '41.86.0.0/19', + 'LS': '129.232.0.0/17', + 'LT': '78.56.0.0/13', + 'LU': '188.42.0.0/16', + 'LV': '46.109.0.0/16', + 'LY': '41.252.0.0/14', + 'MA': '105.128.0.0/11', + 'MC': '88.209.64.0/18', + 'MD': '37.246.0.0/16', + 'ME': '178.175.0.0/17', + 'MF': '74.112.232.0/21', + 'MG': '154.126.0.0/17', + 'MH': '117.103.88.0/21', + 'MK': '77.28.0.0/15', + 'ML': '154.118.128.0/18', + 'MM': '37.111.0.0/17', + 'MN': '49.0.128.0/17', + 'MO': '60.246.0.0/16', + 'MP': '202.88.64.0/20', + 'MQ': '109.203.224.0/19', + 'MR': '41.188.64.0/18', + 'MS': '208.90.112.0/22', + 'MT': '46.11.0.0/16', + 'MU': '105.16.0.0/12', + 'MV': '27.114.128.0/18', + 'MW': '105.234.0.0/16', + 'MX': '187.192.0.0/11', + 'MY': '175.136.0.0/13', + 'MZ': '197.218.0.0/15', + 'NA': '41.182.0.0/16', + 'NC': '101.101.0.0/18', + 'NE': '197.214.0.0/18', + 'NF': '203.17.240.0/22', + 'NG': '105.112.0.0/12', + 'NI': '186.76.0.0/15', + 'NL': '145.96.0.0/11', + 'NO': '84.208.0.0/13', + 'NP': '36.252.0.0/15', + 'NR': '203.98.224.0/19', + 'NU': '49.156.48.0/22', + 'NZ': '49.224.0.0/14', + 'OM': '5.36.0.0/15', + 'PA': '186.72.0.0/15', + 'PE': '186.160.0.0/14', + 'PF': '123.50.64.0/18', + 'PG': '124.240.192.0/19', + 'PH': '49.144.0.0/13', + 'PK': '39.32.0.0/11', + 'PL': '83.0.0.0/11', + 'PM': '70.36.0.0/20', + 'PR': '66.50.0.0/16', + 'PS': '188.161.0.0/16', + 'PT': '85.240.0.0/13', + 'PW': '202.124.224.0/20', + 'PY': '181.120.0.0/14', + 'QA': '37.210.0.0/15', + 'RE': '139.26.0.0/16', + 'RO': '79.112.0.0/13', + 'RS': '178.220.0.0/14', + 'RU': '5.136.0.0/13', + 'RW': '105.178.0.0/15', + 'SA': '188.48.0.0/13', + 'SB': '202.1.160.0/19', + 'SC': '154.192.0.0/11', + 'SD': '154.96.0.0/13', + 'SE': '78.64.0.0/12', + 'SG': '152.56.0.0/14', + 'SI': '188.196.0.0/14', + 'SK': '78.98.0.0/15', + 'SL': '197.215.0.0/17', + 'SM': '89.186.32.0/19', + 'SN': '41.82.0.0/15', + 'SO': '197.220.64.0/19', + 'SR': '186.179.128.0/17', + 'SS': '105.235.208.0/21', + 'ST': '197.159.160.0/19', + 'SV': '168.243.0.0/16', + 'SX': '190.102.0.0/20', + 'SY': '5.0.0.0/16', + 'SZ': '41.84.224.0/19', + 'TC': '65.255.48.0/20', + 'TD': '154.68.128.0/19', + 'TG': '196.168.0.0/14', + 'TH': '171.96.0.0/13', + 'TJ': '85.9.128.0/18', + 'TK': '27.96.24.0/21', + 'TL': '180.189.160.0/20', + 'TM': '95.85.96.0/19', + 'TN': '197.0.0.0/11', + 'TO': '175.176.144.0/21', + 'TR': '78.160.0.0/11', + 'TT': '186.44.0.0/15', + 'TV': '202.2.96.0/19', + 'TW': '120.96.0.0/11', + 'TZ': '156.156.0.0/14', + 'UA': '93.72.0.0/13', + 'UG': '154.224.0.0/13', + 'US': '3.0.0.0/8', + 'UY': '167.56.0.0/13', + 'UZ': '82.215.64.0/18', + 'VA': '212.77.0.0/19', + 'VC': '24.92.144.0/20', + 'VE': '186.88.0.0/13', + 'VG': '172.103.64.0/18', + 'VI': '146.226.0.0/16', + 'VN': '14.160.0.0/11', + 'VU': '202.80.32.0/20', + 'WF': '117.20.32.0/21', + 'WS': '202.4.32.0/19', + 'YE': '134.35.0.0/16', + 'YT': '41.242.116.0/22', + 'ZA': '41.0.0.0/11', + 'ZM': '165.56.0.0/13', + 'ZW': '41.85.192.0/19', + } + + @classmethod + def random_ipv4(cls, code_or_block): + if len(code_or_block) == 2: + block = cls._country_ip_map.get(code_or_block.upper()) + if not block: + return None + else: + block = code_or_block + addr, preflen = block.split('/') + addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0] + addr_max = addr_min | (0xffffffff >> int(preflen)) + return compat_str(socket.inet_ntoa( + compat_struct_pack('!L', random.randint(addr_min, addr_max)))) + + class PerRequestProxyHandler(compat_urllib_request.ProxyHandler): def __init__(self, proxies=None): # Set default handlers @@ -3003,7 +5240,7 @@ class PerRequestProxyHandler(compat_urllib_request.ProxyHandler): setattr(self, '%s_open' % type, lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open: meth(r, proxy, type)) - return compat_urllib_request.ProxyHandler.__init__(self, proxies) + compat_urllib_request.ProxyHandler.__init__(self, proxies) def proxy_open(self, req, proxy, type): req_proxy = req.headers.get('Ytdl-request-proxy') @@ -3021,6 +5258,57 @@ class PerRequestProxyHandler(compat_urllib_request.ProxyHandler): self, req, proxy, type) +# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is +# released into Public Domain +# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387 + +def long_to_bytes(n, blocksize=0): + """long_to_bytes(n:long, blocksize:int) : string + Convert a long integer to a byte string. + + If optional blocksize is given and greater than zero, pad the front of the + byte string with binary zeros so that the length is a multiple of + blocksize. + """ + # after much testing, this algorithm was deemed to be the fastest + s = b'' + n = int(n) + while n > 0: + s = compat_struct_pack('>I', n & 0xffffffff) + s + n = n >> 32 + # strip off leading zeros + for i in range(len(s)): + if s[i] != b'\000'[0]: + break + else: + # only happens when n == 0 + s = b'\000' + i = 0 + s = s[i:] + # add back some pad bytes. this could be done more efficiently w.r.t. the + # de-padding being done above, but sigh... + if blocksize > 0 and len(s) % blocksize: + s = (blocksize - len(s) % blocksize) * b'\000' + s + return s + + +def bytes_to_long(s): + """bytes_to_long(string) : long + Convert a byte string to a long integer. + + This is (essentially) the inverse of long_to_bytes(). + """ + acc = 0 + length = len(s) + if length % 4: + extra = (4 - length % 4) + s = b'\000' * extra + s + length = length + extra + for i in range(0, length, 4): + acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0] + return acc + + def ohdave_rsa_encrypt(data, exponent, modulus): ''' Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/ @@ -3038,6 +5326,21 @@ def ohdave_rsa_encrypt(data, exponent, modulus): return '%x' % encrypted +def pkcs1pad(data, length): + """ + Padding input data with PKCS#1 scheme + + @param {int[]} data input data + @param {int} length target length + @returns {int[]} padded data + """ + if len(data) > length - 11: + raise ValueError('Input data too long for PKCS#1 padding') + + pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)] + return [0, 2] + pseudo_random + [0] + data + + def encode_base_n(num, n, table=None): FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' if not table: @@ -3088,7 +5391,7 @@ def urshift(val, n): # Based on png2str() written by @gdkchan and improved by @yokrysty -# Originally posted at https://github.com/rg3/youtube-dl/issues/9706 +# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706 def decode_png(png_data): # Reference: https://www.w3.org/TR/PNG/ header = png_data[8:] @@ -3203,7 +5506,7 @@ def write_xattr(path, key, value): if hasattr(xattr, 'set'): # pyxattr # Unicode arguments are not supported in python-pyxattr until # version 0.5.0 - # See https://github.com/rg3/youtube-dl/issues/5498 + # See https://github.com/ytdl-org/youtube-dl/issues/5498 pyxattr_required_version = '0.5.0' if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version): # TODO: fallback to CLI tools @@ -3249,9 +5552,9 @@ def write_xattr(path, key, value): executable = 'xattr' opts = ['-w', key, value] - cmd = ([encodeFilename(executable, True)] + - [encodeArgument(o) for o in opts] + - [encodeFilename(path, True)]) + cmd = ([encodeFilename(executable, True)] + + [encodeArgument(o) for o in opts] + + [encodeFilename(path, True)]) try: p = subprocess.Popen( @@ -3276,3 +5579,15 @@ def write_xattr(path, key, value): "Couldn't find a tool to set the xattrs. " "Install either the python 'xattr' module, " "or the 'xattr' binary.") + + +def random_birthday(year_field, month_field, day_field): + start_date = datetime.date(1950, 1, 1) + end_date = datetime.date(1995, 12, 31) + offset = random.randint(0, (end_date - start_date).days) + random_date = start_date + datetime.timedelta(offset) + return { + year_field: str(random_date.year), + month_field: str(random_date.month), + day_field: str(random_date.day), + }