X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/9dc487f48b50767cf540fa36c3de2c386fd74c04..89bb9e23b34f43c6563524917f7dffb9841a972e:/youtube_dl/extractor/youporn.py diff --git a/youtube_dl/extractor/youporn.py b/youtube_dl/extractor/youporn.py index 0df2d76..d4eccb4 100644 --- a/youtube_dl/extractor/youporn.py +++ b/youtube_dl/extractor/youporn.py @@ -9,6 +9,7 @@ from ..utils import ( str_to_int, unescapeHTML, unified_strdate, + url_or_none, ) from ..aes import aes_decrypt_text @@ -24,9 +25,9 @@ class YouPornIE(InfoExtractor): 'ext': 'mp4', 'title': 'Sex Ed: Is It Safe To Masturbate Daily?', 'description': 'Love & Sex Answers: http://bit.ly/DanAndJenn -- Is It Unhealthy To Masturbate Daily?', - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Ask Dan And Jennifer', - 'upload_date': '20101221', + 'upload_date': '20101217', 'average_rating': int, 'view_count': int, 'comment_count': int, @@ -35,7 +36,7 @@ class YouPornIE(InfoExtractor): 'age_limit': 18, }, }, { - # Anonymous User uploader + # Unknown uploader 'url': 'http://www.youporn.com/watch/561726/big-tits-awesome-brunette-on-amazing-webcam-show/?from=related3&al=2&from_id=561726&pos=4', 'info_dict': { 'id': '561726', @@ -43,9 +44,9 @@ class YouPornIE(InfoExtractor): 'ext': 'mp4', 'title': 'Big Tits Awesome Brunette On amazing webcam show', 'description': 'http://sweetlivegirls.com Big Tits Awesome Brunette On amazing webcam show.mp4', - 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'Anonymous User', - 'upload_date': '20111125', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Unknown', + 'upload_date': '20110418', 'average_rating': int, 'view_count': int, 'comment_count': int, @@ -67,29 +68,45 @@ class YouPornIE(InfoExtractor): request.add_header('Cookie', 'age_verified=1') webpage = self._download_webpage(request, display_id) - title = self._search_regex( - [r'(?:video_titles|videoTitle)\s*[:=]\s*(["\'])(?P<title>.+?)\1',
r'<h1[^>]+class=["\']heading\d?["\'][^>]*>([^<])<'], - webpage, 'title', group='title') + title = self._html_search_regex( + r'(?s)<div[^>]+class=["\']watchVideoTitle[^>]+>(.+?)</div>', + webpage, 'title', default=None) or self._og_search_title( + webpage, default=None) or self._html_search_meta( + 'title', webpage, fatal=True) links = [] + # Main source + definitions = self._parse_json( + self._search_regex( + r'mediaDefinition\s*=\s*(\[.+?\]);', webpage, + 'media definitions', default='[]'), + video_id, fatal=False) + if definitions: + for definition in definitions: + if not isinstance(definition, dict): + continue + video_url = url_or_none(definition.get('videoUrl')) + if video_url: + links.append(video_url) + + # Fallback #1, this also contains extra low quality 180p format + for _, link in re.findall(r'<a[^>]+href=(["\'])(http.+?)\1[^>]+title=["\']Download [Vv]ideo', webpage): + links.append(link) + + # Fallback #2 (unavailable as at 22.06.2017) sources = self._search_regex( r'(?s)sources\s*:\s*({.+?})', webpage, 'sources', default=None) if sources: for _, link in re.findall(r'[^:]+\s*:\s*(["\'])(http.+?)\1', sources): links.append(link) - # Fallback #1 + # Fallback #3 (unavailable as at 22.06.2017) for _, link in re.findall( - r'(?:videoUrl|videoSrc|videoIpadUrl|html5PlayerSrc)\s*[:=]\s*(["\'])(http.+?)\1', webpage): - links.append(link) - - # Fallback #2, this also contains extra low quality 180p format - for _, link in re.findall(r'<a[^>]+href=(["\'])(http.+?)\1[^>]+title=["\']Download [Vv]ideo', webpage): + r'(?:videoSrc|videoIpadUrl|html5PlayerSrc)\s*[:=]\s*(["\'])(http.+?)\1', webpage): links.append(link) - # Fallback #3, encrypted links + # Fallback #4, encrypted links (unavailable as at 22.06.2017) for _, encrypted_link in re.findall( r'encryptedQuality\d{3,4}URL\s*=\s*(["\'])([\da-zA-Z+/=]+)\1', webpage): links.append(aes_decrypt_text(encrypted_link, title, 32).decode('utf-8')) @@ -115,7 +132,11 @@ class YouPornIE(InfoExtractor): formats.append(f) 
self._sort_formats(formats) - description = self._og_search_description(webpage, default=None) + description = self._html_search_regex( + r'(?s)<div[^>]+\bid=["\']description["\'][^>]*>(.+?)</div>', + webpage, 'description', + default=None) or self._og_search_description( + webpage, default=None) thumbnail = self._search_regex( r'(?:imageurl\s*=|poster\s*:)\s*(["\'])(?P<thumbnail>.+?)\1', webpage, 'thumbnail', fatal=False, group='thumbnail') @@ -124,7 +145,8 @@ class YouPornIE(InfoExtractor): r'(?s)<div[^>]+class=["\']submitByLink["\'][^>]*>(.+?)</div>', webpage, 'uploader', fatal=False) upload_date = unified_strdate(self._html_search_regex( - r'(?s)<div[^>]+class=["\']videoInfo(?:Date|Time)["\'][^>]*>(.+?)</div>', + [r'Date\s+[Aa]dded:\s*<span>([^<]+)', + r'(?s)<div[^>]+class=["\']videoInfo(?:Date|Time)["\'][^>]*>(.+?)</div>'], webpage, 'upload date', fatal=False)) age_limit = self._rta_search(webpage) @@ -140,17 +162,17 @@ class YouPornIE(InfoExtractor): r'>All [Cc]omments? \(([\d,.]+)\)', webpage, 'comment count', fatal=False)) - def extract_tag_box(title): - tag_box = self._search_regex( - (r'<div[^>]+class=["\']tagBoxTitle["\'][^>]*>\s*%s\b.*?</div>\s*' - '<div[^>]+class=["\']tagBoxContent["\']>(.+?)</div>') % re.escape(title), - webpage, '%s tag box' % title, default=None) + def extract_tag_box(regex, title): + tag_box = self._search_regex(regex, webpage, title, default=None) if not tag_box: return [] return re.findall(r'<a[^>]+href=[^>]+>([^<]+)', tag_box) - categories = extract_tag_box('Category') - tags = extract_tag_box('Tags') + categories = extract_tag_box( + r'(?s)Categories:.*?</[^>]+>(.+?)</div>', 'categories') + tags = extract_tag_box( + r'(?s)Tags:.*?</div>\s*<div[^>]+class=["\']tagBoxContent["\'][^>]*>(.+?)</div>', + 'tags') return { 'id': video_id,