X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/af014acd27e0b471d5903630847eabb26437b46c..39393b81acfaf4045fb7f20454a0226f0dc9142e:/youtube_dl/extractor/udemy.py diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 207c4a6..2a4faec 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -5,6 +5,7 @@ import re from .common import InfoExtractor from ..compat import ( compat_HTTPError, + compat_kwargs, compat_str, compat_urllib_request, compat_urlparse, @@ -17,7 +18,9 @@ from ..utils import ( int_or_none, js_to_json, sanitized_Request, + try_get, unescapeHTML, + url_or_none, urlencode_postdata, ) @@ -26,7 +29,7 @@ class UdemyIE(InfoExtractor): IE_NAME = 'udemy' _VALID_URL = r'''(?x) https?:// - www\.udemy\.com/ + (?:[^/]+\.)?udemy\.com/ (?: [^#]+\#/lecture/| lecture/view/?\?lectureId=| @@ -57,16 +60,26 @@ class UdemyIE(InfoExtractor): # no url in outputs format entry 'url': 'https://www.udemy.com/learn-web-development-complete-step-by-step-guide-to-success/learn/v4/t/lecture/4125812', 'only_matching': True, + }, { + # only outputs rendition + 'url': 'https://www.udemy.com/how-you-can-help-your-local-community-5-amazing-examples/learn/v4/t/lecture/3225750?start=0', + 'only_matching': True, + }, { + 'url': 'https://wipro.udemy.com/java-tutorial/#/lecture/172757', + 'only_matching': True, }] def _extract_course_info(self, webpage, video_id): course = self._parse_json( unescapeHTML(self._search_regex( - r'ng-init=["\'].*\bcourse=({.+?});', webpage, 'course', default='{}')), + r'ng-init=["\'].*\bcourse=({.+?})[;"\']', + webpage, 'course', default='{}')), video_id, fatal=False) or {} course_id = course.get('id') or self._search_regex( - (r'"id"\s*:\s*(\d+)', r'data-course-id=["\'](\d+)'), - webpage, 'course id') + [ + r'data-course-id=["\'](\d+)', + r'"courseId"\s*:\s*(\d+)' + ], webpage, 'course id') return course_id, course.get('title') def _enroll_course(self, base_url, webpage, course_id): @@ -100,7 +113,7 @@ class UdemyIE(InfoExtractor): % (course_id, lecture_id), lecture_id, 'Downloading lecture JSON', query={ 'fields[lecture]': 'title,description,view_html,asset', - 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,data', + 'fields[asset]': 'asset_type,stream_url,thumbnail_url,download_urls,stream_urls,captions,data', }) def _handle_error(self, response): @@ -114,6 +127,25 @@ class UdemyIE(InfoExtractor): error_str += ' - %s' % error_data.get('formErrors') raise ExtractorError(error_str, expected=True) + def _download_webpage_handle(self, *args, **kwargs): + headers = kwargs.get('headers', {}).copy() + headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36' + kwargs['headers'] = headers + ret = super(UdemyIE, self)._download_webpage_handle( + *args, **compat_kwargs(kwargs)) + if not ret: + return ret + webpage, _ = ret + if any(p in webpage for p in ( + '>Please verify you are a human', + 'Access to this page has been denied because we believe you are using automation tools to browse the website', + '"_pxCaptcha"')): + raise ExtractorError( + 'Udemy asks you to solve a CAPTCHA. Login with browser, ' + 'solve CAPTCHA, then export cookies and pass cookie file to ' + 'youtube-dl with --cookies.', expected=True) + return ret + def _download_json(self, url_or_request, *args, **kwargs): headers = { 'X-Udemy-Snail-Case': 'true', @@ -140,7 +172,7 @@ class UdemyIE(InfoExtractor): self._login() def _login(self): - (username, password) = self._get_login_info() + username, password = self._get_login_info() if username is None: return @@ -164,7 +196,7 @@ class UdemyIE(InfoExtractor): }) response = self._download_webpage( - self._LOGIN_URL, None, 'Logging in as %s' % username, + self._LOGIN_URL, None, 'Logging in', data=urlencode_postdata(login_form), headers={ 'Referer': self._ORIGIN_URL, @@ -254,8 +286,13 @@ class UdemyIE(InfoExtractor): if not isinstance(source_list, list): return for source in source_list: - video_url = source.get('file') or source.get('src') - if not video_url or not isinstance(video_url, compat_str): + video_url = url_or_none(source.get('file') or source.get('src')) + if not video_url: + continue + if source.get('type') == 'application/x-mpegURL' or determine_ext(video_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) continue format_id = source.get('label') f = { @@ -277,8 +314,8 @@ class UdemyIE(InfoExtractor): continue if track.get('kind') != 'captions': continue - src = track.get('src') - if not src or not isinstance(src, compat_str): + src = url_or_none(track.get('src')) + if not src: continue lang = track.get('language') or track.get( 'srclang') or track.get('label') @@ -288,9 +325,25 @@ class UdemyIE(InfoExtractor): 'url': src, }) - download_urls = asset.get('download_urls') - if isinstance(download_urls, dict): - extract_formats(download_urls.get('Video')) + for url_kind in ('download', 'stream'): + urls = asset.get('%s_urls' % url_kind) + if isinstance(urls, dict): + extract_formats(urls.get('Video')) + + captions = asset.get('captions') + if isinstance(captions, list): + for cc in captions: + if not isinstance(cc, dict): + continue + cc_url = url_or_none(cc.get('url')) + if not cc_url: + continue + lang = try_get(cc, lambda x: x['locale']['locale'], compat_str) + sub_dict = (automatic_captions if cc.get('source') == 'auto' + else subtitles) + sub_dict.setdefault(lang or 'en', []).append({ + 'url': cc_url, + }) view_html = lecture.get('view_html') if view_html: @@ -325,7 +378,7 @@ class UdemyIE(InfoExtractor): }, res)) # react rendition since 2017.04.15 (see - # https://github.com/rg3/youtube-dl/issues/12744) + # https://github.com/ytdl-org/youtube-dl/issues/12744) data = self._parse_json( self._search_regex( r'videojs-setup-data=(["\'])(?P{.+?})\1', view_html, @@ -346,6 +399,12 @@ class UdemyIE(InfoExtractor): fatal=False) extract_subtitles(text_tracks) + if not formats and outputs: + for format_id, output in outputs.items(): + f = extract_output_format(output, format_id) + if f.get('url'): + formats.append(f) + self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id')) return { @@ -362,8 +421,14 @@ class UdemyIE(InfoExtractor): class UdemyCourseIE(UdemyIE): IE_NAME = 'udemy:course' - _VALID_URL = r'https?://(?:www\.)?udemy\.com/(?P[^/?#&]+)' - _TESTS = [] + _VALID_URL = r'https?://(?:[^/]+\.)?udemy\.com/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.udemy.com/java-tutorial/', + 'only_matching': True, + }, { + 'url': 'https://wipro.udemy.com/java-tutorial/', + 'only_matching': True, + }] @classmethod def suitable(cls, url):