]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/platzi.py
557b2b5ade2fcd0269b8b2873fd644015f785c6c
2 from __future__
import unicode_literals
4 from .common
import InfoExtractor
21 class PlatziIE(InfoExtractor
):
25 platzi\.com/clases| # es version
26 courses\.platzi\.com/classes # en version
27 )/[^/]+/(?P<id>\d+)-[^/?\#&]+
29 _LOGIN_URL
= 'https://platzi.com/login/'
30 _NETRC_MACHINE
= 'platzi'
33 'url': 'https://platzi.com/clases/1311-next-js/12074-creando-nuestra-primera-pagina/',
34 'md5': '8f56448241005b561c10f11a595b37e3',
38 'title': 'Creando nuestra primera página',
39 'description': 'md5:4c866e45034fc76412fbf6e60ae008bc',
42 'skip': 'Requires platzi account credentials',
44 'url': 'https://courses.platzi.com/classes/1367-communication-codestream/13430-background/',
48 'title': 'Background',
49 'description': 'md5:49c83c09404b15e6e71defaf87f6b305',
52 'skip': 'Requires platzi account credentials',
54 'skip_download': True,
58 def _real_initialize(self
):
62 username
, password
= self
._get
_login
_info
()
66 login_page
= self
._download
_webpage
(
67 self
._LOGIN
_URL
, None, 'Downloading login page')
69 login_form
= self
._hidden
_inputs
(login_page
)
76 urlh
= self
._request
_webpage
(
77 self
._LOGIN
_URL
, None, 'Logging in',
78 data
=urlencode_postdata(login_form
),
79 headers
={'Referer': self
._LOGIN
_URL
})
82 if 'platzi.com/login' not in compat_str(urlh
.geturl()):
85 login_error
= self
._webpage
_read
_content
(
86 urlh
, self
._LOGIN
_URL
, None, 'Downloading login error page')
88 login
= self
._parse
_json
(
90 r
'login\s*=\s*({.+?})(?:\s*;|\s*</script)', login_error
, 'login'),
93 for kind
in ('error', 'password', 'nonFields'):
94 error
= str_or_none(login
.get('%sError' % kind
))
97 'Unable to login: %s' % error
, expected
=True)
98 raise ExtractorError('Unable to log in')
100 def _real_extract(self
, url
):
101 lecture_id
= self
._match
_id
(url
)
103 webpage
= self
._download
_webpage
(url
, lecture_id
)
105 data
= self
._parse
_json
(
107 r
'client_data\s*=\s*({.+?})\s*;', webpage
, 'client data'),
110 material
= data
['initialState']['material']
111 desc
= material
['description']
112 title
= desc
['title']
115 for server_id
, server
in material
['videos'].items():
116 if not isinstance(server
, dict):
118 for format_id
in ('hls', 'dash'):
119 format_url
= url_or_none(server
.get(format_id
))
122 if format_id
== 'hls':
123 formats
.extend(self
._extract
_m
3u8_formats
(
124 format_url
, lecture_id
, 'mp4',
125 entry_protocol
='m3u8_native', m3u8_id
=format_id
,
126 note
='Downloading %s m3u8 information' % server_id
,
128 elif format_id
== 'dash':
129 formats
.extend(self
._extract
_mpd
_formats
(
130 format_url
, lecture_id
, mpd_id
=format_id
,
131 note
='Downloading %s MPD manifest' % server_id
,
133 self
._sort
_formats
(formats
)
135 content
= str_or_none(desc
.get('content'))
136 description
= (clean_html(compat_b64decode(content
).decode('utf-8'))
137 if content
else None)
138 duration
= int_or_none(material
.get('duration'), invscale
=60)
143 'description': description
,
144 'duration': duration
,
149 class PlatziCourseIE(InfoExtractor
):
150 _VALID_URL
= r
'''(?x)
153 platzi\.com/clases| # es version
154 courses\.platzi\.com/classes # en version
158 'url': 'https://platzi.com/clases/next-js/',
161 'title': 'Curso de Next.js',
163 'playlist_count': 22,
165 'url': 'https://courses.platzi.com/classes/communication-codestream/',
168 'title': 'Codestream Course',
170 'playlist_count': 14,
174 def suitable(cls
, url
):
175 return False if PlatziIE
.suitable(url
) else super(PlatziCourseIE
, cls
).suitable(url
)
177 def _real_extract(self
, url
):
178 course_name
= self
._match
_id
(url
)
180 webpage
= self
._download
_webpage
(url
, course_name
)
182 props
= self
._parse
_json
(
183 self
._search
_regex
(r
'data\s*=\s*({.+?})\s*;', webpage
, 'data'),
184 course_name
)['initialProps']
187 for chapter_num
, chapter
in enumerate(props
['concepts'], 1):
188 if not isinstance(chapter
, dict):
190 materials
= chapter
.get('materials')
191 if not materials
or not isinstance(materials
, list):
193 chapter_title
= chapter
.get('title')
194 chapter_id
= str_or_none(chapter
.get('id'))
195 for material
in materials
:
196 if not isinstance(material
, dict):
198 if material
.get('material_type') != 'video':
200 video_url
= urljoin(url
, material
.get('url'))
204 '_type': 'url_transparent',
206 'title': str_or_none(material
.get('name')),
207 'id': str_or_none(material
.get('id')),
208 'ie_key': PlatziIE
.ie_key(),
209 'chapter': chapter_title
,
210 'chapter_number': chapter_num
,
211 'chapter_id': chapter_id
,
214 course_id
= compat_str(try_get(props
, lambda x
: x
['course']['id']))
215 course_title
= try_get(props
, lambda x
: x
['course']['name'], compat_str
)
217 return self
.playlist_result(entries
, course_id
, course_title
)