]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/platzi.py
   2 from __future__ 
import unicode_literals
 
   4 from .common 
import InfoExtractor
 
  21 class PlatziBaseIE(InfoExtractor
): 
  22     _LOGIN_URL 
= 'https://platzi.com/login/' 
  23     _NETRC_MACHINE 
= 'platzi' 
  25     def _real_initialize(self
): 
  29         username
, password 
= self
._get
_login
_info
() 
  33         login_page 
= self
._download
_webpage
( 
  34             self
._LOGIN
_URL
, None, 'Downloading login page') 
  36         login_form 
= self
._hidden
_inputs
(login_page
) 
  43         urlh 
= self
._request
_webpage
( 
  44             self
._LOGIN
_URL
, None, 'Logging in', 
  45             data
=urlencode_postdata(login_form
), 
  46             headers
={'Referer': self
._LOGIN
_URL
}) 
  49         if 'platzi.com/login' not in urlh
.geturl(): 
  52         login_error 
= self
._webpage
_read
_content
( 
  53             urlh
, self
._LOGIN
_URL
, None, 'Downloading login error page') 
  55         login 
= self
._parse
_json
( 
  57                 r
'login\s*=\s*({.+?})(?:\s*;|\s*</script)', login_error
, 'login'), 
  60         for kind 
in ('error', 'password', 'nonFields'): 
  61             error 
= str_or_none(login
.get('%sError' % kind
)) 
  64                     'Unable to login: %s' % error
, expected
=True) 
  65         raise ExtractorError('Unable to log in') 
  68 class PlatziIE(PlatziBaseIE
): 
  72                             platzi\.com/clases|           # es version 
  73                             courses\.platzi\.com/classes  # en version 
  74                         )/[^/]+/(?P<id>\d+)-[^/?\#&]+ 
  78         'url': 'https://platzi.com/clases/1311-next-js/12074-creando-nuestra-primera-pagina/', 
  79         'md5': '8f56448241005b561c10f11a595b37e3', 
  83             'title': 'Creando nuestra primera página', 
  84             'description': 'md5:4c866e45034fc76412fbf6e60ae008bc', 
  87         'skip': 'Requires platzi account credentials', 
  89         'url': 'https://courses.platzi.com/classes/1367-communication-codestream/13430-background/', 
  93             'title': 'Background', 
  94             'description': 'md5:49c83c09404b15e6e71defaf87f6b305', 
  97         'skip': 'Requires platzi account credentials', 
  99             'skip_download': True, 
 103     def _real_extract(self
, url
): 
 104         lecture_id 
= self
._match
_id
(url
) 
 106         webpage 
= self
._download
_webpage
(url
, lecture_id
) 
 108         data 
= self
._parse
_json
( 
 110                 # client_data may contain "};" so that we have to try more 
 112                 (r
'client_data\s*=\s*({.+?})\s*;\s*\n', 
 113                  r
'client_data\s*=\s*({.+?})\s*;'), 
 114                 webpage
, 'client data'), 
 117         material 
= data
['initialState']['material'] 
 118         desc 
= material
['description'] 
 119         title 
= desc
['title'] 
 122         for server_id
, server 
in material
['videos'].items(): 
 123             if not isinstance(server
, dict): 
 125             for format_id 
in ('hls', 'dash'): 
 126                 format_url 
= url_or_none(server
.get(format_id
)) 
 129                 if format_id 
== 'hls': 
 130                     formats
.extend(self
._extract
_m
3u8_formats
( 
 131                         format_url
, lecture_id
, 'mp4', 
 132                         entry_protocol
='m3u8_native', m3u8_id
=format_id
, 
 133                         note
='Downloading %s m3u8 information' % server_id
, 
 135                 elif format_id 
== 'dash': 
 136                     formats
.extend(self
._extract
_mpd
_formats
( 
 137                         format_url
, lecture_id
, mpd_id
=format_id
, 
 138                         note
='Downloading %s MPD manifest' % server_id
, 
 140         self
._sort
_formats
(formats
) 
 142         content 
= str_or_none(desc
.get('content')) 
 143         description 
= (clean_html(compat_b64decode(content
).decode('utf-8')) 
 144                        if content 
else None) 
 145         duration 
= int_or_none(material
.get('duration'), invscale
=60) 
 150             'description': description
, 
 151             'duration': duration
, 
 156 class PlatziCourseIE(PlatziBaseIE
): 
 157     _VALID_URL 
= r
'''(?x) 
 160                             platzi\.com/clases|           # es version 
 161                             courses\.platzi\.com/classes  # en version 
 165         'url': 'https://platzi.com/clases/next-js/', 
 168             'title': 'Curso de Next.js', 
 170         'playlist_count': 22, 
 172         'url': 'https://courses.platzi.com/classes/communication-codestream/', 
 175             'title': 'Codestream Course', 
 177         'playlist_count': 14, 
 181     def suitable(cls
, url
): 
 182         return False if PlatziIE
.suitable(url
) else super(PlatziCourseIE
, cls
).suitable(url
) 
 184     def _real_extract(self
, url
): 
 185         course_name 
= self
._match
_id
(url
) 
 187         webpage 
= self
._download
_webpage
(url
, course_name
) 
 189         props 
= self
._parse
_json
( 
 190             self
._search
_regex
(r
'data\s*=\s*({.+?})\s*;', webpage
, 'data'), 
 191             course_name
)['initialProps'] 
 194         for chapter_num
, chapter 
in enumerate(props
['concepts'], 1): 
 195             if not isinstance(chapter
, dict): 
 197             materials 
= chapter
.get('materials') 
 198             if not materials 
or not isinstance(materials
, list): 
 200             chapter_title 
= chapter
.get('title') 
 201             chapter_id 
= str_or_none(chapter
.get('id')) 
 202             for material 
in materials
: 
 203                 if not isinstance(material
, dict): 
 205                 if material
.get('material_type') != 'video': 
 207                 video_url 
= urljoin(url
, material
.get('url')) 
 211                     '_type': 'url_transparent', 
 213                     'title': str_or_none(material
.get('name')), 
 214                     'id': str_or_none(material
.get('id')), 
 215                     'ie_key': PlatziIE
.ie_key(), 
 216                     'chapter': chapter_title
, 
 217                     'chapter_number': chapter_num
, 
 218                     'chapter_id': chapter_id
, 
 221         course_id 
= compat_str(try_get(props
, lambda x
: x
['course']['id'])) 
 222         course_title 
= try_get(props
, lambda x
: x
['course']['name'], compat_str
) 
 224         return self
.playlist_result(entries
, course_id
, course_title
)