]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/stanfordoc.py
   1 from __future__ 
import unicode_literals
 
   5 from .common 
import InfoExtractor
 
  13 class StanfordOpenClassroomIE(InfoExtractor
): 
  14     IE_NAME 
= 'stanfordoc' 
  15     IE_DESC 
= 'Stanford Open ClassRoom' 
  16     _VALID_URL 
= r
'https?://openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$' 
  18         'url': 'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100', 
  19         'md5': '544a9468546059d4e80d76265b0443b8', 
  21             'id': 'PracticalUnix_intro-environment', 
  23             'title': 'Intro Environment', 
  27     def _real_extract(self
, url
): 
  28         mobj 
= re
.match(self
._VALID
_URL
, url
) 
  30         if mobj
.group('course') and mobj
.group('video'):  # A specific video 
  31             course 
= mobj
.group('course') 
  32             video 
= mobj
.group('video') 
  34                 'id': course 
+ '_' + video
, 
  39             baseUrl 
= 'http://openclassroom.stanford.edu/MainFolder/courses/' + course 
+ '/videos/' 
  40             xmlUrl 
= baseUrl 
+ video 
+ '.xml' 
  41             mdoc 
= self
._download
_xml
(xmlUrl
, info
['id']) 
  43                 info
['title'] = mdoc
.findall('./title')[0].text
 
  44                 info
['url'] = baseUrl 
+ mdoc
.findall('./videoFile')[0].text
 
  46                 raise ExtractorError('Invalid metadata XML file') 
  48         elif mobj
.group('course'):  # A course page 
  49             course 
= mobj
.group('course') 
  57             coursepage 
= self
._download
_webpage
( 
  59                 note
='Downloading course info page', 
  60                 errnote
='Unable to download course info page') 
  62             info
['title'] = self
._html
_search
_regex
( 
  63                 r
'<h1>([^<]+)</h1>', coursepage
, 'title', default
=info
['id']) 
  65             info
['description'] = self
._html
_search
_regex
( 
  66                 r
'(?s)<description>([^<]+)</description>', 
  67                 coursepage
, 'description', fatal
=False) 
  69             links 
= orderedSet(re
.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage
)) 
  70             info
['entries'] = [self
.url_result( 
  71                 'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l
) 
  76                 'id': 'Stanford OpenClassroom', 
  81             info
['title'] = info
['id'] 
  83             rootURL 
= 'http://openclassroom.stanford.edu/MainFolder/HomePage.php' 
  84             rootpage 
= self
._download
_webpage
(rootURL
, info
['id'], 
  85                                               errnote
='Unable to download course info page') 
  87             links 
= orderedSet(re
.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage
)) 
  88             info
['entries'] = [self
.url_result( 
  89                 'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l
)