]>
Raphaƫl G. Git Repositories - youtubedl/blob - youtube_dl/extractor/stanfordoc.py
   3 from .common 
import InfoExtractor
 
  11 class StanfordOpenClassroomIE(InfoExtractor
): 
  12     IE_NAME 
= u
'stanfordoc' 
  13     IE_DESC 
= u
'Stanford Open ClassRoom' 
  14     _VALID_URL 
= r
'^(?:https?://)?openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$' 
  16         u
'url': u
'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100', 
  17         u
'file': u
'PracticalUnix_intro-environment.mp4', 
  18         u
'md5': u
'544a9468546059d4e80d76265b0443b8', 
  20             u
"title": u
"Intro Environment" 
  24     def _real_extract(self
, url
): 
  25         mobj 
= re
.match(self
._VALID
_URL
, url
) 
  27             raise ExtractorError(u
'Invalid URL: %s' % url
) 
  29         if mobj
.group('course') and mobj
.group('video'): # A specific video 
  30             course 
= mobj
.group('course') 
  31             video 
= mobj
.group('video') 
  33                 'id': course 
+ '_' + video
, 
  38             self
.report_extraction(info
['id']) 
  39             baseUrl 
= 'http://openclassroom.stanford.edu/MainFolder/courses/' + course 
+ '/videos/' 
  40             xmlUrl 
= baseUrl 
+ video 
+ '.xml' 
  41             mdoc 
= self
._download
_xml
(xmlUrl
, info
['id']) 
  43                 info
['title'] = mdoc
.findall('./title')[0].text
 
  44                 info
['url'] = baseUrl 
+ mdoc
.findall('./videoFile')[0].text
 
  46                 raise ExtractorError(u
'Invalid metadata XML file') 
  47             info
['ext'] = info
['url'].rpartition('.')[2] 
  49         elif mobj
.group('course'): # A course page 
  50             course 
= mobj
.group('course') 
  58             coursepage 
= self
._download
_webpage
(url
, info
['id'], 
  59                                         note
='Downloading course info page', 
  60                                         errnote
='Unable to download course info page') 
  62             info
['title'] = self
._html
_search
_regex
('<h1>([^<]+)</h1>', coursepage
, 'title', default
=info
['id']) 
  64             info
['description'] = self
._html
_search
_regex
('<description>([^<]+)</description>', 
  65                 coursepage
, u
'description', fatal
=False) 
  67             links 
= orderedSet(re
.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage
)) 
  71                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage
), 
  75             for entry 
in info
['list']: 
  76                 assert entry
['type'] == 'reference' 
  77                 results 
+= self
.extract(entry
['url']) 
  81                 'id': 'Stanford OpenClassroom', 
  87             rootURL 
= 'http://openclassroom.stanford.edu/MainFolder/HomePage.php' 
  88             rootpage 
= self
._download
_webpage
(rootURL
, info
['id'], 
  89                 errnote
=u
'Unable to download course info page') 
  91             info
['title'] = info
['id'] 
  93             links 
= orderedSet(re
.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage
)) 
  97                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage
), 
 102             for entry 
in info
['list']: 
 103                 assert entry
['type'] == 'reference' 
 104                 results 
+= self
.extract(entry
['url'])