]>
Raphaƫl G. Git Repositories - youtubedl/blob - youtube_dl/extractor/stanfordoc.py
1 from __future__
import unicode_literals
5 from .common
import InfoExtractor
13 class StanfordOpenClassroomIE(InfoExtractor
):
14 IE_NAME
= 'stanfordoc'
15 IE_DESC
= 'Stanford Open ClassRoom'
16 _VALID_URL
= r
'https?://openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
18 'url': 'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100',
19 'md5': '544a9468546059d4e80d76265b0443b8',
21 'id': 'PracticalUnix_intro-environment',
23 'title': 'Intro Environment',
27 def _real_extract(self
, url
):
28 mobj
= re
.match(self
._VALID
_URL
, url
)
30 if mobj
.group('course') and mobj
.group('video'): # A specific video
31 course
= mobj
.group('course')
32 video
= mobj
.group('video')
34 'id': course
+ '_' + video
,
39 baseUrl
= 'http://openclassroom.stanford.edu/MainFolder/courses/' + course
+ '/videos/'
40 xmlUrl
= baseUrl
+ video
+ '.xml'
41 mdoc
= self
._download
_xml
(xmlUrl
, info
['id'])
43 info
['title'] = mdoc
.findall('./title')[0].text
44 info
['url'] = baseUrl
+ mdoc
.findall('./videoFile')[0].text
46 raise ExtractorError('Invalid metadata XML file')
48 elif mobj
.group('course'): # A course page
49 course
= mobj
.group('course')
57 coursepage
= self
._download
_webpage
(
59 note
='Downloading course info page',
60 errnote
='Unable to download course info page')
62 info
['title'] = self
._html
_search
_regex
(
63 r
'<h1>([^<]+)</h1>', coursepage
, 'title', default
=info
['id'])
65 info
['description'] = self
._html
_search
_regex
(
66 r
'(?s)<description>([^<]+)</description>',
67 coursepage
, 'description', fatal
=False)
69 links
= orderedSet(re
.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage
))
70 info
['entries'] = [self
.url_result(
71 'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l
)
76 'id': 'Stanford OpenClassroom',
81 info
['title'] = info
['id']
83 rootURL
= 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
84 rootpage
= self
._download
_webpage
(rootURL
, info
['id'],
85 errnote
='Unable to download course info page')
87 links
= orderedSet(re
.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage
))
88 info
['entries'] = [self
.url_result(
89 'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l
)