]>
Raphaƫl G. Git Repositories - youtubedl/blob - youtube_dl/extractor/stanfordoc.py
3 from .common
import InfoExtractor
11 class StanfordOpenClassroomIE(InfoExtractor
):
12 IE_NAME
= u
'stanfordoc'
13 IE_DESC
= u
'Stanford Open ClassRoom'
14 _VALID_URL
= r
'^(?:https?://)?openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
16 u
'url': u
'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100',
17 u
'file': u
'PracticalUnix_intro-environment.mp4',
18 u
'md5': u
'544a9468546059d4e80d76265b0443b8',
20 u
"title": u
"Intro Environment"
24 def _real_extract(self
, url
):
25 mobj
= re
.match(self
._VALID
_URL
, url
)
27 raise ExtractorError(u
'Invalid URL: %s' % url
)
29 if mobj
.group('course') and mobj
.group('video'): # A specific video
30 course
= mobj
.group('course')
31 video
= mobj
.group('video')
33 'id': course
+ '_' + video
,
38 self
.report_extraction(info
['id'])
39 baseUrl
= 'http://openclassroom.stanford.edu/MainFolder/courses/' + course
+ '/videos/'
40 xmlUrl
= baseUrl
+ video
+ '.xml'
41 mdoc
= self
._download
_xml
(xmlUrl
, info
['id'])
43 info
['title'] = mdoc
.findall('./title')[0].text
44 info
['url'] = baseUrl
+ mdoc
.findall('./videoFile')[0].text
46 raise ExtractorError(u
'Invalid metadata XML file')
47 info
['ext'] = info
['url'].rpartition('.')[2]
49 elif mobj
.group('course'): # A course page
50 course
= mobj
.group('course')
58 coursepage
= self
._download
_webpage
(url
, info
['id'],
59 note
='Downloading course info page',
60 errnote
='Unable to download course info page')
62 info
['title'] = self
._html
_search
_regex
('<h1>([^<]+)</h1>', coursepage
, 'title', default
=info
['id'])
64 info
['description'] = self
._html
_search
_regex
('<description>([^<]+)</description>',
65 coursepage
, u
'description', fatal
=False)
67 links
= orderedSet(re
.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage
))
71 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage
),
75 for entry
in info
['list']:
76 assert entry
['type'] == 'reference'
77 results
+= self
.extract(entry
['url'])
81 'id': 'Stanford OpenClassroom',
87 rootURL
= 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
88 rootpage
= self
._download
_webpage
(rootURL
, info
['id'],
89 errnote
=u
'Unable to download course info page')
91 info
['title'] = info
['id']
93 links
= orderedSet(re
.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage
))
97 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage
),
102 for entry
in info
['list']:
103 assert entry
['type'] == 'reference'
104 results
+= self
.extract(entry
['url'])