]>
Raphaƫl G. Git Repositories - youtubedl/blob - youtube_dl/extractor/stanfordoc.py
b27838bf9dc5ea430f01b054eb152d78fb946d0d
3 import xml
.etree
.ElementTree
5 from .common
import InfoExtractor
10 compat_urllib_request
,
18 class StanfordOpenClassroomIE(InfoExtractor
):
19 IE_NAME
= u
'stanfordoc'
20 IE_DESC
= u
'Stanford Open ClassRoom'
21 _VALID_URL
= r
'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
23 u
'url': u
'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100',
24 u
'file': u
'PracticalUnix_intro-environment.mp4',
25 u
'md5': u
'544a9468546059d4e80d76265b0443b8',
27 u
"title": u
"Intro Environment"
31 def _real_extract(self
, url
):
32 mobj
= re
.match(self
._VALID
_URL
, url
)
34 raise ExtractorError(u
'Invalid URL: %s' % url
)
36 if mobj
.group('course') and mobj
.group('video'): # A specific video
37 course
= mobj
.group('course')
38 video
= mobj
.group('video')
40 'id': course
+ '_' + video
,
45 self
.report_extraction(info
['id'])
46 baseUrl
= 'http://openclassroom.stanford.edu/MainFolder/courses/' + course
+ '/videos/'
47 xmlUrl
= baseUrl
+ video
+ '.xml'
49 metaXml
= compat_urllib_request
.urlopen(xmlUrl
).read()
50 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
51 raise ExtractorError(u
'Unable to download video info XML: %s' % compat_str(err
))
52 mdoc
= xml
.etree
.ElementTree
.fromstring(metaXml
)
54 info
['title'] = mdoc
.findall('./title')[0].text
55 info
['url'] = baseUrl
+ mdoc
.findall('./videoFile')[0].text
57 raise ExtractorError(u
'Invalid metadata XML file')
58 info
['ext'] = info
['url'].rpartition('.')[2]
60 elif mobj
.group('course'): # A course page
61 course
= mobj
.group('course')
69 coursepage
= self
._download
_webpage
(url
, info
['id'],
70 note
='Downloading course info page',
71 errnote
='Unable to download course info page')
73 info
['title'] = self
._html
_search
_regex
('<h1>([^<]+)</h1>', coursepage
, 'title', default
=info
['id'])
75 info
['description'] = self
._html
_search
_regex
('<description>([^<]+)</description>',
76 coursepage
, u
'description', fatal
=False)
78 links
= orderedSet(re
.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage
))
82 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage
),
86 for entry
in info
['list']:
87 assert entry
['type'] == 'reference'
88 results
+= self
.extract(entry
['url'])
92 'id': 'Stanford OpenClassroom',
98 self
.report_download_webpage(info
['id'])
99 rootURL
= 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
101 rootpage
= compat_urllib_request
.urlopen(rootURL
).read()
102 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
103 raise ExtractorError(u
'Unable to download course info page: ' + compat_str(err
))
105 info
['title'] = info
['id']
107 links
= orderedSet(re
.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage
))
111 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage
),
116 for entry
in info
['list']:
117 assert entry
['type'] == 'reference'
118 results
+= self
.extract(entry
['url'])