]>
Raphaƫl G. Git Repositories - youtubedl/blob - youtube_dl/extractor/stanfordoc.py
3 import xml
.etree
.ElementTree
5 from .common
import InfoExtractor
10 compat_urllib_request
,
18 class StanfordOpenClassroomIE(InfoExtractor
):
19 """Information extractor for Stanford's Open ClassRoom"""
21 _VALID_URL
= r
'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
22 IE_NAME
= u
'stanfordoc'
24 def _real_extract(self
, url
):
25 mobj
= re
.match(self
._VALID
_URL
, url
)
27 raise ExtractorError(u
'Invalid URL: %s' % url
)
29 if mobj
.group('course') and mobj
.group('video'): # A specific video
30 course
= mobj
.group('course')
31 video
= mobj
.group('video')
33 'id': course
+ '_' + video
,
38 self
.report_extraction(info
['id'])
39 baseUrl
= 'http://openclassroom.stanford.edu/MainFolder/courses/' + course
+ '/videos/'
40 xmlUrl
= baseUrl
+ video
+ '.xml'
42 metaXml
= compat_urllib_request
.urlopen(xmlUrl
).read()
43 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
44 raise ExtractorError(u
'Unable to download video info XML: %s' % compat_str(err
))
45 mdoc
= xml
.etree
.ElementTree
.fromstring(metaXml
)
47 info
['title'] = mdoc
.findall('./title')[0].text
48 info
['url'] = baseUrl
+ mdoc
.findall('./videoFile')[0].text
50 raise ExtractorError(u
'Invalid metadata XML file')
51 info
['ext'] = info
['url'].rpartition('.')[2]
53 elif mobj
.group('course'): # A course page
54 course
= mobj
.group('course')
62 coursepage
= self
._download
_webpage
(url
, info
['id'],
63 note
='Downloading course info page',
64 errnote
='Unable to download course info page')
66 info
['title'] = self
._html
_search
_regex
('<h1>([^<]+)</h1>', coursepage
, 'title', default
=info
['id'])
68 info
['description'] = self
._html
_search
_regex
('<description>([^<]+)</description>',
69 coursepage
, u
'description', fatal
=False)
71 links
= orderedSet(re
.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage
))
75 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage
),
79 for entry
in info
['list']:
80 assert entry
['type'] == 'reference'
81 results
+= self
.extract(entry
['url'])
85 'id': 'Stanford OpenClassroom',
91 self
.report_download_webpage(info
['id'])
92 rootURL
= 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
94 rootpage
= compat_urllib_request
.urlopen(rootURL
).read()
95 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
96 raise ExtractorError(u
'Unable to download course info page: ' + compat_str(err
))
98 info
['title'] = info
['id']
100 links
= orderedSet(re
.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage
))
104 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage
),
109 for entry
in info
['list']:
110 assert entry
['type'] == 'reference'
111 results
+= self
.extract(entry
['url'])