]>
Raphaƫl G. Git Repositories - youtubedl/blob - youtube_dl/extractor/stanfordoc.py
   3 import xml
.etree
.ElementTree
 
   5 from .common 
import InfoExtractor
 
  10     compat_urllib_request
, 
  18 class StanfordOpenClassroomIE(InfoExtractor
): 
  19     IE_NAME 
= u
'stanfordoc' 
  20     IE_DESC 
= u
'Stanford Open ClassRoom' 
  21     _VALID_URL 
= r
'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$' 
  23         u
'url': u
'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100', 
  24         u
'file': u
'PracticalUnix_intro-environment.mp4', 
  25         u
'md5': u
'544a9468546059d4e80d76265b0443b8', 
  27             u
"title": u
"Intro Environment" 
  31     def _real_extract(self
, url
): 
  32         mobj 
= re
.match(self
._VALID
_URL
, url
) 
  34             raise ExtractorError(u
'Invalid URL: %s' % url
) 
  36         if mobj
.group('course') and mobj
.group('video'): # A specific video 
  37             course 
= mobj
.group('course') 
  38             video 
= mobj
.group('video') 
  40                 'id': course 
+ '_' + video
, 
  45             self
.report_extraction(info
['id']) 
  46             baseUrl 
= 'http://openclassroom.stanford.edu/MainFolder/courses/' + course 
+ '/videos/' 
  47             xmlUrl 
= baseUrl 
+ video 
+ '.xml' 
  49                 metaXml 
= compat_urllib_request
.urlopen(xmlUrl
).read() 
  50             except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
: 
  51                 raise ExtractorError(u
'Unable to download video info XML: %s' % compat_str(err
)) 
  52             mdoc 
= xml
.etree
.ElementTree
.fromstring(metaXml
) 
  54                 info
['title'] = mdoc
.findall('./title')[0].text
 
  55                 info
['url'] = baseUrl 
+ mdoc
.findall('./videoFile')[0].text
 
  57                 raise ExtractorError(u
'Invalid metadata XML file') 
  58             info
['ext'] = info
['url'].rpartition('.')[2] 
  60         elif mobj
.group('course'): # A course page 
  61             course 
= mobj
.group('course') 
  69             coursepage 
= self
._download
_webpage
(url
, info
['id'], 
  70                                         note
='Downloading course info page', 
  71                                         errnote
='Unable to download course info page') 
  73             info
['title'] = self
._html
_search
_regex
('<h1>([^<]+)</h1>', coursepage
, 'title', default
=info
['id']) 
  75             info
['description'] = self
._html
_search
_regex
('<description>([^<]+)</description>', 
  76                 coursepage
, u
'description', fatal
=False) 
  78             links 
= orderedSet(re
.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage
)) 
  82                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage
), 
  86             for entry 
in info
['list']: 
  87                 assert entry
['type'] == 'reference' 
  88                 results 
+= self
.extract(entry
['url']) 
  92                 'id': 'Stanford OpenClassroom', 
  98             self
.report_download_webpage(info
['id']) 
  99             rootURL 
= 'http://openclassroom.stanford.edu/MainFolder/HomePage.php' 
 101                 rootpage 
= compat_urllib_request
.urlopen(rootURL
).read() 
 102             except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
: 
 103                 raise ExtractorError(u
'Unable to download course info page: ' + compat_str(err
)) 
 105             info
['title'] = info
['id'] 
 107             links 
= orderedSet(re
.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage
)) 
 111                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage
), 
 116             for entry 
in info
['list']: 
 117                 assert entry
['type'] == 'reference' 
 118                 results 
+= self
.extract(entry
['url'])