]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/stanfordoc.py
8d3e32ab9755f7d365a0c346126d450d65ccac51
   3 import xml
.etree
.ElementTree
 
   5 from .common 
import InfoExtractor
 
  10     compat_urllib_request
, 
  18 class StanfordOpenClassroomIE(InfoExtractor
): 
  19     """Information extractor for Stanford's Open ClassRoom""" 
  21     _VALID_URL 
= r
'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$' 
  22     IE_NAME 
= u
'stanfordoc' 
  24     def _real_extract(self
, url
): 
  25         mobj 
= re
.match(self
._VALID
_URL
, url
) 
  27             raise ExtractorError(u
'Invalid URL: %s' % url
) 
  29         if mobj
.group('course') and mobj
.group('video'): # A specific video 
  30             course 
= mobj
.group('course') 
  31             video 
= mobj
.group('video') 
  33                 'id': course 
+ '_' + video
, 
  38             self
.report_extraction(info
['id']) 
  39             baseUrl 
= 'http://openclassroom.stanford.edu/MainFolder/courses/' + course 
+ '/videos/' 
  40             xmlUrl 
= baseUrl 
+ video 
+ '.xml' 
  42                 metaXml 
= compat_urllib_request
.urlopen(xmlUrl
).read() 
  43             except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
: 
  44                 raise ExtractorError(u
'Unable to download video info XML: %s' % compat_str(err
)) 
  45             mdoc 
= xml
.etree
.ElementTree
.fromstring(metaXml
) 
  47                 info
['title'] = mdoc
.findall('./title')[0].text
 
  48                 info
['url'] = baseUrl 
+ mdoc
.findall('./videoFile')[0].text
 
  50                 raise ExtractorError(u
'Invalid metadata XML file') 
  51             info
['ext'] = info
['url'].rpartition('.')[2] 
  53         elif mobj
.group('course'): # A course page 
  54             course 
= mobj
.group('course') 
  62             coursepage 
= self
._download
_webpage
(url
, info
['id'], 
  63                                         note
='Downloading course info page', 
  64                                         errnote
='Unable to download course info page') 
  66             info
['title'] = self
._html
_search
_regex
('<h1>([^<]+)</h1>', coursepage
, 'title', default
=info
['id']) 
  68             info
['description'] = self
._html
_search
_regex
('<description>([^<]+)</description>', 
  69                 coursepage
, u
'description', fatal
=False) 
  71             links 
= orderedSet(re
.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage
)) 
  75                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage
), 
  79             for entry 
in info
['list']: 
  80                 assert entry
['type'] == 'reference' 
  81                 results 
+= self
.extract(entry
['url']) 
  85                 'id': 'Stanford OpenClassroom', 
  91             self
.report_download_webpage(info
['id']) 
  92             rootURL 
= 'http://openclassroom.stanford.edu/MainFolder/HomePage.php' 
  94                 rootpage 
= compat_urllib_request
.urlopen(rootURL
).read() 
  95             except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
: 
  96                 raise ExtractorError(u
'Unable to download course info page: ' + compat_str(err
)) 
  98             info
['title'] = info
['id'] 
 100             links 
= orderedSet(re
.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage
)) 
 104                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage
), 
 109             for entry 
in info
['list']: 
 110                 assert entry
['type'] == 'reference' 
 111                 results 
+= self
.extract(entry
['url'])