]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/arte.py
   4 from .common 
import InfoExtractor
 
   6     # This is used by the not implemented extractLiveStream method 
  13 class ArteTvIE(InfoExtractor
): 
  15     There are two sources of video in arte.tv: videos.arte.tv and 
  16     www.arte.tv/guide, the extraction process is different for each one. 
  17     The videos expire in 7 days, so we can't add tests. 
  19     _EMISSION_URL 
= r
'(?:http://)?www\.arte.tv/guide/(?:fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?' 
  20     _VIDEOS_URL 
= r
'(?:http://)?videos.arte.tv/(?:fr|de)/.*-(?P<id>.*?).html' 
  21     _LIVE_URL 
= r
'index-[0-9]+\.html$' 
  26     def suitable(cls
, url
): 
  27         return any(re
.match(regex
, url
) for regex 
in (cls
._EMISSION
_URL
, cls
._VIDEOS
_URL
)) 
  29     # TODO implement Live Stream 
  30     # def extractLiveStream(self, url): 
  31     #     video_lang = url.split('/')[-4] 
  32     #     info = self.grep_webpage( 
  34     #         r'src="(.*?/videothek_js.*?\.js)', 
  37     #             (1, 'url', u'Invalid URL: %s' % url) 
  40     #     http_host = url.split('/')[2] 
  41     #     next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url'))) 
  42     #     info = self.grep_webpage( 
  44     #         r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' + 
  45     #             '(http://.*?\.swf).*?' + 
  49     #             (1, 'path',   u'could not extract video path: %s' % url), 
  50     #             (2, 'player', u'could not extract video player: %s' % url), 
  51     #             (3, 'url',    u'could not extract video url: %s' % url) 
  54     #     video_url = u'%s/%s' % (info.get('url'), info.get('path')) 
  56     def _real_extract(self
, url
): 
  57         mobj 
= re
.match(self
._EMISSION
_URL
, url
) 
  59             name 
= mobj
.group('name') 
  60             # This is not a real id, it can be for example AJT for the news 
  61             # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal 
  62             video_id 
= mobj
.group('id') 
  63             return self
._extract
_emission
(url
, video_id
) 
  65         mobj 
= re
.match(self
._VIDEOS
_URL
, url
) 
  68             return self
._extract
_video
(url
, id) 
  70         if re
.search(self
._LIVE
_URL
, video_id
) is not None: 
  71             raise ExtractorError(u
'Arte live streams are not yet supported, sorry') 
  72             # self.extractLiveStream(url) 
  75     def _extract_emission(self
, url
, video_id
): 
  76         """Extract from www.arte.tv/guide""" 
  77         webpage 
= self
._download
_webpage
(url
, video_id
) 
  78         json_url 
= self
._html
_search
_regex
(r
'arte_vp_url="(.*?)"', webpage
, 'json url') 
  80         json_info 
= self
._download
_webpage
(json_url
, video_id
, 'Downloading info json') 
  81         self
.report_extraction(video_id
) 
  82         info 
= json
.loads(json_info
) 
  83         player_info 
= info
['videoJsonPlayer'] 
  85         info_dict 
= {'id': player_info
['VID'], 
  86                      'title': player_info
['VTI'], 
  87                      'description': player_info
['VDE'], 
  88                      'upload_date': unified_strdate(player_info
['VDA'].split(' ')[0]), 
  89                      'thumbnail': player_info
['programImage'], 
  93         formats 
= player_info
['VSR'].values() 
  94         # We order the formats by quality 
  95         formats 
= sorted(formats
, key
=lambda f
: int(f
['height'])) 
  96         # Pick the best quality 
  97         format_info 
= formats
[-1] 
  98         if format_info
['mediaType'] == u
'rtmp': 
  99             info_dict
['url'] = format_info
['streamer'] 
 100             info_dict
['play_path'] = 'mp4:' + format_info
['url'] 
 102             info_dict
['url'] = format_info
['url'] 
 106     def _extract_video(self
, url
, video_id
): 
 107         """Extract from videos.arte.tv""" 
 108         config_xml_url 
= url
.replace('/videos/', '/do_delegate/videos/') 
 109         config_xml_url 
= config_xml_url
.replace('.html', ',view,asPlayerXml.xml') 
 110         config_xml 
= self
._download
_webpage
(config_xml_url
, video_id
) 
 111         config_xml_url 
= self
._html
_search
_regex
(r
'<video lang=".*?" ref="(.*?)"', config_xml
, 'config xml url') 
 112         config_xml 
= self
._download
_webpage
(config_xml_url
, video_id
) 
 114         video_urls 
= list(re
.finditer(r
'<url quality="(?P<quality>.*?)">(?P<url>.*?)</url>', config_xml
)) 
 116             quality 
= m
.group('quality') 
 121         # We pick the best quality 
 122         video_urls 
= sorted(video_urls
, key
=_key
) 
 123         video_url 
= list(video_urls
)[-1].group('url') 
 125         title 
= self
._html
_search
_regex
(r
'<name>(.*?)</name>', config_xml
, 'title') 
 126         thumbnail 
= self
._html
_search
_regex
(r
'<firstThumbnailUrl>(.*?)</firstThumbnailUrl>', 
 127                                             config_xml
, 'thumbnail') 
 128         return {'id': video_id
, 
 130                 'thumbnail': thumbnail
,