]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/arte.py
69b3b0ad7820600ef5107ad3d79230c0e4edcaac
   3 import xml
.etree
.ElementTree
 
   5 from .common 
import InfoExtractor
 
  12 class ArteTvIE(InfoExtractor
): 
  14     There are two sources of video in arte.tv: videos.arte.tv and 
  15     www.arte.tv/guide, the extraction process is different for each one. 
  16     The videos expire in 7 days, so we can't add tests. 
  18     _EMISSION_URL 
= r
'(?:http://)?www\.arte.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?' 
  19     _VIDEOS_URL 
= r
'(?:http://)?videos.arte.tv/(?P<lang>fr|de)/.*-(?P<id>.*?).html' 
  20     _LIVEWEB_URL 
= r
'(?:http://)?liveweb.arte.tv/(?P<lang>fr|de)/(?P<subpage>.+?)/(?P<name>.+)' 
  21     _LIVE_URL 
= r
'index-[0-9]+\.html$' 
  26     def suitable(cls
, url
): 
  27         return any(re
.match(regex
, url
) for regex 
in (cls
._EMISSION
_URL
, cls
._VIDEOS
_URL
, cls
._LIVEWEB
_URL
)) 
  29     # TODO implement Live Stream 
  30     # from ..utils import compat_urllib_parse 
  31     # def extractLiveStream(self, url): 
  32     #     video_lang = url.split('/')[-4] 
  33     #     info = self.grep_webpage( 
  35     #         r'src="(.*?/videothek_js.*?\.js)', 
  38     #             (1, 'url', u'Invalid URL: %s' % url) 
  41     #     http_host = url.split('/')[2] 
  42     #     next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url'))) 
  43     #     info = self.grep_webpage( 
  45     #         r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' + 
  46     #             '(http://.*?\.swf).*?' + 
  50     #             (1, 'path',   u'could not extract video path: %s' % url), 
  51     #             (2, 'player', u'could not extract video player: %s' % url), 
  52     #             (3, 'url',    u'could not extract video url: %s' % url) 
  55     #     video_url = u'%s/%s' % (info.get('url'), info.get('path')) 
  57     def _real_extract(self
, url
): 
  58         mobj 
= re
.match(self
._EMISSION
_URL
, url
) 
  60             lang 
= mobj
.group('lang') 
  61             # This is not a real id, it can be for example AJT for the news 
  62             # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal 
  63             video_id 
= mobj
.group('id') 
  64             return self
._extract
_emission
(url
, video_id
, lang
) 
  66         mobj 
= re
.match(self
._VIDEOS
_URL
, url
) 
  69             lang 
= mobj
.group('lang') 
  70             return self
._extract
_video
(url
, id, lang
) 
  72         mobj 
= re
.match(self
._LIVEWEB
_URL
, url
) 
  74             name 
= mobj
.group('name') 
  75             lang 
= mobj
.group('lang') 
  76             return self
._extract
_liveweb
(url
, name
, lang
) 
  78         if re
.search(self
._LIVE
_URL
, video_id
) is not None: 
  79             raise ExtractorError(u
'Arte live streams are not yet supported, sorry') 
  80             # self.extractLiveStream(url) 
  83     def _extract_emission(self
, url
, video_id
, lang
): 
  84         """Extract from www.arte.tv/guide""" 
  85         webpage 
= self
._download
_webpage
(url
, video_id
) 
  86         json_url 
= self
._html
_search
_regex
(r
'arte_vp_url="(.*?)"', webpage
, 'json url') 
  88         json_info 
= self
._download
_webpage
(json_url
, video_id
, 'Downloading info json') 
  89         self
.report_extraction(video_id
) 
  90         info 
= json
.loads(json_info
) 
  91         player_info 
= info
['videoJsonPlayer'] 
  93         info_dict 
= {'id': player_info
['VID'], 
  94                      'title': player_info
['VTI'], 
  95                      'description': player_info
.get('VDE'), 
  96                      'upload_date': unified_strdate(player_info
['VDA'].split(' ')[0]), 
  97                      'thumbnail': player_info
['programImage'], 
 101         formats 
= player_info
['VSR'].values() 
 103             # Return true if that format is in the language of the url 
 108             regexes 
= [r
'VO?%s' % l
, r
'VO?.-ST%s' % l
] 
 109             return any(re
.match(r
, f
['versionCode']) for r 
in regexes
) 
 110         # Some formats may not be in the same language as the url 
 111         formats 
= filter(_match_lang
, formats
) 
 112         # We order the formats by quality 
 113         formats 
= sorted(formats
, key
=lambda f
: int(f
['height'])) 
 114         # Prefer videos without subtitles in the same language 
 115         formats 
= sorted(formats
, key
=lambda f
: re
.match(r
'VO(F|A)-STM\1', f
['versionCode']) is None) 
 116         # Pick the best quality 
 117         format_info 
= formats
[-1] 
 118         if format_info
['mediaType'] == u
'rtmp': 
 119             info_dict
['url'] = format_info
['streamer'] 
 120             info_dict
['play_path'] = 'mp4:' + format_info
['url'] 
 122             info_dict
['url'] = format_info
['url'] 
 126     def _extract_video(self
, url
, video_id
, lang
): 
 127         """Extract from videos.arte.tv""" 
 128         ref_xml_url 
= url
.replace('/videos/', '/do_delegate/videos/') 
 129         ref_xml_url 
= ref_xml_url
.replace('.html', ',view,asPlayerXml.xml') 
 130         ref_xml 
= self
._download
_webpage
(ref_xml_url
, video_id
, note
=u
'Downloading metadata') 
 131         ref_xml_doc 
= xml
.etree
.ElementTree
.fromstring(ref_xml
) 
 132         config_node 
= find_xpath_attr(ref_xml_doc
, './/video', 'lang', lang
) 
 133         config_xml_url 
= config_node
.attrib
['ref'] 
 134         config_xml 
= self
._download
_webpage
(config_xml_url
, video_id
, note
=u
'Downloading configuration') 
 136         video_urls 
= list(re
.finditer(r
'<url quality="(?P<quality>.*?)">(?P<url>.*?)</url>', config_xml
)) 
 138             quality 
= m
.group('quality') 
 143         # We pick the best quality 
 144         video_urls 
= sorted(video_urls
, key
=_key
) 
 145         video_url 
= list(video_urls
)[-1].group('url') 
 147         title 
= self
._html
_search
_regex
(r
'<name>(.*?)</name>', config_xml
, 'title') 
 148         thumbnail 
= self
._html
_search
_regex
(r
'<firstThumbnailUrl>(.*?)</firstThumbnailUrl>', 
 149                                             config_xml
, 'thumbnail') 
 150         return {'id': video_id
, 
 152                 'thumbnail': thumbnail
, 
 157     def _extract_liveweb(self
, url
, name
, lang
): 
 158         """Extract from http://liveweb.arte.tv/""" 
 159         webpage 
= self
._download
_webpage
(url
, name
) 
 160         video_id 
= self
._search
_regex
(r
'eventId=(\d+?)("|&)', webpage
, u
'event id') 
 161         config_xml 
= self
._download
_webpage
('http://download.liveweb.arte.tv/o21/liveweb/events/event-%s.xml' % video_id
, 
 162                                             video_id
, u
'Downloading information') 
 163         config_doc 
= xml
.etree
.ElementTree
.fromstring(config_xml
.encode('utf-8')) 
 164         event_doc 
= config_doc
.find('event') 
 165         url_node 
= event_doc
.find('video').find('urlHd') 
 167             url_node 
= video_doc
.find('urlSd') 
 169         return {'id': video_id
, 
 170                 'title': event_doc
.find('name%s' % lang
.capitalize()).text
, 
 171                 'url': url_node
.text
.replace('MP4', 'mp4'), 
 173                 'thumbnail': self
._og
_search
_thumbnail
(webpage
),