Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/arte.py
b061b9566168758465ad56f43b1f74b89b2cce10
[youtubedl] / youtube_dl / extractor / arte.py
1 import re
2 import json
3
4 from .common import InfoExtractor
5 from ..utils import (
6 # This is used by the not implemented extractLiveStream method
7 compat_urllib_parse,
8
9 ExtractorError,
10 unified_strdate,
11 )
12
class ArteTvIE(InfoExtractor):
    """Information extractor for videos of the arte.tv programme guide.

    Matches the French (``fr``) and German (``de``) guide URLs, for example
    http://www.arte.tv/guide/fr/emissions/AJT/arte-journal.
    Live stream pages are recognised but not supported yet.
    """

    # The dots of the host name are escaped so that only www.arte.tv matches.
    _VALID_URL = r'(?:http://)?www\.arte\.tv/guide/(?:fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
    # Searched in the id captured by _VALID_URL to recognise live stream pages.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    # TODO implement Live Stream
    # def extractLiveStream(self, url):
    #     video_lang = url.split('/')[-4]
    #     info = self.grep_webpage(
    #         url,
    #         r'src="(.*?/videothek_js.*?\.js)',
    #         0,
    #         [
    #             (1, 'url', u'Invalid URL: %s' % url)
    #         ]
    #     )
    #     http_host = url.split('/')[2]
    #     next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
    #     info = self.grep_webpage(
    #         next_url,
    #         r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
    #             '(http://.*?\.swf).*?' +
    #             '(rtmp://.*?)\'',
    #         re.DOTALL,
    #         [
    #             (1, 'path', u'could not extract video path: %s' % url),
    #             (2, 'player', u'could not extract video player: %s' % url),
    #             (3, 'url', u'could not extract video url: %s' % url)
    #         ]
    #     )
    #     video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def _real_extract(self, url):
        """Extract the metadata and the best quality media URL of a video.

        Downloads the guide page at *url*, reads the player JSON location
        from its ``arte_vp_url`` attribute, downloads that JSON and builds
        the info dictionary from its ``videoJsonPlayer`` entry.

        Returns a dict with the keys ``id``, ``title``, ``description``,
        ``upload_date``, ``thumbnail``, ``url`` and ``ext``, plus
        ``play_path`` when the selected format is an rtmp stream.

        Raises ExtractorError for live stream URLs, which are not supported
        yet, and when the player JSON does not list any format.
        """
        mobj = re.match(self._VALID_URL, url)
        # This is not a real id, it can be for example AJT for the news
        # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
        video_id = mobj.group('id')

        if re.search(self._LIVE_URL, video_id) is not None:
            raise ExtractorError(u'Arte live streams are not yet supported, sorry')
            # self.extractLiveStream(url)
            # return

        webpage = self._download_webpage(url, video_id)
        json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')

        json_info = self._download_webpage(json_url, video_id, 'Downloading info json')
        self.report_extraction(video_id)
        info = json.loads(json_info)
        player_info = info['videoJsonPlayer']

        info_dict = {
            'id': player_info['VID'],
            'title': player_info['VTI'],
            'description': player_info['VDE'],
            # Only the part of VDA before the first space is parsed as a date
            'upload_date': unified_strdate(player_info['VDA'].split(' ')[0]),
            'thumbnail': player_info['programImage'],
            # The extension is mp4 for both rtmp and non-rtmp formats
            'ext': 'mp4',
        }

        # We order the formats by quality (ascending height)
        formats = sorted(player_info['VSR'].values(),
                         key=lambda f: int(f['height']))
        if not formats:
            # Report the problem through the extractor error channel instead
            # of failing with an IndexError on the lookup below
            raise ExtractorError(u'No video formats found')
        # Pick the best quality
        format_info = formats[-1]
        if format_info['mediaType'] == u'rtmp':
            # For rtmp the stream server is the url and the media path is
            # passed separately as the play path
            info_dict['url'] = format_info['streamer']
            info_dict['play_path'] = 'mp4:' + format_info['url']
        else:
            info_dict['url'] = format_info['url']

        return info_dict