Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/arte.py

   1 import re
   2 import json
   3
   4 from .common import InfoExtractor
   5 from ..utils import (
   6     # This is used by the not implemented extractLiveStream method
   7     compat_urllib_parse,
   8
   9     ExtractorError,
  10     unified_strdate,
  11 )
  12
  13 class ArteTvIE(InfoExtractor):
  14     _VALID_URL = r'(?:http://)?www\.arte.tv/guide/(?:fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
  15     _LIVE_URL = r'index-[0-9]+\.html$'
  16
  17     IE_NAME = u'arte.tv'
  18
  19     # TODO implement Live Stream
  20     # def extractLiveStream(self, url):
  21     #     video_lang = url.split('/')[-4]
  22     #     info = self.grep_webpage(
  23     #         url,
  24     #         r'src="(.*?/videothek_js.*?\.js)',
  25     #         0,
  26     #         [
  27     #             (1, 'url', u'Invalid URL: %s' % url)
  28     #         ]
  29     #     )
  30     #     http_host = url.split('/')[2]
  31     #     next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
  32     #     info = self.grep_webpage(
  33     #         next_url,
  34     #         r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
  35     #             '(http://.*?\.swf).*?' +
  36     #             '(rtmp://.*?)\'',
  37     #         re.DOTALL,
  38     #         [
  39     #             (1, 'path',   u'could not extract video path: %s' % url),
  40     #             (2, 'player', u'could not extract video player: %s' % url),
  41     #             (3, 'url',    u'could not extract video url: %s' % url)
  42     #         ]
  43     #     )
  44     #     video_url = u'%s/%s' % (info.get('url'), info.get('path'))
  45
  46     def _real_extract(self, url):
  47         mobj = re.match(self._VALID_URL, url)
  48         name = mobj.group('name')
  49         # This is not a real id, it can be for example AJT for the news
  50         # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
  51         video_id = mobj.group('id')
  52
  53         if re.search(self._LIVE_URL, video_id) is not None:
  54             raise ExtractorError(u'Arte live streams are not yet supported, sorry')
  55             # self.extractLiveStream(url)
  56             # return
  57
  58         webpage = self._download_webpage(url, video_id)
  59         json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')
  60
  61         json_info = self._download_webpage(json_url, video_id, 'Downloading info json')
  62         self.report_extraction(video_id)
  63         info = json.loads(json_info)
  64         player_info = info['videoJsonPlayer']
  65
  66         info_dict = {'id': player_info['VID'],
  67                      'title': player_info['VTI'],
  68                      'description': player_info['VDE'],
  69                      'upload_date': unified_strdate(player_info['VDA'].split(' ')[0]),
  70                      'thumbnail': player_info['programImage'],
  71                      }
  72
  73         formats = player_info['VSR'].values()
  74         # We order the formats by quality
  75         formats = sorted(formats, key=lambda f: int(f['height']))
  76         # Pick the best quality
  77         format_info = formats[-1]
  78         if format_info['mediaType'] == u'rtmp':
  79             info_dict['url'] = format_info['streamer']
  80             info_dict['play_path'] = 'mp4:' + format_info['url']
  81             info_dict['ext'] = 'mp4'
  82         else:
  83             info_dict['url'] = format_info['url']
  84             info_dict['ext'] = 'mp4'
  85
  86         return info_dict