Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/expressen.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 from .common import InfoExtractor
   5 from ..utils import (
   6     determine_ext,
   7     int_or_none,
   8     unescapeHTML,
   9     unified_timestamp,
  10 )
  11
  12
  13 class ExpressenIE(InfoExtractor):
  14     _VALID_URL = r'https?://(?:www\.)?expressen\.se/tv/(?:[^/]+/)*(?P<id>[^/?#&]+)'
  15     _TESTS = [{
  16         'url': 'https://www.expressen.se/tv/ledare/ledarsnack/ledarsnack-om-arbetslosheten-bland-kvinnor-i-speciellt-utsatta-omraden/',
  17         'md5': '2fbbe3ca14392a6b1b36941858d33a45',
  18         'info_dict': {
  19             'id': '8690962',
  20             'ext': 'mp4',
  21             'title': 'Ledarsnack: Om arbetslösheten bland kvinnor i speciellt utsatta områden',
  22             'description': 'md5:f38c81ff69f3de4d269bbda012fcbbba',
  23             'thumbnail': r're:^https?://.*\.jpg$',
  24             'duration': 788,
  25             'timestamp': 1526639109,
  26             'upload_date': '20180518',
  27         },
  28     }, {
  29         'url': 'https://www.expressen.se/tv/kultur/kulturdebatt-med-expressens-karin-olsson/',
  30         'only_matching': True,
  31     }]
  32
  33     def _real_extract(self, url):
  34         display_id = self._match_id(url)
  35
  36         webpage = self._download_webpage(url, display_id)
  37
  38         def extract_data(name):
  39             return self._parse_json(
  40                 self._search_regex(
  41                     r'data-%s=(["\'])(?P<value>(?:(?!\1).)+)\1' % name,
  42                     webpage, 'info', group='value'),
  43                 display_id, transform_source=unescapeHTML)
  44
  45         info = extract_data('video-tracking-info')
  46         video_id = info['videoId']
  47
  48         data = extract_data('article-data')
  49         stream = data['stream']
  50
  51         if determine_ext(stream) == 'm3u8':
  52             formats = self._extract_m3u8_formats(
  53                 stream, display_id, 'mp4', entry_protocol='m3u8_native',
  54                 m3u8_id='hls')
  55         else:
  56             formats = [{
  57                 'url': stream,
  58             }]
  59         self._sort_formats(formats)
  60
  61         title = info.get('titleRaw') or data['title']
  62         description = info.get('descriptionRaw')
  63         thumbnail = info.get('socialMediaImage') or data.get('image')
  64         duration = int_or_none(info.get('videoTotalSecondsDuration') or
  65                                data.get('totalSecondsDuration'))
  66         timestamp = unified_timestamp(info.get('publishDate'))
  67
  68         return {
  69             'id': video_id,
  70             'display_id': display_id,
  71             'title': title,
  72             'description': description,
  73             'thumbnail': thumbnail,
  74             'duration': duration,
  75             'timestamp': timestamp,
  76             'formats': formats,
  77         }