]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/nova.py
   2 from __future__ 
import unicode_literals
 
   6 from .common 
import InfoExtractor
 
  18 class NovaEmbedIE(InfoExtractor
): 
  19     _VALID_URL 
= r
'https?://media\.cms\.nova\.cz/embed/(?P<id>[^/?#&]+)' 
  21         'url': 'https://media.cms.nova.cz/embed/8o0n0r?autoplay=1', 
  22         'md5': 'ee009bafcc794541570edd44b71cbea3', 
  27             'thumbnail': r
're:^https?://.*\.jpg', 
  32     def _real_extract(self
, url
): 
  33         video_id 
= self
._match
_id
(url
) 
  35         webpage 
= self
._download
_webpage
(url
, video_id
) 
  40         player 
= self
._parse
_json
( 
  42                 r
'Player\.init\s*\([^,]+,\s*({.+?})\s*,\s*{.+?}\s*\)\s*;', 
  43                 webpage
, 'player', default
='{}'), video_id
, fatal
=False) 
  45             for format_id
, format_list 
in player
['tracks'].items(): 
  46                 if not isinstance(format_list
, list): 
  47                     format_list 
= [format_list
] 
  48                 for format_dict 
in format_list
: 
  49                     if not isinstance(format_dict
, dict): 
  51                     format_url 
= url_or_none(format_dict
.get('src')) 
  52                     format_type 
= format_dict
.get('type') 
  53                     ext 
= determine_ext(format_url
) 
  54                     if (format_type 
== 'application/x-mpegURL' 
  55                             or format_id 
== 'HLS' or ext 
== 'm3u8'): 
  56                         formats
.extend(self
._extract
_m
3u8_formats
( 
  57                             format_url
, video_id
, 'mp4', 
  58                             entry_protocol
='m3u8_native', m3u8_id
='hls', 
  60                     elif (format_type 
== 'application/dash+xml' 
  61                           or format_id 
== 'DASH' or ext 
== 'mpd'): 
  62                         formats
.extend(self
._extract
_mpd
_formats
( 
  63                             format_url
, video_id
, mpd_id
='dash', fatal
=False)) 
  68             duration 
= int_or_none(player
.get('duration')) 
  70             # Old path, not actual as of 08.04.2020 
  71             bitrates 
= self
._parse
_json
( 
  73                     r
'(?s)(?:src|bitrates)\s*=\s*({.+?})\s*;', webpage
, 'formats'), 
  74                 video_id
, transform_source
=js_to_json
) 
  76             QUALITIES 
= ('lq', 'mq', 'hq', 'hd') 
  77             quality_key 
= qualities(QUALITIES
) 
  79             for format_id
, format_list 
in bitrates
.items(): 
  80                 if not isinstance(format_list
, list): 
  81                     format_list 
= [format_list
] 
  82                 for format_url 
in format_list
: 
  83                     format_url 
= url_or_none(format_url
) 
  86                     if format_id 
== 'hls': 
  87                         formats
.extend(self
._extract
_m
3u8_formats
( 
  88                             format_url
, video_id
, ext
='mp4', 
  89                             entry_protocol
='m3u8_native', m3u8_id
='hls', 
  96                     for quality 
in QUALITIES
: 
  97                         if '%s.mp4' % quality 
in format_url
: 
  98                             f_id 
+= '-%s' % quality
 
 100                                 'quality': quality_key(quality
), 
 101                                 'format_note': quality
.upper(), 
 104                     f
['format_id'] = f_id
 
 107         self
._sort
_formats
(formats
) 
 109         title 
= self
._og
_search
_title
( 
 110             webpage
, default
=None) or self
._search
_regex
( 
 111             (r
'<value>(?P<title>[^<]+)', 
 112              r
'videoTitle\s*:\s*(["\'])(?P
<value
>(?
:(?
!\
1).)+)\
1'), webpage, 
 113             'title
', group='value
') 
 114         thumbnail = self._og_search_thumbnail( 
 115             webpage, default=None) or self._search_regex( 
 116             r'poster\s
*:\s
*(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, 
 117             'thumbnail', fatal=False, group='value') 
 118         duration = int_or_none(self._search_regex( 
 119             r'videoDuration\s*:\s*(\d+)', webpage, 'duration', 
 125             'thumbnail': thumbnail, 
 126             'duration': duration, 
 131 class NovaIE(InfoExtractor): 
 132     IE_DESC = 'TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz' 
 133     _VALID_URL = r'https?://(?:[^.]+\.)?(?P<site>tv(?:noviny)?|tn|novaplus|vymena|fanda|krasna|doma|prask)\.nova\.cz/(?:[^/]+/)+(?P<id>[^/]+?)(?:\.html|/|$)' 
 135         'url': 'http://tn.nova.cz/clanek/tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci.html#player_13260', 
 136         'md5': '249baab7d0104e186e78b0899c7d5f28', 
 139             'display_id': 'tajemstvi-ukryte-v-podzemi-specialni-nemocnice-v-prazske-krci', 
 141             'title': 'Podzemní nemocnice v pražské Krči', 
 142             'description': 'md5:f0a42dd239c26f61c28f19e62d20ef53', 
 143             'thumbnail': r're:^https?://.*\.(?:jpg)', 
 146         'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html', 
 150             'title': 'Zaklínač 3: Divoký hon', 
 151             'description': 're:.*Pokud se stejně jako my nemůžete.*', 
 152             'thumbnail': r're:https?://.*\.jpg(\?.*)?', 
 153             'upload_date': '20150521', 
 157             'skip_download': True, 
 161         # media.cms.nova.cz embed 
 162         'url': 'https://novaplus.nova.cz/porad/ulice/epizoda/18760-2180-dil', 
 166             'title': '2180. díl', 
 167             'thumbnail': r're:^https?://.*\.jpg', 
 171             'skip_download': True, 
 173         'add_ie': [NovaEmbedIE.ie_key()], 
 174         'skip': 'CHYBA 404: STRÁNKA NENALEZENA', 
 176         'url': 'http://sport.tn.nova.cz/clanek/sport/hokej/nhl/zivot-jde-dal-hodnotil-po-vyrazeni-z-playoff-jiri-sekac.html', 
 177         'only_matching': True, 
 179         'url': 'http://fanda.nova.cz/clanek/fun-and-games/krvavy-epos-zaklinac-3-divoky-hon-vychazi-vyhrajte-ho-pro-sebe.html', 
 180         'only_matching': True, 
 182         'url': 'http://doma.nova.cz/clanek/zdravi/prijdte-se-zapsat-do-registru-kostni-drene-jiz-ve-stredu-3-cervna.html', 
 183         'only_matching': True, 
 185         'url': 'http://prask.nova.cz/clanek/novinky/co-si-na-sobe-nase-hvezdy-nechaly-pojistit.html', 
 186         'only_matching': True, 
 188         'url': 'http://tv.nova.cz/clanek/novinky/zivot-je-zivot-bondovsky-trailer.html', 
 189         'only_matching': True, 
 192     def _real_extract(self, url): 
 193         mobj = re.match(self._VALID_URL, url) 
 194         display_id = mobj.group('id') 
 195         site = mobj.group('site') 
 197         webpage = self._download_webpage(url, display_id) 
 199         description = clean_html(self._og_search_description(webpage, default=None)) 
 200         if site == 'novaplus': 
 201             upload_date = unified_strdate(self._search_regex( 
 202                 r'(\d{1,2}-\d{1,2}-\d{4})$', display_id, 'upload date', default=None)) 
 203         elif site == 'fanda': 
 204             upload_date = unified_strdate(self._search_regex( 
 205                 r'<span class="date_time
">(\d{1,2}\.\d{1,2}\.\d{4})', webpage, 'upload date', default=None)) 
 210         embed_id = self._search_regex( 
 211             r'<iframe[^>]+\bsrc=["\'](?
:https?
:)?
//media\
.cms\
.nova\
.cz
/embed
/([^
/?
#&]+)', 
 212             webpage
, 'embed url', default
=None) 
 215                 '_type': 'url_transparent', 
 216                 'url': 'https://media.cms.nova.cz/embed/%s' % embed_id
, 
 217                 'ie_key': NovaEmbedIE
.ie_key(), 
 219                 'description': description
, 
 220                 'upload_date': upload_date
 
 223         video_id 
= self
._search
_regex
( 
 224             [r
"(?:media|video_id)\s*:\s*'(\d+)'", 
 226              r
'id="article_video_(\d+)"', 
 227              r
'id="player_(\d+)"'], 
 230         config_url 
= self
._search
_regex
( 
 231             r
'src="(https?://(?:tn|api)\.nova\.cz/bin/player/videojs/config\.php\?[^"]+)"', 
 232             webpage
, 'config url', default
=None) 
 236             player 
= self
._parse
_json
( 
 238                     r
'(?s)Player\s*\(.+?\s*,\s*({.+?\bmedia\b["\']?\s
*:\s
*["\']?\d+.+?})\s*\)', webpage, 
 239                     'player', default='{}'), 
 240                 video_id, transform_source=js_to_json, fatal=False) 
 242                 config_url = url_or_none(player.get('configUrl')) 
 243                 params = player.get('configParams') 
 244                 if isinstance(params, dict): 
 245                     config_params = params 
 248             DEFAULT_SITE_ID = '23000' 
 250                 'tvnoviny': DEFAULT_SITE_ID, 
 251                 'novaplus': DEFAULT_SITE_ID, 
 252                 'vymena': DEFAULT_SITE_ID, 
 253                 'krasna': DEFAULT_SITE_ID, 
 259             site_id = self._search_regex( 
 260                 r'site=(\d+)', webpage, 'site id', default=None) or SITES.get( 
 261                 site, DEFAULT_SITE_ID) 
 263             config_url = 'https://api.nova.cz/bin/player/videojs/config.php' 
 271         config = self._download_json( 
 272             config_url, display_id, 
 273             'Downloading config JSON', query=config_params, 
 274             transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1]) 
 276         mediafile = config['mediafile'] 
 277         video_url = mediafile['src'] 
 279         m = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>[^/]+?))/&*(?P<playpath>.+)$', video_url) 
 282                 'url': m.group('url'), 
 283                 'app': m.group('app'), 
 284                 'play_path': m.group('playpath'), 
 285                 'player_path': 'http://tvnoviny.nova.cz/static/shared/app/videojs/video-js.swf', 
 292         self._sort_formats(formats) 
 294         title = mediafile.get('meta', {}).get('title') or self._og_search_title(webpage) 
 295         thumbnail = config.get('poster') 
 299             'display_id': display_id, 
 301             'description': description, 
 302             'upload_date': upload_date, 
 303             'thumbnail': thumbnail,