Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/tvp.py

   1 # -*- coding: utf-8 -*-
   2 from __future__ import unicode_literals
   3
   4 import re
   5
   6 from .common import InfoExtractor
   7
   8
   9 class TvpIE(InfoExtractor):
  10     IE_NAME = 'tvp.pl'
  11     _VALID_URL = r'https?://(?:vod|www)\.tvp\.pl/.*/(?P<id>\d+)$'
  12
  13     _TESTS = [{
  14         'url': 'http://vod.tvp.pl/filmy-fabularne/filmy-za-darmo/ogniem-i-mieczem/wideo/odc-2/4278035',
  15         'md5': 'cdd98303338b8a7f7abab5cd14092bf2',
  16         'info_dict': {
  17             'id': '4278035',
  18             'ext': 'wmv',
  19             'title': 'Ogniem i mieczem, odc. 2',
  20         },
  21     }, {
  22         'url': 'http://vod.tvp.pl/seriale/obyczajowe/czas-honoru/sezon-1-1-13/i-seria-odc-13/194536',
  23         'md5': '8aa518c15e5cc32dfe8db400dc921fbb',
  24         'info_dict': {
  25             'id': '194536',
  26             'ext': 'mp4',
  27             'title': 'Czas honoru, I seria – odc. 13',
  28         },
  29     }, {
  30         'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176',
  31         'md5': 'c3b15ed1af288131115ff17a17c19dda',
  32         'info_dict': {
  33             'id': '17916176',
  34             'ext': 'mp4',
  35             'title': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata',
  36         },
  37     }, {
  38         'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272',
  39         'md5': 'c3b15ed1af288131115ff17a17c19dda',
  40         'info_dict': {
  41             'id': '17834272',
  42             'ext': 'mp4',
  43             'title': 'Na sygnale, odc. 39',
  44         },
  45     }]
  46
  47     def _real_extract(self, url):
  48         video_id = self._match_id(url)
  49
  50         webpage = self._download_webpage(
  51             'http://www.tvp.pl/sess/tvplayer.php?object_id=%s' % video_id, video_id)
  52
  53         title = self._search_regex(
  54             r'name\s*:\s*([\'"])Title\1\s*,\s*value\s*:\s*\1(?P<title>.+?)\1',
  55             webpage, 'title', group='title')
  56         series_title = self._search_regex(
  57             r'name\s*:\s*([\'"])SeriesTitle\1\s*,\s*value\s*:\s*\1(?P<series>.+?)\1',
  58             webpage, 'series', group='series', default=None)
  59         if series_title:
  60             title = '%s, %s' % (series_title, title)
  61
  62         thumbnail = self._search_regex(
  63             r"poster\s*:\s*'([^']+)'", webpage, 'thumbnail', default=None)
  64
  65         video_url = self._search_regex(
  66             r'0:{src:([\'"])(?P<url>.*?)\1', webpage, 'formats', group='url', default=None)
  67         if not video_url:
  68             video_url = self._download_json(
  69                 'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id,
  70                 video_id)['video_url']
  71
  72         ext = video_url.rsplit('.', 1)[-1]
  73         if ext != 'ism/manifest':
  74             if '/' in ext:
  75                 ext = 'mp4'
  76             formats = [{
  77                 'format_id': 'direct',
  78                 'url': video_url,
  79                 'ext': ext,
  80             }]
  81         else:
  82             m3u8_url = re.sub('([^/]*)\.ism/manifest', r'\1.ism/\1.m3u8', video_url)
  83             formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4')
  84
  85         self._sort_formats(formats)
  86
  87         return {
  88             'id': video_id,
  89             'title': title,
  90             'thumbnail': thumbnail,
  91             'formats': formats,
  92         }
  93
  94
  95 class TvpSeriesIE(InfoExtractor):
  96     IE_NAME = 'tvp.pl:Series'
  97     _VALID_URL = r'https?://vod\.tvp\.pl/(?:[^/]+/){2}(?P<id>[^/]+)/?$'
  98
  99     _TESTS = [{
 100         'url': 'http://vod.tvp.pl/filmy-fabularne/filmy-za-darmo/ogniem-i-mieczem',
 101         'info_dict': {
 102             'title': 'Ogniem i mieczem',
 103             'id': '4278026',
 104         },
 105         'playlist_count': 4,
 106     }, {
 107         'url': 'http://vod.tvp.pl/audycje/podroze/boso-przez-swiat',
 108         'info_dict': {
 109             'title': 'Boso przez świat',
 110             'id': '9329207',
 111         },
 112         'playlist_count': 86,
 113     }]
 114
 115     def _real_extract(self, url):
 116         display_id = self._match_id(url)
 117         webpage = self._download_webpage(url, display_id, tries=5)
 118
 119         title = self._html_search_regex(
 120             r'(?s) id=[\'"]path[\'"]>(?:.*? / ){2}(.*?)</span>', webpage, 'series')
 121         playlist_id = self._search_regex(r'nodeId:\s*(\d+)', webpage, 'playlist id')
 122         playlist = self._download_webpage(
 123             'http://vod.tvp.pl/vod/seriesAjax?type=series&nodeId=%s&recommend'
 124             'edId=0&sort=&page=0&pageSize=10000' % playlist_id, display_id, tries=5,
 125             note='Downloading playlist')
 126
 127         videos_paths = re.findall(
 128             '(?s)class="shortTitle">.*?href="(/[^"]+)', playlist)
 129         entries = [
 130             self.url_result('http://vod.tvp.pl%s' % v_path, ie=TvpIE.ie_key())
 131             for v_path in videos_paths]
 132
 133         return {
 134             '_type': 'playlist',
 135             'id': playlist_id,
 136             'display_id': display_id,
 137             'title': title,
 138             'entries': entries,
 139         }