]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/tvp.py
   2 from __future__ 
import unicode_literals
 
   7 from .common 
import InfoExtractor
 
  12     get_element_by_attribute
, 
  17 class TVPIE(InfoExtractor
): 
  19     IE_DESC 
= 'Telewizja Polska' 
  20     _VALID_URL 
= r
'https?://[^/]+\.tvp\.(?:pl|info)/(?:video/(?:[^,\s]*,)*|(?:(?!\d+/)[^/]+/)*)(?P<id>\d+)' 
  23         'url': 'https://vod.tvp.pl/video/czas-honoru,i-seria-odc-13,194536', 
  24         'md5': 'a21eb0aa862f25414430f15fdfb9e76c', 
  28             'title': 'Czas honoru, odc. 13 – Władek', 
  29             'description': 'md5:437f48b93558370b031740546b696e24', 
  32         'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176', 
  33         'md5': 'b0005b542e5b4de643a9690326ab1257', 
  37             'title': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata', 
  38             'description': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata', 
  41         # page id is not the same as video id(#7799) 
  42         'url': 'https://wiadomosci.tvp.pl/33908820/28092017-1930', 
  43         'md5': '84cd3c8aec4840046e5ab712416b73d0', 
  47             'title': 'Wiadomości, 28.09.2017, 19:30', 
  48             'description': 'Wydanie główne codziennego serwisu informacyjnego.' 
  50         'skip': 'HTTP Error 404: Not Found', 
  52         'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272', 
  53         'only_matching': True, 
  55         'url': 'http://wiadomosci.tvp.pl/25169746/24052016-1200', 
  56         'only_matching': True, 
  58         'url': 'http://krakow.tvp.pl/25511623/25lecie-mck-wyjatkowe-miejsce-na-mapie-krakowa', 
  59         'only_matching': True, 
  61         'url': 'http://teleexpress.tvp.pl/25522307/wierni-wzieli-udzial-w-procesjach', 
  62         'only_matching': True, 
  64         'url': 'http://sport.tvp.pl/25522165/krychowiak-uspokaja-w-sprawie-kontuzji-dwa-tygodnie-to-maksimum', 
  65         'only_matching': True, 
  67         'url': 'http://www.tvp.info/25511919/trwa-rewolucja-wladza-zdecydowala-sie-na-pogwalcenie-konstytucji', 
  68         'only_matching': True, 
  71     def _real_extract(self
, url
): 
  72         page_id 
= self
._match
_id
(url
) 
  73         webpage 
= self
._download
_webpage
(url
, page_id
) 
  74         video_id 
= self
._search
_regex
([ 
  75             r
'<iframe[^>]+src="[^"]*?object_id=(\d+)', 
  76             r
"object_id\s*:\s*'(\d+)'", 
  77             r
'data-video-id="(\d+)"'], webpage
, 'video id', default
=page_id
) 
  79             '_type': 'url_transparent', 
  80             'url': 'tvp:' + video_id
, 
  81             'description': self
._og
_search
_description
( 
  82                 webpage
, default
=None) or self
._html
_search
_meta
( 
  83                 'description', webpage
, default
=None), 
  84             'thumbnail': self
._og
_search
_thumbnail
(webpage
, default
=None), 
  89 class TVPEmbedIE(InfoExtractor
): 
  91     IE_DESC 
= 'Telewizja Polska' 
  92     _VALID_URL 
= r
'(?:tvp:|https?://[^/]+\.tvp\.(?:pl|info)/sess/tvplayer\.php\?.*?object_id=)(?P<id>\d+)' 
  96         'md5': 'a21eb0aa862f25414430f15fdfb9e76c', 
 100             'title': 'Czas honoru, odc. 13 – Władek', 
 104         'url': 'http://www.tvp.pl/sess/tvplayer.php?object_id=22670268', 
 105         'md5': '8c9cd59d16edabf39331f93bf8a766c7', 
 109             'title': 'Panorama, 07.12.2015, 15:40', 
 111         'skip': 'Transmisja została zakończona lub materiał niedostępny', 
 113         'url': 'tvp:22670268', 
 114         'only_matching': True, 
 117     def _real_extract(self
, url
): 
 118         video_id 
= self
._match
_id
(url
) 
 120         webpage 
= self
._download
_webpage
( 
 121             'http://www.tvp.pl/sess/tvplayer.php?object_id=%s' % video_id
, video_id
) 
 123         error 
= self
._html
_search
_regex
( 
 124             r
'(?s)<p[^>]+\bclass=["\']notAvailable__text
["\'][^>]*>(.+?)</p>', 
 125             webpage, 'error', default=None) or clean_html( 
 126             get_element_by_attribute('class', 'msg error', webpage)) 
 128             raise ExtractorError('%s said: %s' % ( 
 129                 self.IE_NAME, clean_html(error)), expected=True) 
 131         title = self._search_regex( 
 132             r'name\s*:\s*([\'"])Title\
1\s
*,\s
*value\s
*:\s
*\
1(?P
<title
>.+?
)\
1', 
 133             webpage, 'title
', group='title
') 
 134         series_title = self._search_regex( 
 135             r'name\s
*:\s
*([\'"])SeriesTitle\1\s*,\s*value\s*:\s*\1(?P<series>.+?)\1', 
 136             webpage, 'series', group='series', default=None) 
 138             title = '%s, %s' % (series_title, title) 
 140         thumbnail = self._search_regex( 
 141             r"poster\s
*:\s
*'([^']+)'", webpage, 'thumbnail
', default=None) 
 143         video_url = self._search_regex( 
 144             r'0:{src
:([\'"])(?P<url>.*?)\1', webpage, 
 145             'formats', group='url', default=None) 
 146         if not video_url or 'material_niedostepny.mp4' in video_url: 
 147             video_url = self._download_json( 
 148                 'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id, 
 149                 video_id)['video_url'] 
 152         video_url_base = self._search_regex( 
 153             r'(https?://.+?/video)(?:\.(?:ism|f4m|m3u8)|-\d+\.mp4)', 
 154             video_url, 'video base url', default=None) 
 156             # TODO: <Group> found instead of <AdaptationSet> in MPD manifest. 
 157             # It's not mentioned in MPEG-DASH standard. Figure that out. 
 158             # formats.extend(self._extract_mpd_formats( 
 159             #     video_url_base + '.ism/video.mpd', 
 160             #     video_id, mpd_id='dash', fatal=False)) 
 161             formats.extend(self._extract_ism_formats( 
 162                 video_url_base + '.ism/Manifest', 
 163                 video_id, 'mss', fatal=False)) 
 164             formats.extend(self._extract_f4m_formats( 
 165                 video_url_base + '.ism/video.f4m', 
 166                 video_id, f4m_id='hds', fatal=False)) 
 167             m3u8_formats = self._extract_m3u8_formats( 
 168                 video_url_base + '.ism/video.m3u8', video_id, 
 169                 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) 
 170             self._sort_formats(m3u8_formats) 
 171             m3u8_formats = list(filter( 
 172                 lambda f: f.get('vcodec') != 'none', m3u8_formats)) 
 173             formats.extend(m3u8_formats) 
 174             for i, m3u8_format in enumerate(m3u8_formats, 2): 
 175                 http_url = '%s-%d.mp4' % (video_url_base, i) 
 176                 if self._is_valid_url(http_url, video_id): 
 177                     f = m3u8_format.copy() 
 180                         'format_id': f['format_id'].replace('hls', 'http'), 
 186                 'format_id': 'direct', 
 188                 'ext': determine_ext(video_url, 'mp4'), 
 191         self._sort_formats(formats) 
 196             'thumbnail': thumbnail, 
 201 class TVPWebsiteIE(InfoExtractor): 
 202     IE_NAME = 'tvp:series' 
 203     _VALID_URL = r'https?://vod\.tvp\.pl/website/(?P<display_id>[^,]+),(?P<id>\d+)' 
 207         'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312/video', 
 211         'playlist_count': 115, 
 214         'url': 'https://vod.tvp.pl/website/gloria,35139666', 
 218             'title': 'Gloria, Gloria', 
 221             'skip_download': True, 
 223         'add_ie': ['TVPEmbed'], 
 225         'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312', 
 226         'only_matching': True, 
 229     def _entries(self, display_id, playlist_id): 
 230         url = 'https://vod.tvp.pl/website/%s,%s/video' % (display_id, playlist_id) 
 231         for page_num in itertools.count(1): 
 232             page = self._download_webpage( 
 233                 url, display_id, 'Downloading page %d' % page_num, 
 234                 query={'page': page_num}) 
 236             video_ids = orderedSet(re.findall( 
 237                 r'<a[^>]+\bhref=["\']/video
/%s,[^
,]+,(\d
+)' % display_id, 
 243             for video_id in video_ids: 
 244                 yield self.url_result( 
 245                     'tvp
:%s' % video_id, ie=TVPEmbedIE.ie_key(), 
 248     def _real_extract(self, url): 
 249         mobj = re.match(self._VALID_URL, url) 
 250         display_id, playlist_id = mobj.group('display_id
', 'id') 
 251         return self.playlist_result( 
 252             self._entries(display_id, playlist_id), playlist_id)