]>
Raphaƫl G. Git Repositories - youtubedl/blob - youtube_dl/extractor/rai.py
81eb9db85b9127e292894e22c9dd6272b134832a
   1 from __future__ 
import unicode_literals
 
   5 from .common 
import InfoExtractor
 
  28 class RaiBaseIE(InfoExtractor
): 
  29     _UUID_RE 
= r
'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' 
  30     _GEO_COUNTRIES 
= ['IT'] 
  33     def _extract_relinker_info(self
, relinker_url
, video_id
): 
  39         for platform 
in ('mon', 'flash', 'native'): 
  40             relinker 
= self
._download
_xml
( 
  41                 relinker_url
, video_id
, 
  42                 note
='Downloading XML metadata for platform %s' % platform
, 
  43                 transform_source
=fix_xml_ampersands
, 
  44                 query
={'output': 45, 'pl': platform
}, 
  45                 headers
=self
.geo_verification_headers()) 
  48                 geoprotection 
= xpath_text( 
  49                     relinker
, './geoprotection', default
=None) == 'Y' 
  53                     relinker
, './is_live', default
=None) == 'Y' 
  55                 duration 
= parse_duration(xpath_text( 
  56                     relinker
, './duration', default
=None)) 
  58             url_elem 
= find_xpath_attr(relinker
, './url', 'type', 'content') 
  62             media_url 
= url_elem
.text
 
  64             # This does not imply geo restriction (e.g. 
  65             # http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html) 
  66             if media_url 
== 'http://download.rai.it/video_no_available.mp4': 
  69             ext 
= determine_ext(media_url
) 
  70             if (ext 
== 'm3u8' and platform 
!= 'mon') or (ext 
== 'f4m' and platform 
!= 'flash'): 
  74                 formats
.extend(self
._extract
_m
3u8_formats
( 
  75                     media_url
, video_id
, 'mp4', 'm3u8_native', 
  76                     m3u8_id
='hls', fatal
=False)) 
  78                 manifest_url 
= update_url_query( 
  79                     media_url
.replace('manifest#live_hds.f4m', 'manifest.f4m'), 
  80                     {'hdcore': '3.7.0', 'plugin': 'aasp-3.7.0.39.44'}) 
  81                 formats
.extend(self
._extract
_f
4m
_formats
( 
  82                     manifest_url
, video_id
, f4m_id
='hds', fatal
=False)) 
  84                 bitrate 
= int_or_none(xpath_text(relinker
, 'bitrate')) 
  87                     'tbr': bitrate 
if bitrate 
> 0 else None, 
  88                     'format_id': 'http-%d' % bitrate 
if bitrate 
> 0 else 'http', 
  91         if not formats 
and geoprotection 
is True: 
  92             self
.raise_geo_restricted(countries
=self
._GEO
_COUNTRIES
) 
  94         return dict((k
, v
) for k
, v 
in { 
  98         }.items() if v 
is not None) 
 101     def _extract_subtitles(url
, subtitle_url
): 
 103         if subtitle_url 
and isinstance(subtitle_url
, compat_str
): 
 104             subtitle_url 
= urljoin(url
, subtitle_url
) 
 111             if subtitle_url
.endswith(STL_EXT
): 
 112                 srt_url 
= subtitle_url
[:-len(STL_EXT
)] + SRT_EXT
 
 113                 subtitles
['it'].append({ 
 120 class RaiPlayIE(RaiBaseIE
): 
 121     _VALID_URL 
= r
'(?P<url>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>%s)\.html)' % RaiBaseIE
._UUID
_RE
 
 123         'url': 'http://www.raiplay.it/video/2016/10/La-Casa-Bianca-e06118bb-59a9-4636-b914-498e4cfd2c66.html?source=twitter', 
 124         'md5': '340aa3b7afb54bfd14a8c11786450d76', 
 126             'id': 'e06118bb-59a9-4636-b914-498e4cfd2c66', 
 128             'title': 'La Casa Bianca', 
 129             'alt_title': 'S2016 - Puntata del 23/10/2016', 
 130             'description': 'md5:a09d45890850458077d1f68bb036e0a5', 
 131             'thumbnail': r
're:^https?://.*\.jpg$', 
 135             'timestamp': 1477764300, 
 136             'upload_date': '20161029', 
 137             'series': 'La Casa Bianca', 
 141         'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', 
 142         'md5': '8970abf8caf8aef4696e7b1f2adfc696', 
 144             'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', 
 146             'title': 'Report del 07/04/2014', 
 147             'alt_title': 'S2013/14 - Puntata del 07/04/2014', 
 148             'description': 'md5:f27c544694cacb46a078db84ec35d2d9', 
 149             'thumbnail': r
're:^https?://.*\.jpg$', 
 158             'skip_download': True, 
 161         'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', 
 162         'only_matching': True, 
 165     def _real_extract(self
, url
): 
 166         mobj 
= re
.match(self
._VALID
_URL
, url
) 
 167         url
, video_id 
= mobj
.group('url', 'id') 
 169         media 
= self
._download
_json
( 
 170             '%s?json' % url
, video_id
, 'Downloading video JSON') 
 172         title 
= media
['name'] 
 174         video 
= media
['video'] 
 176         relinker_info 
= self
._extract
_relinker
_info
(video
['contentUrl'], video_id
) 
 177         self
._sort
_formats
(relinker_info
['formats']) 
 180         if 'images' in media
: 
 181             for _
, value 
in media
.get('images').items(): 
 184                         'url': value
.replace('[RESOLUTION]', '600x400') 
 187         timestamp 
= unified_timestamp(try_get( 
 188             media
, lambda x
: x
['availabilities'][0]['start'], compat_str
)) 
 190         subtitles 
= self
._extract
_subtitles
(url
, video
.get('subtitles')) 
 195             'alt_title': media
.get('subtitle'), 
 196             'description': media
.get('description'), 
 197             'uploader': media
.get('channel'), 
 198             'creator': media
.get('editor'), 
 199             'duration': parse_duration(video
.get('duration')), 
 200             'timestamp': timestamp
, 
 201             'thumbnails': thumbnails
, 
 203                 media
, lambda x
: x
['isPartOf']['name'], compat_str
), 
 204             'season_number': int_or_none(try_get( 
 205                 media
, lambda x
: x
['isPartOf']['numeroStagioni'])), 
 206             'season': media
.get('stagione') or None, 
 207             'subtitles': subtitles
, 
 210         info
.update(relinker_info
) 
 215 class RaiIE(RaiBaseIE
): 
 216     _VALID_URL 
= r
'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/dl/.+?-(?P<id>%s)(?:-.+?)?\.html' % RaiBaseIE
._UUID
_RE
 
 218         # var uniquename = "ContentItem-..." 
 219         # data-id="ContentItem-..." 
 220         'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', 
 222             'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9', 
 224             'title': 'TG PRIMO TEMPO', 
 225             'thumbnail': r
're:^https?://.*\.jpg$', 
 227             'upload_date': '20140612', 
 230         # with ContentItem in many metas 
 231         'url': 'http://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html', 
 233             'id': '1632c009-c843-4836-bb65-80c33084a64b', 
 235             'title': 'Weekend al cinema, da Hollywood arriva il thriller di Tate Taylor "La ragazza del treno"', 
 236             'description': 'I film in uscita questa settimana.', 
 237             'thumbnail': r
're:^https?://.*\.png$', 
 239             'upload_date': '20161103', 
 242         # with ContentItem in og:url 
 243         'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html', 
 244         'md5': '11959b4e44fa74de47011b5799490adf', 
 246             'id': 'efb17665-691c-45d5-a60c-5301333cbb0c', 
 248             'title': 'TG1 ore 20:00 del 03/11/2016', 
 249             'description': 'TG1 edizione integrale ore 20:00 del giorno 03/11/2016', 
 250             'thumbnail': r
're:^https?://.*\.jpg$', 
 252             'upload_date': '20161103', 
 255         # drawMediaRaiTV(...) 
 256         'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', 
 257         'md5': '2dd727e61114e1ee9c47f0da6914e178', 
 259             'id': '59d69d28-6bb6-409d-a4b5-ed44096560af', 
 262             'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', 
 263             'thumbnail': r
're:^https?://.*\.jpg$', 
 264             'upload_date': '20141221', 
 267         # initEdizione('ContentItem-...' 
 268         'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', 
 270             'id': 'c2187016-8484-4e3a-8ac8-35e475b07303', 
 272             'title': r
're:TG1 ore \d{2}:\d{2} del \d{2}/\d{2}/\d{4}', 
 274             'upload_date': '20170401', 
 276         'skip': 'Changes daily', 
 278         # HDS live stream with only relinker URL 
 279         'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews', 
 281             'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc', 
 286             'skip_download': True, 
 289         # HLS live stream with ContentItem in og:url 
 290         'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', 
 292             'id': '3156f2f2-dc70-4953-8e2f-70d7489d4ce9', 
 294             'title': 'La diretta di Rainews24', 
 297             'skip_download': True, 
 301     def _extract_from_content_id(self
, content_id
, url
): 
 302         media 
= self
._download
_json
( 
 303             'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % content_id
, 
 304             content_id
, 'Downloading video JSON') 
 306         title 
= media
['name'].strip() 
 308         media_type 
= media
['type'] 
 309         if 'Audio' in media_type
: 
 312                     'format_id': media
.get('formatoAudio'), 
 313                     'url': media
['audioUrl'], 
 314                     'ext': media
.get('formatoAudio'), 
 317         elif 'Video' in media_type
: 
 318             relinker_info 
= self
._extract
_relinker
_info
(media
['mediaUri'], content_id
) 
 320             raise ExtractorError('not a media file') 
 322         self
._sort
_formats
(relinker_info
['formats']) 
 325         for image_type 
in ('image', 'image_medium', 'image_300'): 
 326             thumbnail_url 
= media
.get(image_type
) 
 329                     'url': compat_urlparse
.urljoin(url
, thumbnail_url
), 
 332         subtitles 
= self
._extract
_subtitles
(url
, media
.get('subtitlesUrl')) 
 337             'description': strip_or_none(media
.get('desc')), 
 338             'thumbnails': thumbnails
, 
 339             'uploader': media
.get('author'), 
 340             'upload_date': unified_strdate(media
.get('date')), 
 341             'duration': parse_duration(media
.get('length')), 
 342             'subtitles': subtitles
, 
 345         info
.update(relinker_info
) 
 349     def _real_extract(self
, url
): 
 350         video_id 
= self
._match
_id
(url
) 
 352         webpage 
= self
._download
_webpage
(url
, video_id
) 
 354         content_item_id 
= None 
 356         content_item_url 
= self
._html
_search
_meta
( 
 357             ('og:url', 'og:video', 'og:video:secure_url', 'twitter:url', 
 358              'twitter:player', 'jsonlink'), webpage
, default
=None) 
 360             content_item_id 
= self
._search
_regex
( 
 361                 r
'ContentItem-(%s)' % self
._UUID
_RE
, content_item_url
, 
 362                 'content item id', default
=None) 
 364         if not content_item_id
: 
 365             content_item_id 
= self
._search
_regex
( 
 368                         (?:initEdizione|drawMediaRaiTV)\(| 
 369                         <(?:[^>]+\bdata-id|var\s+uniquename)= 
 372                     (?
:(?
!\
1).)*\bContentItem
-(?P
<id>%s) 
 374                 webpage, 'content item id', default=None, group='id') 
 376         content_item_ids = set() 
 378             content_item_ids.add(content_item_id) 
 379         if video_id not in content_item_ids: 
 380             content_item_ids.add(video_id) 
 382         for content_item_id in content_item_ids: 
 384                 return self._extract_from_content_id(content_item_id, url) 
 385             except GeoRestrictedError: 
 387             except ExtractorError: 
 390         relinker_url = self._search_regex( 
 399                     //mediapolis(?:vod)?\.rai\.it/relinker/relinkerServlet\.htm\? 
 400                     (?:(?!\1).)*\bcont=(?:(?!\1).)+)\1 
 402             webpage, 'relinker URL', group='url') 
 404         relinker_info = self._extract_relinker_info( 
 405             urljoin(url, relinker_url), video_id) 
 406         self._sort_formats(relinker_info['formats']) 
 408         title = self._search_regex( 
 409             r'var\s+videoTitolo\s*=\s*([\'"])(?P
<title
>[^
\'"]+)\1', 
 410             webpage, 'title', group='title', 
 411             default=None) or self._og_search_title(webpage) 
 418         info.update(relinker_info)