1 from __future__
import unicode_literals
5 from .common
import InfoExtractor
29 class RaiBaseIE(InfoExtractor
):
30 _UUID_RE
= r
'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'
31 _GEO_COUNTRIES
= ['IT']
34 def _extract_relinker_info(self
, relinker_url
, video_id
):
35 if not re
.match(r
'https?://', relinker_url
):
36 return {'formats': [{'url': relinker_url
}]}
43 for platform
in ('mon', 'flash', 'native'):
44 relinker
= self
._download
_xml
(
45 relinker_url
, video_id
,
46 note
='Downloading XML metadata for platform %s' % platform
,
47 transform_source
=fix_xml_ampersands
,
48 query
={'output': 45, 'pl': platform
},
49 headers
=self
.geo_verification_headers())
52 geoprotection
= xpath_text(
53 relinker
, './geoprotection', default
=None) == 'Y'
57 relinker
, './is_live', default
=None) == 'Y'
59 duration
= parse_duration(xpath_text(
60 relinker
, './duration', default
=None))
62 url_elem
= find_xpath_attr(relinker
, './url', 'type', 'content')
66 media_url
= url_elem
.text
68 # This does not imply geo restriction (e.g.
69 # http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html)
70 if media_url
== 'http://download.rai.it/video_no_available.mp4':
73 ext
= determine_ext(media_url
)
74 if (ext
== 'm3u8' and platform
!= 'mon') or (ext
== 'f4m' and platform
!= 'flash'):
78 formats
.extend(self
._extract
_m
3u8_formats
(
79 media_url
, video_id
, 'mp4', 'm3u8_native',
80 m3u8_id
='hls', fatal
=False))
82 manifest_url
= update_url_query(
83 media_url
.replace('manifest#live_hds.f4m', 'manifest.f4m'),
84 {'hdcore': '3.7.0', 'plugin': 'aasp-3.7.0.39.44'})
85 formats
.extend(self
._extract
_f
4m
_formats
(
86 manifest_url
, video_id
, f4m_id
='hds', fatal
=False))
88 bitrate
= int_or_none(xpath_text(relinker
, 'bitrate'))
91 'tbr': bitrate
if bitrate
> 0 else None,
92 'format_id': 'http-%d' % bitrate
if bitrate
> 0 else 'http',
95 if not formats
and geoprotection
is True:
96 self
.raise_geo_restricted(countries
=self
._GEO
_COUNTRIES
)
98 return dict((k
, v
) for k
, v
in {
100 'duration': duration
,
102 }.items() if v
is not None)
105 def _extract_subtitles(url
, subtitle_url
):
107 if subtitle_url
and isinstance(subtitle_url
, compat_str
):
108 subtitle_url
= urljoin(url
, subtitle_url
)
115 if subtitle_url
.endswith(STL_EXT
):
116 srt_url
= subtitle_url
[:-len(STL_EXT
)] + SRT_EXT
117 subtitles
['it'].append({
124 class RaiPlayIE(RaiBaseIE
):
125 _VALID_URL
= r
'(?P<url>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>%s)\.html)' % RaiBaseIE
._UUID
_RE
127 'url': 'http://www.raiplay.it/video/2016/10/La-Casa-Bianca-e06118bb-59a9-4636-b914-498e4cfd2c66.html?source=twitter',
128 'md5': '340aa3b7afb54bfd14a8c11786450d76',
130 'id': 'e06118bb-59a9-4636-b914-498e4cfd2c66',
132 'title': 'La Casa Bianca',
133 'alt_title': 'S2016 - Puntata del 23/10/2016',
134 'description': 'md5:a09d45890850458077d1f68bb036e0a5',
135 'thumbnail': r
're:^https?://.*\.jpg$',
139 'timestamp': 1477764300,
140 'upload_date': '20161029',
141 'series': 'La Casa Bianca',
145 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html',
146 'md5': '8970abf8caf8aef4696e7b1f2adfc696',
148 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391',
150 'title': 'Report del 07/04/2014',
151 'alt_title': 'S2013/14 - Puntata del 07/04/2014',
152 'description': 'md5:f27c544694cacb46a078db84ec35d2d9',
153 'thumbnail': r
're:^https?://.*\.jpg$',
162 'skip_download': True,
165 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?',
166 'only_matching': True,
169 def _real_extract(self
, url
):
170 mobj
= re
.match(self
._VALID
_URL
, url
)
171 url
, video_id
= mobj
.group('url', 'id')
173 media
= self
._download
_json
(
174 '%s?json' % url
, video_id
, 'Downloading video JSON')
176 title
= media
['name']
178 video
= media
['video']
180 relinker_info
= self
._extract
_relinker
_info
(video
['contentUrl'], video_id
)
181 self
._sort
_formats
(relinker_info
['formats'])
184 if 'images' in media
:
185 for _
, value
in media
.get('images').items():
188 'url': value
.replace('[RESOLUTION]', '600x400')
191 timestamp
= unified_timestamp(try_get(
192 media
, lambda x
: x
['availabilities'][0]['start'], compat_str
))
194 subtitles
= self
._extract
_subtitles
(url
, video
.get('subtitles'))
198 'title': self
._live
_title
(title
) if relinker_info
.get(
199 'is_live') else title
,
200 'alt_title': media
.get('subtitle'),
201 'description': media
.get('description'),
202 'uploader': strip_or_none(media
.get('channel')),
203 'creator': strip_or_none(media
.get('editor')),
204 'duration': parse_duration(video
.get('duration')),
205 'timestamp': timestamp
,
206 'thumbnails': thumbnails
,
208 media
, lambda x
: x
['isPartOf']['name'], compat_str
),
209 'season_number': int_or_none(try_get(
210 media
, lambda x
: x
['isPartOf']['numeroStagioni'])),
211 'season': media
.get('stagione') or None,
212 'subtitles': subtitles
,
215 info
.update(relinker_info
)
219 class RaiPlayLiveIE(RaiBaseIE
):
220 _VALID_URL
= r
'https?://(?:www\.)?raiplay\.it/dirette/(?P<id>[^/?#&]+)'
222 'url': 'http://www.raiplay.it/dirette/rainews24',
224 'id': 'd784ad40-e0ae-4a69-aa76-37519d238a9c',
225 'display_id': 'rainews24',
227 'title': 're:^Diretta di Rai News 24 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
228 'description': 'md5:6eca31500550f9376819f174e5644754',
229 'uploader': 'Rai News 24',
230 'creator': 'Rai News 24',
234 'skip_download': True,
238 def _real_extract(self
, url
):
239 display_id
= self
._match
_id
(url
)
241 webpage
= self
._download
_webpage
(url
, display_id
)
243 video_id
= self
._search
_regex
(
244 r
'data-uniquename=["\']ContentItem
-(%s)' % RaiBaseIE._UUID_RE,
245 webpage, 'content
id')
248 '_type
': 'url_transparent
',
249 'ie_key
': RaiPlayIE.ie_key(),
250 'url
': 'http
://www
.raiplay
.it
/dirette
/ContentItem
-%s.html
' % video_id,
252 'display_id
': display_id,
256 class RaiPlayPlaylistIE(InfoExtractor):
257 _VALID_URL = r'https?
://(?
:www\
.)?raiplay\
.it
/programmi
/(?P
<id>[^
/?
#&]+)'
259 'url': 'http://www.raiplay.it/programmi/nondirloalmiocapo/',
261 'id': 'nondirloalmiocapo',
262 'title': 'Non dirlo al mio capo',
263 'description': 'md5:9f3d603b2947c1c7abb098f3b14fac86',
265 'playlist_mincount': 12,
268 def _real_extract(self
, url
):
269 playlist_id
= self
._match
_id
(url
)
271 webpage
= self
._download
_webpage
(url
, playlist_id
)
273 title
= self
._html
_search
_meta
(
274 ('programma', 'nomeProgramma'), webpage
, 'title')
275 description
= unescapeHTML(self
._html
_search
_meta
(
276 ('description', 'og:description'), webpage
, 'description'))
279 for mobj
in re
.finditer(
280 r
'<a\b[^>]+\bhref=(["\'])(?P
<path
>/raiplay
/video
/.+?
)\
1',
282 video_url = urljoin(url, mobj.group('path
'))
283 entries.append(self.url_result(
284 video_url, ie=RaiPlayIE.ie_key(),
285 video_id=RaiPlayIE._match_id(video_url)))
287 return self.playlist_result(entries, playlist_id, title, description)
290 class RaiIE(RaiBaseIE):
291 _VALID_URL = r'https?
://[^
/]+\
.(?
:rai\
.(?
:it|tv
)|rainews\
.it
)/dl
/.+?
-(?P
<id>%s)(?
:-.+?
)?\
.html
' % RaiBaseIE._UUID_RE
293 # var uniquename = "ContentItem-..."
294 # data-id="ContentItem-..."
295 'url
': 'http
://www
.raisport
.rai
.it
/dl
/raiSport
/media
/rassegna
-stampa
-04a9f4bd
-b563
-40cf
-82a6
-aad3529cb4a9
.html
',
297 'id': '04a9f4bd
-b563
-40cf
-82a6
-aad3529cb4a9
',
299 'title
': 'TG PRIMO TEMPO
',
300 'thumbnail
': r're
:^https?
://.*\
.jpg$
',
302 'upload_date
': '20140612',
305 # with ContentItem in many metas
306 'url
': 'http
://www
.rainews
.it
/dl
/rainews
/media
/Weekend
-al
-cinema
-da
-Hollywood
-arriva
-il
-thriller
-di
-Tate
-Taylor
-La
-ragazza
-del-treno
-1632c009
-c843
-4836-bb65
-80c33084a64b
.html
',
308 'id': '1632c009
-c843
-4836-bb65
-80c33084a64b
',
310 'title
': 'Weekend al cinema
, da Hollywood arriva il thriller di Tate Taylor
"La ragazza del treno"',
311 'description
': 'I film
in uscita questa settimana
.',
312 'thumbnail
': r're
:^https?
://.*\
.png$
',
314 'upload_date
': '20161103',
317 # with ContentItem in og:url
318 'url
': 'http
://www
.rai
.it
/dl
/RaiTV
/programmi
/media
/ContentItem
-efb17665
-691c
-45d5
-a60c
-5301333cbb0c
.html
',
319 'md5
': '11959b4e44fa74de47011b5799490adf
',
321 'id': 'efb17665
-691c
-45d5
-a60c
-5301333cbb0c
',
323 'title
': 'TG1 ore
20:00 del 03/11/2016',
324 'description
': 'TG1 edizione integrale ore
20:00 del giorno
03/11/2016',
325 'thumbnail
': r're
:^https?
://.*\
.jpg$
',
327 'upload_date
': '20161103',
330 # drawMediaRaiTV(...)
331 'url
': 'http
://www
.report
.rai
.it
/dl
/Report
/puntata
/ContentItem
-0c7a664b
-d0f4
-4b2c
-8835-3f82e46f433e
.html
',
332 'md5
': '2dd727e61114e1ee9c47f0da6914e178
',
334 'id': '59d69d28
-6bb6
-409d
-a4b5
-ed44096560af
',
337 'description
': 'md5
:4b1afae1364115ce5d78ed83cd2e5b3a
',
338 'thumbnail
': r're
:^https?
://.*\
.jpg$
',
339 'upload_date
': '20141221',
342 # initEdizione('ContentItem
-...'
343 'url
': 'http
://www
.tg1
.rai
.it
/dl
/tg1
/2010/edizioni
/ContentSet
-9b6e0cba
-4bef
-4aef
-8cf0
-9f7f665b7dfb
-tg1
.html?item
=undefined
',
345 'id': 'c2187016
-8484-4e3a
-8ac8
-35e475b07303
',
347 'title
': r're
:TG1 ore \d{2}
:\d{2}
del \d{2}
/\d{2}
/\d{4}
',
349 'upload_date
': '20170401',
351 'skip
': 'Changes daily
',
353 # HDS live stream with only relinker URL
354 'url
': 'http
://www
.rai
.tv
/dl
/RaiTV
/dirette
/PublishingBlock
-1912dbbf
-3f96
-44c3
-b4cf
-523681fbacbc
.html?channel
=EuroNews
',
356 'id': '1912dbbf
-3f96
-44c3
-b4cf
-523681fbacbc
',
361 'skip_download
': True,
364 # HLS live stream with ContentItem in og:url
365 'url
': 'http
://www
.rainews
.it
/dl
/rainews
/live
/ContentItem
-3156f2f2
-dc70
-4953-8e2f
-70d7489d4ce9
.html
',
367 'id': '3156f2f2
-dc70
-4953-8e2f
-70d7489d4ce9
',
369 'title
': 'La diretta di Rainews24
',
372 'skip_download
': True,
376 'url
': 'http
://www
.rai
.it
/dl
/RaiTV
/programmi
/media
/ContentItem
-b63a4089
-ac28
-48cf
-bca5
-9f5b5bc46df5
.html
',
377 'only_matching
': True,
380 def _extract_from_content_id(self, content_id, url):
381 media = self._download_json(
382 'http
://www
.rai
.tv
/dl
/RaiTV
/programmi
/media
/ContentItem
-%s.html?json
' % content_id,
383 content_id, 'Downloading video JSON
')
385 title = media['name
'].strip()
387 media_type = media['type']
388 if 'Audio
' in media_type:
391 'format_id
': media.get('formatoAudio
'),
392 'url
': media['audioUrl
'],
393 'ext
': media.get('formatoAudio
'),
396 elif 'Video
' in media_type:
397 relinker_info = self._extract_relinker_info(media['mediaUri
'], content_id)
399 raise ExtractorError('not a media
file')
401 self._sort_formats(relinker_info['formats
'])
404 for image_type in ('image
', 'image_medium
', 'image_300
'):
405 thumbnail_url = media.get(image_type)
408 'url
': compat_urlparse.urljoin(url, thumbnail_url),
411 subtitles = self._extract_subtitles(url, media.get('subtitlesUrl
'))
416 'description
': strip_or_none(media.get('desc
')),
417 'thumbnails
': thumbnails,
418 'uploader
': media.get('author
'),
419 'upload_date
': unified_strdate(media.get('date
')),
420 'duration
': parse_duration(media.get('length
')),
421 'subtitles
': subtitles,
424 info.update(relinker_info)
428 def _real_extract(self, url):
429 video_id = self._match_id(url)
431 webpage = self._download_webpage(url, video_id)
433 content_item_id = None
435 content_item_url = self._html_search_meta(
436 ('og
:url
', 'og
:video
', 'og
:video
:secure_url
', 'twitter
:url
',
437 'twitter
:player
', 'jsonlink
'), webpage, default=None)
439 content_item_id = self._search_regex(
440 r'ContentItem
-(%s)' % self._UUID_RE, content_item_url,
441 'content item
id', default=None)
443 if not content_item_id:
444 content_item_id = self._search_regex(
447 (?:initEdizione|drawMediaRaiTV)\(|
448 <(?:[^>]+\bdata-id|var\s+uniquename)=
451 (?:(?!\1).)*\bContentItem-(?P<id>%s)
453 webpage, 'content item
id', default=None, group='id')
455 content_item_ids = set()
457 content_item_ids.add(content_item_id)
458 if video_id not in content_item_ids:
459 content_item_ids.add(video_id)
461 for content_item_id in content_item_ids:
463 return self._extract_from_content_id(content_item_id, url)
464 except GeoRestrictedError:
466 except ExtractorError:
469 relinker_url = self._search_regex(
478 //mediapolis(?:vod)?\.rai\.it/relinker/relinkerServlet\.htm\?
479 (?:(?!\1).)*\bcont=(?:(?!\1).)+)\1
481 webpage, 'relinker URL
', group='url
')
483 relinker_info = self._extract_relinker_info(
484 urljoin(url, relinker_url), video_id)
485 self._sort_formats(relinker_info['formats
'])
487 title = self._search_regex(
488 r'var\s
+videoTitolo\s
*=\s
*([\'"])(?P<title>[^\'"]+)\
1',
489 webpage, 'title
', group='title
',
490 default=None) or self._og_search_title(webpage)
497 info.update(relinker_info)