Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/nfb.py
Update upstream source from tag 'upstream/2018.11.07'
[youtubedl] / youtube_dl / extractor / nfb.py
1 from __future__ import unicode_literals
2
3 from .common import InfoExtractor
4 from ..utils import (
5 clean_html,
6 determine_ext,
7 int_or_none,
8 qualities,
9 urlencode_postdata,
10 xpath_text,
11 )
12
13
class NFBIE(InfoExtractor):
    """Extractor for film pages on nfb.ca / onf.ca.

    The film's ``player_config`` XML is downloaded and its
    ``player/stream/media`` entries are turned into formats, thumbnails,
    subtitles and basic metadata.
    """

    IE_NAME = 'nfb'
    IE_DESC = 'National Film Board of Canada'
    _VALID_URL = r'https?://(?:www\.)?(?:nfb|onf)\.ca/film/(?P<id>[\da-z_-]+)'

    _TEST = {
        'url': 'https://www.nfb.ca/film/qallunaat_why_white_people_are_funny',
        'info_dict': {
            'id': 'qallunaat_why_white_people_are_funny',
            'ext': 'flv',
            'title': 'Qallunaat! Why White People Are Funny ',
            'description': 'md5:6b8e32dde3abf91e58857b174916620c',
            'duration': 3128,
            'creator': 'Mark Sandiford',
            'uploader': 'Mark Sandiford',
        },
        'params': {
            # rtmp download
            'skip_download': True,
        }
    }

    def _real_extract(self, url):
        """Build the info dict for the film addressed by *url*.

        The video id is the ``id`` group of ``_VALID_URL``. The player
        config XML is requested with ``getConfig=true`` sent as form data,
        then every ``./player/stream/media`` element is inspected:

        * ``type == 'posterImage'`` entries provide the thumbnails;
        * ``type == 'video'`` entries provide the title (required),
          description, author, duration, formats and subtitles.

        Returns a dict with the keys ``id``, ``title``, ``description``,
        ``thumbnails``, ``duration``, ``creator``, ``uploader``,
        ``formats`` and ``subtitles``.
        """
        video_id = self._match_id(url)

        config = self._download_xml(
            'https://www.nfb.ca/film/%s/player_config' % video_id,
            video_id, 'Downloading player config XML',
            data=urlencode_postdata({'getConfig': 'true'}),
            headers={
                'Content-Type': 'application/x-www-form-urlencoded',
                'X-NFB-Referer': 'http://www.nfb.ca/medias/flash/NFBVideoPlayer.swf'
            })

        title = description = duration = uploader = None
        # These MUST be two distinct list objects.  The previous
        # ``[[]] * 2`` idiom bound both names to the same list, so without a
        # posterImage entry every format also showed up in ``thumbnails``.
        thumbnails = []
        formats = []
        subtitles = {}

        # Loop-invariant: ranks thumbnail qualities, 'low' before 'high'.
        quality_key = qualities(('low', 'high'))

        for media in config.findall('./player/stream/media'):
            media_type = media.get('type')
            if media_type == 'posterImage':
                # A later posterImage entry replaces the thumbnails collected
                # from an earlier one.
                thumbnails = []
                for asset in media.findall('assets/asset'):
                    asset_url = xpath_text(asset, 'default/url', default=None)
                    if not asset_url:
                        continue
                    quality = asset.get('quality')
                    thumbnails.append({
                        'url': asset_url,
                        'id': quality,
                        'preference': quality_key(quality),
                    })
            elif media_type == 'video':
                title = xpath_text(media, 'title', fatal=True)
                for asset in media.findall('assets/asset'):
                    quality = asset.get('quality')
                    # Qualities such as "720p" carry the frame height; any
                    # other value leaves the height unset.
                    height = int_or_none(self._search_regex(
                        r'^(\d+)[pP]$', quality or '', 'height', default=None))
                    for node in asset:
                        # Only children providing both a streamer URI and a
                        # play path describe a usable stream.
                        streamer = xpath_text(node, 'streamerURI', default=None)
                        if not streamer:
                            continue
                        play_path = xpath_text(node, 'url', default=None)
                        if not play_path:
                            continue
                        formats.append({
                            'url': streamer,
                            # Everything after the third '/' of the streamer
                            # URI is passed on as the application name.
                            'app': streamer.split('/', 3)[3],
                            'play_path': play_path,
                            'rtmp_live': False,
                            'ext': 'flv',
                            'format_id': '%s-%s' % (node.tag, quality) if quality else node.tag,
                            'height': height,
                        })
                self._sort_formats(formats)
                description = clean_html(xpath_text(media, 'description'))
                uploader = xpath_text(media, 'author')
                duration = int_or_none(media.get('duration'))
                for subtitle in media.findall('./subtitles/subtitle'):
                    subtitle_url = xpath_text(subtitle, 'url', default=None)
                    if not subtitle_url:
                        continue
                    # Subtitles without an explicit language are filed
                    # under 'en'.
                    lang = xpath_text(subtitle, 'lang', default='en')
                    subtitles.setdefault(lang, []).append({
                        'url': subtitle_url,
                        'ext': (subtitle.get('format') or determine_ext(subtitle_url)).lower(),
                    })

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnails': thumbnails,
            'duration': duration,
            'creator': uploader,
            'uploader': uploader,
            'formats': formats,
            'subtitles': subtitles,
        }