]> Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/mdr.py
1b8c4a32edf5d269b1e9bb9db366487ef2fba981
[youtubedl] / youtube_dl / extractor / mdr.py
1 from __future__ import unicode_literals
2
3 import re
4
5 from .common import InfoExtractor
6
7
class MDRIE(InfoExtractor):
    """Information extractor for video and audio pages on mdr.de.

    The page HTML supplies the title and the path of an ``-avCustom.xml``
    document; every ``assets/asset`` element of that document that carries
    a ``progressiveDownloadUrl`` becomes one entry of ``formats``.
    """

    _VALID_URL = r'^(?P<domain>https?://(?:www\.)?mdr\.de)/(?:.*)/(?P<type>video|audio)(?P<video_id>[^/_]+)(?:_|\.html)'

    # No tests, MDR regularly deletes its videos
    _TEST = {
        'url': 'http://www.mdr.de/fakt/video189002.html',
        'only_matching': True,
    }

    def _real_extract(self, url):
        """Build the info dictionary for the media page at ``url``.

        ``url`` is expected to match ``_VALID_URL``; its ``video_id`` and
        ``domain`` groups are read without a None check on the match.

        Returns a dict with the keys ``id``, ``title`` and ``formats``.
        Optional per-format fields (``abr``, ``filesize``, ``vbr``,
        ``width``, ``height``, ``format_id``) are left out when the XML
        does not provide a usable value for them, instead of aborting the
        whole extraction.
        """

        def int_child(parent, tag, divisor=1):
            """Return the integer text of child ``tag`` floor-divided by
            ``divisor``, or None if the child is absent, empty or not an
            integer."""
            node = parent.find(tag)
            if node is None or node.text is None:
                return None
            try:
                return int(node.text) // divisor
            except ValueError:
                return None

        m = re.match(self._VALID_URL, url)
        video_id = m.group('video_id')
        domain = m.group('domain')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)

        title = self._html_search_regex(r'<h[12]>(.*?)</h[12]>', html, 'title')
        # The dot before "xml" is escaped so that only a literal ".xml"
        # suffix is accepted.
        xmlurl = self._search_regex(
            r'dataURL:\'(/(?:.+)/(?:video|audio)[0-9]+-avCustom\.xml)', html, 'XML URL')

        # The captured path is site-relative, hence the domain prefix.
        doc = self._download_xml(domain + xmlurl, video_id)
        formats = []
        for asset in doc.findall('./assets/asset'):
            url_el = asset.find('.//progressiveDownloadUrl')
            # Assets without a (non-empty) progressive download URL cannot
            # be turned into a format entry.
            if url_el is None or not url_el.text:
                continue
            fmt = {'url': url_el.text}

            # Bitrates are divided by 1000 before being stored.
            abr = int_child(asset, 'bitrateAudio', 1000)
            if abr is not None:
                fmt['abr'] = abr
            filesize = int_child(asset, 'fileSize')
            if filesize is not None:
                fmt['filesize'] = filesize

            if asset.find('bitrateVideo') is None:
                # No video bitrate element: mark the asset as audio-only and
                # identify it by its audio bitrate.
                fmt['vcodec'] = 'none'
                id_bitrate = abr
            else:
                id_bitrate = int_child(asset, 'bitrateVideo', 1000)
                video_fields = (
                    ('vbr', id_bitrate),
                    ('width', int_child(asset, 'frameWidth')),
                    ('height', int_child(asset, 'frameHeight')),
                )
                for key, value in video_fields:
                    if value is not None:
                        fmt[key] = value

            # format_id is "<mediaType>-<bitrate>"; it is only set when both
            # parts are available.
            media_type_el = asset.find('mediaType')
            if (media_type_el is not None and media_type_el.text
                    and id_bitrate is not None):
                fmt['format_id'] = '%s-%d' % (media_type_el.text, id_bitrate)

            formats.append(fmt)
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
        }