]> Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/franceculture.py
Merge tag 'upstream/2014.06.07'
[youtubedl] / youtube_dl / extractor / franceculture.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import json
5 import re
6
7 from .common import InfoExtractor
8 from ..utils import (
9 compat_parse_qs,
10 compat_urlparse,
11 )
12
13
class FranceCultureIE(InfoExtractor):
    """Extractor for franceculture.fr ``player/reecouter?play=<id>`` pages."""

    _VALID_URL = r'(?P<baseurl>http://(?:www\.)?franceculture\.fr/)player/reecouter\?play=(?P<id>[0-9]+)'
    _TEST = {
        'url': 'http://www.franceculture.fr/player/reecouter?play=4795174',
        'info_dict': {
            'id': '4795174',
            'ext': 'mp3',
            'title': 'Rendez-vous au pays des geeks',
            'vcodec': 'none',
            'uploader': 'Colette Fellous',
            'upload_date': '20140301',
            'duration': 3601,
            'thumbnail': r're:^http://www\.franceculture\.fr/.*/images/player/Carnet-nomade\.jpg$',
            'description': 'Avec :Jean-Baptiste Péretié pour son documentaire sur Arte "La revanche des « geeks », une enquête menée aux Etats-Unis dans la S ...',
        }
    }

    def _real_extract(self, url):
        """Build the info dict for the player page at ``url``.

        The media URL and the title are mandatory: if either cannot be
        found, the underlying lookup raises.  Uploader, thumbnail,
        description, duration and upload date are optional and are
        returned as ``None`` when they cannot be determined.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        baseurl = mobj.group('baseurl')

        webpage = self._download_webpage(url, video_id)

        # The query string of the Flash loader URL carries both the media
        # path ('urlAOD') and a JSON metadata blob ('infoData').
        params_code = self._search_regex(
            r"<param name='movie' value='/sites/all/modules/rf/rf_player/swf/loader.swf\?([^']+)' />",
            webpage, 'parameter code')
        params = compat_parse_qs(params_code)
        video_url = compat_urlparse.urljoin(baseurl, params['urlAOD'][0])

        title = self._html_search_regex(
            r'<h1 class="title[^"]+">(.+?)</h1>', webpage, 'title')
        uploader = self._html_search_regex(
            r'(?s)<div id="emission".*?<span class="author">(.*?)</span>',
            webpage, 'uploader', fatal=False)
        thumbnail_part = self._html_search_regex(
            r'(?s)<div id="emission".*?<img src="([^"]+)"', webpage,
            'thumbnail', fatal=False)
        thumbnail = (
            None if thumbnail_part is None
            else compat_urlparse.urljoin(baseurl, thumbnail_part))
        # The description is optional metadata, so a page without it must
        # not abort an otherwise successful extraction.
        description = self._html_search_regex(
            r'(?s)<p class="desc">(.*?)</p>', webpage, 'description',
            fatal=False)

        # 'infoData' only feeds optional fields (duration, upload date);
        # fall back to no metadata when it is absent or malformed.
        try:
            info = json.loads(params['infoData'][0])[0]
        except (KeyError, IndexError, TypeError, ValueError):
            info = {}
        if not isinstance(info, dict):
            info = {}

        duration = info.get('media_length')
        # Only accept the candidate when it looks like a YYYYMMDD date.
        upload_date = None
        upload_date_candidate = info.get('media_section5')
        if (upload_date_candidate is not None and
                re.match(r'[0-9]{8}$', upload_date_candidate)):
            upload_date = upload_date_candidate

        return {
            'id': video_id,
            'url': video_url,
            # Audio-only streams are flagged by setting vcodec to 'none'.
            'vcodec': 'none' if video_url.lower().endswith('.mp3') else None,
            'duration': duration,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }