Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/infoq.py
debian/NEWS: Write news about new behavior of youtube-dl.
[youtubedl] / youtube_dl / extractor / infoq.py
1 from __future__ import unicode_literals
2
3 import base64
4 import re
5
6 from .common import InfoExtractor
7 from ..utils import (
8 compat_urllib_parse,
9 )
10
11
class InfoQIE(InfoExtractor):
    """Extractor for InfoQ pages of the form ``infoq.com/<section>/<id>``."""

    _VALID_URL = r'https?://(?:www\.)?infoq\.com/[^/]+/(?P<id>[^/]+)$'
    _TEST = {
        "name": "InfoQ",
        "url": "http://www.infoq.com/presentations/A-Few-of-My-Favorite-Python-Things",
        "file": "12-jan-pythonthings.mp4",
        "info_dict": {
            "description": "Mike Pirnat presents some tips and tricks, standard libraries and third party packages that make programming in Python a richer experience.",
            "title": "A Few of My Favorite [Python] Things",
        },
        "params": {
            "skip_download": True,
        },
    }

    def _real_extract(self, url):
        """Download the page at *url* and return its info dict.

        The stream path is stored in the page's ``jsclassref`` variable as
        base64-encoded, percent-quoted text; it is decoded and appended to
        the fixed RTMPE prefix to form the media URL.

        The returned ``id`` is the stem of the media file name, not the
        page slug matched by ``_VALID_URL`` (the slug is only used while
        downloading the page).
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        webpage = self._download_webpage(url, video_id)

        # Extract video URL
        encoded_id = self._search_regex(
            r"jsclassref ?= ?'([^']*)'", webpage, 'encoded id')
        decoded_id = base64.b64decode(encoded_id.encode('ascii')).decode('utf-8')
        real_id = compat_urllib_parse.unquote(decoded_id)
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        video_title = self._search_regex(
            r'contentTitle = "(.*?)";', webpage, 'title')

        # Extract description
        video_description = self._html_search_regex(
            r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, 'description', fatal=False)

        video_filename = video_url.split('/')[-1]
        # Split on the LAST dot only: a plain split('.') raised ValueError
        # for names with several dots (e.g. "talk.v2.mp4") or none at all.
        video_id, dot, extension = video_filename.rpartition('.')
        if not dot:
            # No extension in the file name. Fall back to the whole name as
            # the id and to 'mp4', the extension seen so far on this site
            # (see the note on 'ext' below) -- TODO confirm the default.
            video_id, extension = video_filename, 'mp4'

        return {
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'ext': extension,  # Extension is always(?) mp4, but seems to be flv
            'description': video_description,
        }