Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/funnyordie.py
debian/control: Remove trailing whitespace at EOF.
[youtubedl] / youtube_dl / extractor / funnyordie.py
1 from __future__ import unicode_literals
2
3 import re
4
5 from .common import InfoExtractor
6 from ..utils import (
7 ExtractorError,
8 float_or_none,
9 int_or_none,
10 unified_timestamp,
11 )
12
13
class FunnyOrDieIE(InfoExtractor):
    """Extractor for funnyordie.com ``videos``, ``embed`` and ``articles`` URLs.

    The page is scraped for direct ``<source>`` links and an HLS master
    playlist; metadata comes from inline JSON blobs (``jsonMedium``,
    ``fb_post``) with fallbacks to the page's meta tags.
    """

    _VALID_URL = r'https?://(?:www\.)?funnyordie\.com/(?P<type>embed|articles|videos)/(?P<id>[0-9a-f]+)(?:$|[?#/])'
    _TESTS = [{
        'url': 'http://www.funnyordie.com/videos/0732f586d7/heart-shaped-box-literal-video-version',
        'md5': 'bcd81e0c4f26189ee09be362ad6e6ba9',
        'info_dict': {
            'id': '0732f586d7',
            'ext': 'mp4',
            'title': 'Heart-Shaped Box: Literal Video Version',
            'description': 'md5:ea09a01bc9a1c46d9ab696c01747c338',
            'thumbnail': r're:^http:.*\.jpg$',
            'uploader': 'DASjr',
            'timestamp': 1317904928,
            'upload_date': '20111006',
            'duration': 318.3,
        },
    }, {
        'url': 'http://www.funnyordie.com/embed/e402820827',
        'info_dict': {
            'id': 'e402820827',
            'ext': 'mp4',
            'title': 'Please Use This Song (Jon Lajoie)',
            'description': 'Please use this to sell something. www.jonlajoie.com',
            'thumbnail': r're:^http:.*\.jpg$',
            'timestamp': 1398988800,
            'upload_date': '20140502',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://www.funnyordie.com/articles/ebf5e34fc8/10-hours-of-walking-in-nyc-as-a-man',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build the info dict for the media referenced by *url*.

        Returns a dict with the keys ``id``, ``title``, ``description``,
        ``thumbnail``, ``uploader``, ``timestamp``, ``duration``, ``formats``
        and ``subtitles``.

        Raises ExtractorError when the page contains no ``<source>`` video
        links.
        """
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # Each link is a (path prefix ending in "/v", extension) pair; the
        # bitrate taken from the m3u8 URL is later inserted between the two.
        links = re.findall(r'<source src="([^"]+/v)[^"]+\.([^"]+)" type=\'video', webpage)
        if not links:
            raise ExtractorError('No media links available for %s' % video_id)

        # Ascending sort on a 0/1 key: mp4 links end up last.
        links.sort(key=lambda link: 1 if link[1] == 'mp4' else 0)

        m3u8_url = self._search_regex(
            r'<source[^>]+src=(["\'])(?P<url>.+?/master\.m3u8[^"\']*)\1',
            webpage, 'm3u8 url', group='url')

        formats = []

        m3u8_formats = self._extract_m3u8_formats(
            m3u8_url, video_id, 'mp4', 'm3u8_native',
            m3u8_id='hls', fatal=False)
        # Only HLS variants that carry video are used as metadata templates
        # for the direct HTTP formats built below.
        source_formats = list(filter(
            lambda f: f.get('vcodec') != 'none', m3u8_formats))

        # The available bitrates are encoded in the master playlist URL as
        # "v<digits>" segments delimited by "," or "/".
        bitrates = [int(bitrate) for bitrate in re.findall(r'[,/]v(\d+)(?=[,/])', m3u8_url)]
        bitrates.sort()

        if source_formats:
            self._sort_formats(source_formats)

        # Pair each bitrate with an HLS template; when there are no
        # video-bearing HLS formats, fall back to empty templates so HTTP
        # formats are still produced from the bitrate alone.
        # NOTE(review): the pairing presumes the sorted HLS formats line up
        # with the ascending bitrates - confirm against _sort_formats.
        for bitrate, f in zip(bitrates, source_formats or [{}] * len(bitrates)):
            for path, ext in links:
                ff = f.copy()
                if ff:
                    if ext != 'mp4':
                        # For non-mp4 links keep only the fields listed
                        # here from the HLS template.
                        ff = dict(
                            (k, v) for k, v in ff.items()
                            if k in ('height', 'width', 'format_id'))
                    ff.update({
                        'format_id': ff['format_id'].replace('hls', ext),
                        'ext': ext,
                        'protocol': 'http',
                    })
                else:
                    ff.update({
                        'format_id': '%s-%d' % (ext, bitrate),
                        'vbr': bitrate,
                    })
                ff['url'] = self._proto_relative_url(
                    '%s%d.%s' % (path, bitrate, ext))
                formats.append(ff)
        self._check_formats(formats, video_id)

        formats.extend(m3u8_formats)
        self._sort_formats(
            formats, field_preference=('height', 'width', 'tbr', 'format_id'))

        subtitles = {}
        for src, src_lang in re.findall(r'<track kind="captions" src="([^"]+)" srclang="([^"]+)"', webpage):
            subtitles[src_lang] = [{
                # The last path component of the track URL is used as the
                # subtitle extension.
                'ext': src.split('/')[-1],
                'url': 'http://www.funnyordie.com%s' % src,
            }]

        timestamp = unified_timestamp(self._html_search_meta(
            'uploadDate', webpage, 'timestamp', default=None))

        uploader = self._html_search_regex(
            r'<h\d[^>]+\bclass=["\']channel-preview-name[^>]+>(.+?)</h',
            webpage, 'uploader', default=None)

        title, description, thumbnail, duration = [None] * 4

        # First metadata source: the inline "jsonMedium" object.
        medium = self._parse_json(
            self._search_regex(
                r'jsonMedium\s*=\s*({.+?});', webpage, 'JSON medium',
                default='{}'),
            video_id, fatal=False)
        if medium:
            title = medium.get('title')
            duration = float_or_none(medium.get('duration'))
            if not timestamp:
                timestamp = unified_timestamp(medium.get('publishDate'))

        # Second metadata source: the inline "fb_post" object.
        post = self._parse_json(
            self._search_regex(
                r'fb_post\s*=\s*(\{.*?\});', webpage, 'post details',
                default='{}'),
            video_id, fatal=False)
        if post:
            if not title:
                title = post.get('name')
            description = post.get('description')
            thumbnail = post.get('picture')

        # Last resort: the page's meta tags.
        if not title:
            title = self._og_search_title(webpage)
        if not description:
            description = self._og_search_description(webpage)
        if not duration:
            # default=None (like the other optional lookups above) so that a
            # missing tag is not handed to int_or_none as the boolean False,
            # which would coerce to a bogus duration of 0.
            duration = int_or_none(self._html_search_meta(
                ('video:duration', 'duration'), webpage, 'duration', default=None))

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'uploader': uploader,
            'timestamp': timestamp,
            'duration': duration,
            'formats': formats,
            'subtitles': subtitles,
        }