Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/funnyordie.py

   1 from __future__ import unicode_literals
   2
   3 import re
   4
   5 from .common import InfoExtractor
   6 from ..utils import (
   7     ExtractorError,
   8     float_or_none,
   9     int_or_none,
  10     unified_timestamp,
  11 )
  12
  13
  14 class FunnyOrDieIE(InfoExtractor):
  15     _VALID_URL = r'https?://(?:www\.)?funnyordie\.com/(?P<type>embed|articles|videos)/(?P<id>[0-9a-f]+)(?:$|[?#/])'
  16     _TESTS = [{
  17         'url': 'http://www.funnyordie.com/videos/0732f586d7/heart-shaped-box-literal-video-version',
  18         'md5': 'bcd81e0c4f26189ee09be362ad6e6ba9',
  19         'info_dict': {
  20             'id': '0732f586d7',
  21             'ext': 'mp4',
  22             'title': 'Heart-Shaped Box: Literal Video Version',
  23             'description': 'md5:ea09a01bc9a1c46d9ab696c01747c338',
  24             'thumbnail': r're:^http:.*\.jpg$',
  25             'uploader': 'DASjr',
  26             'timestamp': 1317904928,
  27             'upload_date': '20111006',
  28             'duration': 318.3,
  29         },
  30     }, {
  31         'url': 'http://www.funnyordie.com/embed/e402820827',
  32         'info_dict': {
  33             'id': 'e402820827',
  34             'ext': 'mp4',
  35             'title': 'Please Use This Song (Jon Lajoie)',
  36             'description': 'Please use this to sell something.  www.jonlajoie.com',
  37             'thumbnail': r're:^http:.*\.jpg$',
  38             'timestamp': 1398988800,
  39             'upload_date': '20140502',
  40         },
  41         'params': {
  42             'skip_download': True,
  43         },
  44     }, {
  45         'url': 'http://www.funnyordie.com/articles/ebf5e34fc8/10-hours-of-walking-in-nyc-as-a-man',
  46         'only_matching': True,
  47     }]
  48
  49     def _real_extract(self, url):
  50         mobj = re.match(self._VALID_URL, url)
  51
  52         video_id = mobj.group('id')
  53         webpage = self._download_webpage(url, video_id)
  54
  55         links = re.findall(r'<source src="([^"]+/v)[^"]+\.([^"]+)" type=\'video', webpage)
  56         if not links:
  57             raise ExtractorError('No media links available for %s' % video_id)
  58
  59         links.sort(key=lambda link: 1 if link[1] == 'mp4' else 0)
  60
  61         m3u8_url = self._search_regex(
  62             r'<source[^>]+src=(["\'])(?P<url>.+?/master\.m3u8[^"\']*)\1',
  63             webpage, 'm3u8 url', group='url')
  64
  65         formats = []
  66
  67         m3u8_formats = self._extract_m3u8_formats(
  68             m3u8_url, video_id, 'mp4', 'm3u8_native',
  69             m3u8_id='hls', fatal=False)
  70         source_formats = list(filter(
  71             lambda f: f.get('vcodec') != 'none', m3u8_formats))
  72
  73         bitrates = [int(bitrate) for bitrate in re.findall(r'[,/]v(\d+)(?=[,/])', m3u8_url)]
  74         bitrates.sort()
  75
  76         if source_formats:
  77             self._sort_formats(source_formats)
  78
  79         for bitrate, f in zip(bitrates, source_formats or [{}] * len(bitrates)):
  80             for path, ext in links:
  81                 ff = f.copy()
  82                 if ff:
  83                     if ext != 'mp4':
  84                         ff = dict(
  85                             [(k, v) for k, v in ff.items()
  86                              if k in ('height', 'width', 'format_id')])
  87                     ff.update({
  88                         'format_id': ff['format_id'].replace('hls', ext),
  89                         'ext': ext,
  90                         'protocol': 'http',
  91                     })
  92                 else:
  93                     ff.update({
  94                         'format_id': '%s-%d' % (ext, bitrate),
  95                         'vbr': bitrate,
  96                     })
  97                 ff['url'] = self._proto_relative_url(
  98                     '%s%d.%s' % (path, bitrate, ext))
  99                 formats.append(ff)
 100         self._check_formats(formats, video_id)
 101
 102         formats.extend(m3u8_formats)
 103         self._sort_formats(
 104             formats, field_preference=('height', 'width', 'tbr', 'format_id'))
 105
 106         subtitles = {}
 107         for src, src_lang in re.findall(r'<track kind="captions" src="([^"]+)" srclang="([^"]+)"', webpage):
 108             subtitles[src_lang] = [{
 109                 'ext': src.split('/')[-1],
 110                 'url': 'http://www.funnyordie.com%s' % src,
 111             }]
 112
 113         timestamp = unified_timestamp(self._html_search_meta(
 114             'uploadDate', webpage, 'timestamp', default=None))
 115
 116         uploader = self._html_search_regex(
 117             r'<h\d[^>]+\bclass=["\']channel-preview-name[^>]+>(.+?)</h',
 118             webpage, 'uploader', default=None)
 119
 120         title, description, thumbnail, duration = [None] * 4
 121
 122         medium = self._parse_json(
 123             self._search_regex(
 124                 r'jsonMedium\s*=\s*({.+?});', webpage, 'JSON medium',
 125                 default='{}'),
 126             video_id, fatal=False)
 127         if medium:
 128             title = medium.get('title')
 129             duration = float_or_none(medium.get('duration'))
 130             if not timestamp:
 131                 timestamp = unified_timestamp(medium.get('publishDate'))
 132
 133         post = self._parse_json(
 134             self._search_regex(
 135                 r'fb_post\s*=\s*(\{.*?\});', webpage, 'post details',
 136                 default='{}'),
 137             video_id, fatal=False)
 138         if post:
 139             if not title:
 140                 title = post.get('name')
 141             description = post.get('description')
 142             thumbnail = post.get('picture')
 143
 144         if not title:
 145             title = self._og_search_title(webpage)
 146         if not description:
 147             description = self._og_search_description(webpage)
 148         if not duration:
 149             duration = int_or_none(self._html_search_meta(
 150                 ('video:duration', 'duration'), webpage, 'duration', default=False))
 151
 152         return {
 153             'id': video_id,
 154             'title': title,
 155             'description': description,
 156             'thumbnail': thumbnail,
 157             'uploader': uploader,
 158             'timestamp': timestamp,
 159             'duration': duration,
 160             'formats': formats,
 161             'subtitles': subtitles,
 162         }