Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/vessel.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import json
   5 import re
   6
   7 from .common import InfoExtractor
   8 from ..utils import (
   9     ExtractorError,
  10     parse_iso8601,
  11     sanitized_Request,
  12 )
  13
  14
  15 class VesselIE(InfoExtractor):
  16     _VALID_URL = r'https?://(?:www\.)?vessel\.com/(?:videos|embed)/(?P<id>[0-9a-zA-Z-_]+)'
  17     _API_URL_TEMPLATE = 'https://www.vessel.com/api/view/items/%s'
  18     _LOGIN_URL = 'https://www.vessel.com/api/account/login'
  19     _NETRC_MACHINE = 'vessel'
  20     _TESTS = [{
  21         'url': 'https://www.vessel.com/videos/HDN7G5UMs',
  22         'md5': '455cdf8beb71c6dd797fd2f3818d05c4',
  23         'info_dict': {
  24             'id': 'HDN7G5UMs',
  25             'ext': 'mp4',
  26             'title': 'Nvidia GeForce GTX Titan X - The Best Video Card on the Market?',
  27             'thumbnail': r're:^https?://.*\.jpg$',
  28             'upload_date': '20150317',
  29             'description': 'Did Nvidia pull out all the stops on the Titan X, or does its performance leave something to be desired?',
  30             'timestamp': int,
  31         },
  32     }, {
  33         'url': 'https://www.vessel.com/embed/G4U7gUJ6a?w=615&h=346',
  34         'only_matching': True,
  35     }, {
  36         'url': 'https://www.vessel.com/videos/F01_dsLj1',
  37         'only_matching': True,
  38     }, {
  39         'url': 'https://www.vessel.com/videos/RRX-sir-J',
  40         'only_matching': True,
  41     }]
  42
  43     @staticmethod
  44     def _extract_urls(webpage):
  45         return [url for _, url in re.findall(
  46             r'<iframe[^>]+src=(["\'])((?:https?:)?//(?:www\.)?vessel\.com/embed/[0-9a-zA-Z-_]+.*?)\1',
  47             webpage)]
  48
  49     @staticmethod
  50     def make_json_request(url, data):
  51         payload = json.dumps(data).encode('utf-8')
  52         req = sanitized_Request(url, payload)
  53         req.add_header('Content-Type', 'application/json; charset=utf-8')
  54         return req
  55
  56     @staticmethod
  57     def find_assets(data, asset_type, asset_id=None):
  58         for asset in data.get('assets', []):
  59             if not asset.get('type') == asset_type:
  60                 continue
  61             elif asset_id is not None and not asset.get('id') == asset_id:
  62                 continue
  63             else:
  64                 yield asset
  65
  66     def _check_access_rights(self, data):
  67         access_info = data.get('__view', {})
  68         if not access_info.get('allow_access', True):
  69             err_code = access_info.get('error_code') or ''
  70             if err_code == 'ITEM_PAID_ONLY':
  71                 raise ExtractorError(
  72                     'This video requires subscription.', expected=True)
  73             else:
  74                 raise ExtractorError(
  75                     'Access to this content is restricted. (%s said: %s)' % (self.IE_NAME, err_code), expected=True)
  76
  77     def _login(self):
  78         (username, password) = self._get_login_info()
  79         if username is None:
  80             return
  81         self.report_login()
  82         data = {
  83             'client_id': 'web',
  84             'type': 'password',
  85             'user_key': username,
  86             'password': password,
  87         }
  88         login_request = VesselIE.make_json_request(self._LOGIN_URL, data)
  89         self._download_webpage(login_request, None, False, 'Wrong login info')
  90
  91     def _real_initialize(self):
  92         self._login()
  93
  94     def _real_extract(self, url):
  95         video_id = self._match_id(url)
  96
  97         webpage = self._download_webpage(url, video_id)
  98         data = self._parse_json(self._search_regex(
  99             r'App\.bootstrapData\((.*?)\);', webpage, 'data'), video_id)
 100         asset_id = data['model']['data']['id']
 101
 102         req = VesselIE.make_json_request(
 103             self._API_URL_TEMPLATE % asset_id, {'client': 'web'})
 104         data = self._download_json(req, video_id)
 105         video_asset_id = data.get('main_video_asset')
 106
 107         self._check_access_rights(data)
 108
 109         try:
 110             video_asset = next(
 111                 VesselIE.find_assets(data, 'video', asset_id=video_asset_id))
 112         except StopIteration:
 113             raise ExtractorError('No video assets found')
 114
 115         formats = []
 116         for f in video_asset.get('sources', []):
 117             location = f.get('location')
 118             if not location:
 119                 continue
 120             name = f.get('name')
 121             if name == 'hls-index':
 122                 formats.extend(self._extract_m3u8_formats(
 123                     location, video_id, ext='mp4',
 124                     entry_protocol='m3u8_native', m3u8_id='m3u8', fatal=False))
 125             elif name == 'dash-index':
 126                 formats.extend(self._extract_mpd_formats(
 127                     location, video_id, mpd_id='dash', fatal=False))
 128             else:
 129                 formats.append({
 130                     'format_id': name,
 131                     'tbr': f.get('bitrate'),
 132                     'height': f.get('height'),
 133                     'width': f.get('width'),
 134                     'url': location,
 135                 })
 136         self._sort_formats(formats)
 137
 138         thumbnails = []
 139         for im_asset in VesselIE.find_assets(data, 'image'):
 140             thumbnails.append({
 141                 'url': im_asset['location'],
 142                 'width': im_asset.get('width', 0),
 143                 'height': im_asset.get('height', 0),
 144             })
 145
 146         return {
 147             'id': video_id,
 148             'title': data['title'],
 149             'formats': formats,
 150             'thumbnails': thumbnails,
 151             'description': data.get('short_description'),
 152             'duration': data.get('duration'),
 153             'comment_count': data.get('comment_count'),
 154             'like_count': data.get('like_count'),
 155             'view_count': data.get('view_count'),
 156             'timestamp': parse_iso8601(data.get('released_at')),
 157         }