Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/googledrive.py

   1 from __future__ import unicode_literals
   2
   3 import re
   4
   5 from .common import InfoExtractor
   6 from ..utils import (
   7     determine_ext,
   8     ExtractorError,
   9     int_or_none,
  10     lowercase_escape,
  11     update_url_query,
  12 )
  13
  14
  15 class GoogleDriveIE(InfoExtractor):
  16     _VALID_URL = r'''(?x)
  17                         https?://
  18                             (?:
  19                                 (?:docs|drive)\.google\.com/
  20                                 (?:
  21                                     (?:uc|open)\?.*?id=|
  22                                     file/d/
  23                                 )|
  24                                 video\.google\.com/get_player\?.*?docid=
  25                             )
  26                             (?P<id>[a-zA-Z0-9_-]{28,})
  27                     '''
  28     _TESTS = [{
  29         'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
  30         'md5': '5c602afbbf2c1db91831f5d82f678554',
  31         'info_dict': {
  32             'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ',
  33             'ext': 'mp4',
  34             'title': 'Big Buck Bunny.mp4',
  35             'duration': 45,
  36         }
  37     }, {
  38         # video can't be watched anonymously due to view count limit reached,
  39         # but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046)
  40         'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view',
  41         'md5': 'bfbd670d03a470bb1e6d4a257adec12e',
  42         'info_dict': {
  43             'id': '0B-vUyvmDLdWDcEt4WjBqcmI2XzQ',
  44             'ext': 'mp4',
  45             'title': 'Annabelle Creation (2017)- Z.V1 [TH].MP4',
  46         }
  47     }, {
  48         # video id is longer than 28 characters
  49         'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit',
  50         'info_dict': {
  51             'id': '1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ',
  52             'ext': 'mp4',
  53             'title': 'Andreea Banica feat Smiley - Hooky Song (Official Video).mp4',
  54             'duration': 189,
  55         },
  56         'only_matching': True,
  57     }, {
  58         'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
  59         'only_matching': True,
  60     }, {
  61         'url': 'https://drive.google.com/uc?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
  62         'only_matching': True,
  63     }]
  64     _FORMATS_EXT = {
  65         '5': 'flv',
  66         '6': 'flv',
  67         '13': '3gp',
  68         '17': '3gp',
  69         '18': 'mp4',
  70         '22': 'mp4',
  71         '34': 'flv',
  72         '35': 'flv',
  73         '36': '3gp',
  74         '37': 'mp4',
  75         '38': 'mp4',
  76         '43': 'webm',
  77         '44': 'webm',
  78         '45': 'webm',
  79         '46': 'webm',
  80         '59': 'mp4',
  81     }
  82     _BASE_URL_CAPTIONS = 'https://drive.google.com/timedtext'
  83     _CAPTIONS_ENTRY_TAG = {
  84         'subtitles': 'track',
  85         'automatic_captions': 'target',
  86     }
  87     _caption_formats_ext = []
  88     _captions_xml = None
  89
  90     @staticmethod
  91     def _extract_url(webpage):
  92         mobj = re.search(
  93             r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})',
  94             webpage)
  95         if mobj:
  96             return 'https://drive.google.com/file/d/%s' % mobj.group('id')
  97
  98     def _download_subtitles_xml(self, video_id, subtitles_id, hl):
  99         if self._captions_xml:
 100             return
 101         self._captions_xml = self._download_xml(
 102             self._BASE_URL_CAPTIONS, video_id, query={
 103                 'id': video_id,
 104                 'vid': subtitles_id,
 105                 'hl': hl,
 106                 'v': video_id,
 107                 'type': 'list',
 108                 'tlangs': '1',
 109                 'fmts': '1',
 110                 'vssids': '1',
 111             }, note='Downloading subtitles XML',
 112             errnote='Unable to download subtitles XML', fatal=False)
 113         if self._captions_xml:
 114             for f in self._captions_xml.findall('format'):
 115                 if f.attrib.get('fmt_code') and not f.attrib.get('default'):
 116                     self._caption_formats_ext.append(f.attrib['fmt_code'])
 117
 118     def _get_captions_by_type(self, video_id, subtitles_id, caption_type,
 119                               origin_lang_code=None):
 120         if not subtitles_id or not caption_type:
 121             return
 122         captions = {}
 123         for caption_entry in self._captions_xml.findall(
 124                 self._CAPTIONS_ENTRY_TAG[caption_type]):
 125             caption_lang_code = caption_entry.attrib.get('lang_code')
 126             if not caption_lang_code:
 127                 continue
 128             caption_format_data = []
 129             for caption_format in self._caption_formats_ext:
 130                 query = {
 131                     'vid': subtitles_id,
 132                     'v': video_id,
 133                     'fmt': caption_format,
 134                     'lang': (caption_lang_code if origin_lang_code is None
 135                              else origin_lang_code),
 136                     'type': 'track',
 137                     'name': '',
 138                     'kind': '',
 139                 }
 140                 if origin_lang_code is not None:
 141                     query.update({'tlang': caption_lang_code})
 142                 caption_format_data.append({
 143                     'url': update_url_query(self._BASE_URL_CAPTIONS, query),
 144                     'ext': caption_format,
 145                 })
 146             captions[caption_lang_code] = caption_format_data
 147         return captions
 148
 149     def _get_subtitles(self, video_id, subtitles_id, hl):
 150         if not subtitles_id or not hl:
 151             return
 152         self._download_subtitles_xml(video_id, subtitles_id, hl)
 153         if not self._captions_xml:
 154             return
 155         return self._get_captions_by_type(video_id, subtitles_id, 'subtitles')
 156
 157     def _get_automatic_captions(self, video_id, subtitles_id, hl):
 158         if not subtitles_id or not hl:
 159             return
 160         self._download_subtitles_xml(video_id, subtitles_id, hl)
 161         if not self._captions_xml:
 162             return
 163         track = self._captions_xml.find('track')
 164         if track is None:
 165             return
 166         origin_lang_code = track.attrib.get('lang_code')
 167         if not origin_lang_code:
 168             return
 169         return self._get_captions_by_type(
 170             video_id, subtitles_id, 'automatic_captions', origin_lang_code)
 171
 172     def _real_extract(self, url):
 173         video_id = self._match_id(url)
 174         webpage = self._download_webpage(
 175             'http://docs.google.com/file/d/%s' % video_id, video_id)
 176
 177         title = self._search_regex(
 178             r'"title"\s*,\s*"([^"]+)', webpage, 'title',
 179             default=None) or self._og_search_title(webpage)
 180         duration = int_or_none(self._search_regex(
 181             r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds',
 182             default=None))
 183
 184         formats = []
 185         fmt_stream_map = self._search_regex(
 186             r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage,
 187             'fmt stream map', default='').split(',')
 188         fmt_list = self._search_regex(
 189             r'"fmt_list"\s*,\s*"([^"]+)', webpage,
 190             'fmt_list', default='').split(',')
 191         if fmt_stream_map and fmt_list:
 192             resolutions = {}
 193             for fmt in fmt_list:
 194                 mobj = re.search(
 195                     r'^(?P<format_id>\d+)/(?P<width>\d+)[xX](?P<height>\d+)', fmt)
 196                 if mobj:
 197                     resolutions[mobj.group('format_id')] = (
 198                         int(mobj.group('width')), int(mobj.group('height')))
 199
 200             for fmt_stream in fmt_stream_map:
 201                 fmt_stream_split = fmt_stream.split('|')
 202                 if len(fmt_stream_split) < 2:
 203                     continue
 204                 format_id, format_url = fmt_stream_split[:2]
 205                 f = {
 206                     'url': lowercase_escape(format_url),
 207                     'format_id': format_id,
 208                     'ext': self._FORMATS_EXT[format_id],
 209                 }
 210                 resolution = resolutions.get(format_id)
 211                 if resolution:
 212                     f.update({
 213                         'width': resolution[0],
 214                         'height': resolution[1],
 215                     })
 216                 formats.append(f)
 217
 218         source_url = update_url_query(
 219             'https://drive.google.com/uc', {
 220                 'id': video_id,
 221                 'export': 'download',
 222             })
 223
 224         def request_source_file(source_url, kind):
 225             return self._request_webpage(
 226                 source_url, video_id, note='Requesting %s file' % kind,
 227                 errnote='Unable to request %s file' % kind, fatal=False)
 228         urlh = request_source_file(source_url, 'source')
 229         if urlh:
 230             def add_source_format(urlh):
 231                 formats.append({
 232                     # Use redirect URLs as download URLs in order to calculate
 233                     # correct cookies in _calc_cookies.
 234                     # Using original URLs may result in redirect loop due to
 235                     # google.com's cookies mistakenly used for googleusercontent.com
 236                     # redirect URLs (see #23919).
 237                     'url': urlh.geturl(),
 238                     'ext': determine_ext(title, 'mp4').lower(),
 239                     'format_id': 'source',
 240                     'quality': 1,
 241                 })
 242             if urlh.headers.get('Content-Disposition'):
 243                 add_source_format(urlh)
 244             else:
 245                 confirmation_webpage = self._webpage_read_content(
 246                     urlh, url, video_id, note='Downloading confirmation page',
 247                     errnote='Unable to confirm download', fatal=False)
 248                 if confirmation_webpage:
 249                     confirm = self._search_regex(
 250                         r'confirm=([^&"\']+)', confirmation_webpage,
 251                         'confirmation code', fatal=False)
 252                     if confirm:
 253                         confirmed_source_url = update_url_query(source_url, {
 254                             'confirm': confirm,
 255                         })
 256                         urlh = request_source_file(confirmed_source_url, 'confirmed source')
 257                         if urlh and urlh.headers.get('Content-Disposition'):
 258                             add_source_format(urlh)
 259
 260         if not formats:
 261             reason = self._search_regex(
 262                 r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None)
 263             if reason:
 264                 raise ExtractorError(reason, expected=True)
 265
 266         self._sort_formats(formats)
 267
 268         hl = self._search_regex(
 269             r'"hl"\s*,\s*"([^"]+)', webpage, 'hl', default=None)
 270         subtitles_id = None
 271         ttsurl = self._search_regex(
 272             r'"ttsurl"\s*,\s*"([^"]+)', webpage, 'ttsurl', default=None)
 273         if ttsurl:
 274             # the video Id for subtitles will be the last value in the ttsurl
 275             # query string
 276             subtitles_id = ttsurl.encode('utf-8').decode(
 277                 'unicode_escape').split('=')[-1]
 278
 279         return {
 280             'id': video_id,
 281             'title': title,
 282             'thumbnail': self._og_search_thumbnail(webpage, default=None),
 283             'duration': duration,
 284             'formats': formats,
 285             'subtitles': self.extract_subtitles(video_id, subtitles_id, hl),
 286             'automatic_captions': self.extract_automatic_captions(
 287                 video_id, subtitles_id, hl),
 288         }