]>
Raphaƫl G. Git Repositories - youtubedl/blob - youtube_dl/extractor/googledrive.py
589e4d5c371480d590b504dd1a3738a858c80790
   1 from __future__ 
import unicode_literals
 
   5 from .common 
import InfoExtractor
 
  15 class GoogleDriveIE(InfoExtractor
): 
  19                                 (?:docs|drive)\.google\.com/ 
  24                                 video\.google\.com/get_player\?.*?docid= 
  26                             (?P<id>[a-zA-Z0-9_-]{28,}) 
  29         'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', 
  30         'md5': '5c602afbbf2c1db91831f5d82f678554', 
  32             'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ', 
  34             'title': 'Big Buck Bunny.mp4', 
  38         # video can't be watched anonymously due to view count limit reached, 
  39         # but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046) 
  40         'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view', 
  41         'md5': 'bfbd670d03a470bb1e6d4a257adec12e', 
  43             'id': '0B-vUyvmDLdWDcEt4WjBqcmI2XzQ', 
  45             'title': 'Annabelle Creation (2017)- Z.V1 [TH].MP4', 
  48         # video id is longer than 28 characters 
  49         'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit', 
  51             'id': '1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ', 
  53             'title': 'Andreea Banica feat Smiley - Hooky Song (Official Video).mp4', 
  56         'only_matching': True, 
  58         'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28', 
  59         'only_matching': True, 
  61         'url': 'https://drive.google.com/uc?id=0B2fjwgkl1A_CX083Tkowdmt6d28', 
  62         'only_matching': True, 
  82     _BASE_URL_CAPTIONS 
= 'https://drive.google.com/timedtext' 
  83     _CAPTIONS_ENTRY_TAG 
= { 
  85         'automatic_captions': 'target', 
  87     _caption_formats_ext 
= [] 
  91     def _extract_url(webpage
): 
  93             r
'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})', 
  96             return 'https://drive.google.com/file/d/%s' % mobj
.group('id') 
  98     def _download_subtitles_xml(self
, video_id
, subtitles_id
, hl
): 
  99         if self
._captions
_xml
: 
 101         self
._captions
_xml 
= self
._download
_xml
( 
 102             self
._BASE
_URL
_CAPTIONS
, video_id
, query
={ 
 111             }, note
='Downloading subtitles XML', 
 112             errnote
='Unable to download subtitles XML', fatal
=False) 
 113         if self
._captions
_xml
: 
 114             for f 
in self
._captions
_xml
.findall('format'): 
 115                 if f
.attrib
.get('fmt_code') and not f
.attrib
.get('default'): 
 116                     self
._caption
_formats
_ext
.append(f
.attrib
['fmt_code']) 
 118     def _get_captions_by_type(self
, video_id
, subtitles_id
, caption_type
, 
 119                               origin_lang_code
=None): 
 120         if not subtitles_id 
or not caption_type
: 
 123         for caption_entry 
in self
._captions
_xml
.findall( 
 124                 self
._CAPTIONS
_ENTRY
_TAG
[caption_type
]): 
 125             caption_lang_code 
= caption_entry
.attrib
.get('lang_code') 
 126             if not caption_lang_code
: 
 128             caption_format_data 
= [] 
 129             for caption_format 
in self
._caption
_formats
_ext
: 
 133                     'fmt': caption_format
, 
 134                     'lang': (caption_lang_code 
if origin_lang_code 
is None 
 135                              else origin_lang_code
), 
 140                 if origin_lang_code 
is not None: 
 141                     query
.update({'tlang': caption_lang_code
}) 
 142                 caption_format_data
.append({ 
 143                     'url': update_url_query(self
._BASE
_URL
_CAPTIONS
, query
), 
 144                     'ext': caption_format
, 
 146             captions
[caption_lang_code
] = caption_format_data
 
 149     def _get_subtitles(self
, video_id
, subtitles_id
, hl
): 
 150         if not subtitles_id 
or not hl
: 
 152         self
._download
_subtitles
_xml
(video_id
, subtitles_id
, hl
) 
 153         if not self
._captions
_xml
: 
 155         return self
._get
_captions
_by
_type
(video_id
, subtitles_id
, 'subtitles') 
 157     def _get_automatic_captions(self
, video_id
, subtitles_id
, hl
): 
 158         if not subtitles_id 
or not hl
: 
 160         self
._download
_subtitles
_xml
(video_id
, subtitles_id
, hl
) 
 161         if not self
._captions
_xml
: 
 163         track 
= self
._captions
_xml
.find('track') 
 166         origin_lang_code 
= track
.attrib
.get('lang_code') 
 167         if not origin_lang_code
: 
 169         return self
._get
_captions
_by
_type
( 
 170             video_id
, subtitles_id
, 'automatic_captions', origin_lang_code
) 
 172     def _real_extract(self
, url
): 
 173         video_id 
= self
._match
_id
(url
) 
 174         webpage 
= self
._download
_webpage
( 
 175             'http://docs.google.com/file/d/%s' % video_id
, video_id
) 
 177         title 
= self
._search
_regex
( 
 178             r
'"title"\s*,\s*"([^"]+)', webpage
, 'title', 
 179             default
=None) or self
._og
_search
_title
(webpage
) 
 180         duration 
= int_or_none(self
._search
_regex
( 
 181             r
'"length_seconds"\s*,\s*"([^"]+)', webpage
, 'length seconds', 
 185         fmt_stream_map 
= self
._search
_regex
( 
 186             r
'"fmt_stream_map"\s*,\s*"([^"]+)', webpage
, 
 187             'fmt stream map', default
='').split(',') 
 188         fmt_list 
= self
._search
_regex
( 
 189             r
'"fmt_list"\s*,\s*"([^"]+)', webpage
, 
 190             'fmt_list', default
='').split(',') 
 191         if fmt_stream_map 
and fmt_list
: 
 195                     r
'^(?P<format_id>\d+)/(?P<width>\d+)[xX](?P<height>\d+)', fmt
) 
 197                     resolutions
[mobj
.group('format_id')] = ( 
 198                         int(mobj
.group('width')), int(mobj
.group('height'))) 
 200             for fmt_stream 
in fmt_stream_map
: 
 201                 fmt_stream_split 
= fmt_stream
.split('|') 
 202                 if len(fmt_stream_split
) < 2: 
 204                 format_id
, format_url 
= fmt_stream_split
[:2] 
 206                     'url': lowercase_escape(format_url
), 
 207                     'format_id': format_id
, 
 208                     'ext': self
._FORMATS
_EXT
[format_id
], 
 210                 resolution 
= resolutions
.get(format_id
) 
 213                         'width': resolution
[0], 
 214                         'height': resolution
[1], 
 218         source_url 
= update_url_query( 
 219             'https://drive.google.com/uc', { 
 221                 'export': 'download', 
 223         urlh 
= self
._request
_webpage
( 
 224             source_url
, video_id
, note
='Requesting source file', 
 225             errnote
='Unable to request source file', fatal
=False) 
 227             def add_source_format(src_url
): 
 230                     'ext': determine_ext(title
, 'mp4').lower(), 
 231                     'format_id': 'source', 
 234             if urlh
.headers
.get('Content-Disposition'): 
 235                 add_source_format(source_url
) 
 237                 confirmation_webpage 
= self
._webpage
_read
_content
( 
 238                     urlh
, url
, video_id
, note
='Downloading confirmation page', 
 239                     errnote
='Unable to confirm download', fatal
=False) 
 240                 if confirmation_webpage
: 
 241                     confirm 
= self
._search
_regex
( 
 242                         r
'confirm=([^&"\']+)', confirmation_webpage, 
 243                         'confirmation code
', fatal=False) 
 245                         add_source_format(update_url_query(source_url, { 
 250             reason = self._search_regex( 
 251                 r'"reason"\s
*,\s
*"([^"]+)', webpage, 'reason
', default=None) 
 253                 raise ExtractorError(reason, expected=True) 
 255         self._sort_formats(formats) 
 257         hl = self._search_regex( 
 258             r'"hl"\s
*,\s
*"([^"]+)', webpage, 'hl
', default=None) 
 260         ttsurl = self._search_regex( 
 261             r'"ttsurl"\s
*,\s
*"([^"]+)', webpage, 'ttsurl
', default=None) 
 263             # the video Id for subtitles will be the last value in the ttsurl 
 265             subtitles_id = ttsurl.encode('utf
-8').decode( 
 266                 'unicode_escape
').split('=')[-1] 
 271             'thumbnail
': self._og_search_thumbnail(webpage, default=None), 
 272             'duration
': duration, 
 274             'subtitles
': self.extract_subtitles(video_id, subtitles_id, hl), 
 275             'automatic_captions
': self.extract_automatic_captions( 
 276                 video_id, subtitles_id, hl),