Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/googledrive.py
Prepare to release.
[youtubedl] / youtube_dl / extractor / googledrive.py
1 from __future__ import unicode_literals
2
3 import re
4
5 from .common import InfoExtractor
6 from ..utils import (
7 determine_ext,
8 ExtractorError,
9 int_or_none,
10 lowercase_escape,
11 update_url_query,
12 )
13
14
class GoogleDriveIE(InfoExtractor):
    """Extractor for videos hosted on Google Drive / Google Docs.

    Formats are collected from two places: the transcoded streams listed in
    the file page (``fmt_stream_map`` / ``fmt_list``) and the original
    ("source") file offered by the ``uc?export=download`` endpoint.
    Subtitles and automatic captions are read from the ``timedtext`` endpoint.
    """

    _VALID_URL = r'''(?x)
                        https?://
                            (?:
                                (?:docs|drive)\.google\.com/
                                (?:
                                    (?:uc|open)\?.*?id=|
                                    file/d/
                                )|
                                video\.google\.com/get_player\?.*?docid=
                            )
                            (?P<id>[a-zA-Z0-9_-]{28,})
                    '''
    _TESTS = [{
        'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
        'md5': '5c602afbbf2c1db91831f5d82f678554',
        'info_dict': {
            'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ',
            'ext': 'mp4',
            'title': 'Big Buck Bunny.mp4',
            'duration': 45,
        }
    }, {
        # video can't be watched anonymously due to view count limit reached,
        # but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046)
        'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view',
        'md5': 'bfbd670d03a470bb1e6d4a257adec12e',
        'info_dict': {
            'id': '0B-vUyvmDLdWDcEt4WjBqcmI2XzQ',
            'ext': 'mp4',
            'title': 'Annabelle Creation (2017)- Z.V1 [TH].MP4',
        }
    }, {
        # video id is longer than 28 characters
        'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit',
        'info_dict': {
            'id': '1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ',
            'ext': 'mp4',
            'title': 'Andreea Banica feat Smiley - Hooky Song (Official Video).mp4',
            'duration': 189,
        },
        'only_matching': True,
    }, {
        'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
        'only_matching': True,
    }, {
        'url': 'https://drive.google.com/uc?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
        'only_matching': True,
    }]
    # Container extension for each known stream format id.
    _FORMATS_EXT = {
        '5': 'flv',
        '6': 'flv',
        '13': '3gp',
        '17': '3gp',
        '18': 'mp4',
        '22': 'mp4',
        '34': 'flv',
        '35': 'flv',
        '36': '3gp',
        '37': 'mp4',
        '38': 'mp4',
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
        '59': 'mp4',
    }
    _BASE_URL_CAPTIONS = 'https://drive.google.com/timedtext'
    # XML tag that lists the entries for each kind of captions.
    _CAPTIONS_ENTRY_TAG = {
        'subtitles': 'track',
        'automatic_captions': 'target',
    }
    # Per-video captions cache.  These class-level values are only defaults:
    # _download_subtitles_xml() rebinds all three on the instance, so the
    # list below is never mutated in place.
    _caption_formats_ext = []
    _captions_xml = None
    _captions_video_id = None

    @staticmethod
    def _extract_url(webpage):
        """Return a canonical Drive file URL for the first embedded player.

        Looks for an <iframe> whose src points at a Google video player or a
        Drive/Docs file.  Returns None when no such iframe is found.
        """
        mobj = re.search(
            r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})',
            webpage)
        if mobj:
            return 'https://drive.google.com/file/d/%s' % mobj.group('id')

    def _download_subtitles_xml(self, video_id, subtitles_id, hl):
        """Fetch the captions listing for video_id and cache it on self.

        The listing is requested at most once per video id, so the subtitles
        and automatic captions lookups for one video share a single request.
        A different video id discards the previous cache.  On failure
        self._captions_xml is left as None.
        """
        if self._captions_video_id == video_id:
            return
        # Start from a clean per-video state; rebinding (instead of appending
        # to the inherited list) keeps the class-level default untouched.
        self._captions_video_id = video_id
        self._caption_formats_ext = []
        self._captions_xml = None

        captions_xml = self._download_xml(
            self._BASE_URL_CAPTIONS, video_id, query={
                'id': video_id,
                'vid': subtitles_id,
                'hl': hl,
                'v': video_id,
                'type': 'list',
                'tlangs': '1',
                'fmts': '1',
                'vssids': '1',
            }, note='Downloading subtitles XML',
            errnote='Unable to download subtitles XML', fatal=False)
        # NOTE(review): with fatal=False the helper is expected to hand back a
        # falsy non-element value (None or False) on failure - confirm.
        # Identity checks are used because an Element without children is
        # itself falsy.
        if captions_xml is None or captions_xml is False:
            return
        self._captions_xml = captions_xml
        for f in captions_xml.findall('format'):
            if f.attrib.get('fmt_code') and not f.attrib.get('default'):
                self._caption_formats_ext.append(f.attrib['fmt_code'])

    def _get_captions_by_type(self, video_id, subtitles_id, caption_type,
                              origin_lang_code=None):
        """Build a {lang_code: [{'url', 'ext'}, ...]} dict from the cached XML.

        caption_type is a key of _CAPTIONS_ENTRY_TAG.  When origin_lang_code
        is given, every URL requests that language as 'lang' and the entry's
        own language as 'tlang'; otherwise the entry's language is 'lang'.
        Returns None when there is nothing to look up.
        """
        if not subtitles_id or not caption_type:
            return
        if self._captions_xml is None:
            return
        captions = {}
        for caption_entry in self._captions_xml.findall(
                self._CAPTIONS_ENTRY_TAG[caption_type]):
            caption_lang_code = caption_entry.attrib.get('lang_code')
            if not caption_lang_code:
                continue
            caption_format_data = []
            for caption_format in self._caption_formats_ext:
                query = {
                    'vid': subtitles_id,
                    'v': video_id,
                    'fmt': caption_format,
                    'lang': (caption_lang_code if origin_lang_code is None
                             else origin_lang_code),
                    'type': 'track',
                    'name': '',
                    'kind': '',
                }
                if origin_lang_code is not None:
                    query.update({'tlang': caption_lang_code})
                caption_format_data.append({
                    'url': update_url_query(self._BASE_URL_CAPTIONS, query),
                    'ext': caption_format,
                })
            captions[caption_lang_code] = caption_format_data
        return captions

    def _get_subtitles(self, video_id, subtitles_id, hl):
        """Return the subtitles dict for the video, or None if unavailable."""
        if not subtitles_id or not hl:
            return
        self._download_subtitles_xml(video_id, subtitles_id, hl)
        if self._captions_xml is None:
            return
        return self._get_captions_by_type(video_id, subtitles_id, 'subtitles')

    def _get_automatic_captions(self, video_id, subtitles_id, hl):
        """Return the automatic captions dict, or None if unavailable.

        The language of the first <track> entry is used as the origin
        language for every caption URL.
        """
        if not subtitles_id or not hl:
            return
        self._download_subtitles_xml(video_id, subtitles_id, hl)
        if self._captions_xml is None:
            return
        track = self._captions_xml.find('track')
        if track is None:
            return
        origin_lang_code = track.attrib.get('lang_code')
        if not origin_lang_code:
            return
        return self._get_captions_by_type(
            video_id, subtitles_id, 'automatic_captions', origin_lang_code)

    def _real_extract(self, url):
        """Extract title, duration, formats and captions for a Drive video.

        Raises ExtractorError (expected) with the reason given by the page
        when no format could be found and the page states a reason.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(
            'http://docs.google.com/file/d/%s' % video_id, video_id)

        title = self._search_regex(
            r'"title"\s*,\s*"([^"]+)', webpage, 'title',
            default=None) or self._og_search_title(webpage)
        duration = int_or_none(self._search_regex(
            r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds',
            default=None))

        formats = []
        # str.split() always yields at least [''], so a missing map simply
        # produces entries that are skipped by the checks below.
        fmt_stream_map = self._search_regex(
            r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage,
            'fmt stream map', default='').split(',')
        fmt_list = self._search_regex(
            r'"fmt_list"\s*,\s*"([^"]+)', webpage,
            'fmt_list', default='').split(',')

        # format id -> (width, height)
        resolutions = {}
        for fmt in fmt_list:
            mobj = re.search(
                r'^(?P<format_id>\d+)/(?P<width>\d+)[xX](?P<height>\d+)', fmt)
            if mobj:
                resolutions[mobj.group('format_id')] = (
                    int(mobj.group('width')), int(mobj.group('height')))

        for fmt_stream in fmt_stream_map:
            fmt_stream_split = fmt_stream.split('|')
            if len(fmt_stream_split) < 2:
                continue
            format_id, format_url = fmt_stream_split[:2]
            f = {
                'url': lowercase_escape(format_url),
                'format_id': format_id,
            }
            # A format id missing from the table must not abort extraction;
            # the extension is then left unset (presumably resolved later
            # from the URL - confirm).
            ext = self._FORMATS_EXT.get(format_id)
            if ext:
                f['ext'] = ext
            resolution = resolutions.get(format_id)
            if resolution:
                f.update({
                    'width': resolution[0],
                    'height': resolution[1],
                })
            formats.append(f)

        source_url = update_url_query(
            'https://drive.google.com/uc', {
                'id': video_id,
                'export': 'download',
            })

        def request_source_file(source_url, kind):
            return self._request_webpage(
                source_url, video_id, note='Requesting %s file' % kind,
                errnote='Unable to request %s file' % kind, fatal=False)

        def add_source_format(urlh):
            formats.append({
                # Use redirect URLs as download URLs in order to calculate
                # correct cookies in _calc_cookies.
                # Using original URLs may result in redirect loop due to
                # google.com's cookies mistakenly used for googleusercontent.com
                # redirect URLs (see #23919).
                'url': urlh.geturl(),
                'ext': determine_ext(title, 'mp4').lower(),
                'format_id': 'source',
                'quality': 1,
            })

        urlh = request_source_file(source_url, 'source')
        if urlh:
            if urlh.headers.get('Content-Disposition'):
                # The response is the file itself.
                add_source_format(urlh)
            else:
                # Otherwise the response is a page carrying a confirmation
                # code that has to be sent back to obtain the file.
                confirmation_webpage = self._webpage_read_content(
                    urlh, url, video_id, note='Downloading confirmation page',
                    errnote='Unable to confirm download', fatal=False)
                if confirmation_webpage:
                    confirm = self._search_regex(
                        r'confirm=([^&"\']+)', confirmation_webpage,
                        'confirmation code', fatal=False)
                    if confirm:
                        confirmed_source_url = update_url_query(source_url, {
                            'confirm': confirm,
                        })
                        urlh = request_source_file(confirmed_source_url, 'confirmed source')
                        if urlh and urlh.headers.get('Content-Disposition'):
                            add_source_format(urlh)

        if not formats:
            reason = self._search_regex(
                r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None)
            if reason:
                raise ExtractorError(reason, expected=True)

        self._sort_formats(formats)

        hl = self._search_regex(
            r'"hl"\s*,\s*"([^"]+)', webpage, 'hl', default=None)
        subtitles_id = None
        ttsurl = self._search_regex(
            r'"ttsurl"\s*,\s*"([^"]+)', webpage, 'ttsurl', default=None)
        if ttsurl:
            # the video Id for subtitles will be the last value in the ttsurl
            # query string
            subtitles_id = ttsurl.encode('utf-8').decode(
                'unicode_escape').split('=')[-1]

        return {
            'id': video_id,
            'title': title,
            'thumbnail': self._og_search_thumbnail(webpage, default=None),
            'duration': duration,
            'formats': formats,
            'subtitles': self.extract_subtitles(video_id, subtitles_id, hl),
            'automatic_captions': self.extract_automatic_captions(
                video_id, subtitles_id, hl),
        }