from __future__ import unicode_literals

import base64
import datetime
import hashlib
import json
import netrc
import os
import random
import re
import socket
import sys
import time
import math

from ..compat import (
    compat_cookiejar,
    compat_cookies,
    compat_etree_fromstring,
    compat_getpass,
    compat_http_client,
    compat_os_name,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse_unquote,
    compat_urllib_parse_urlencode,
    compat_urllib_request,
    compat_urlparse,
)
from ..downloader.f4m import remove_encrypted_media
from ..utils import (
    NO_DEFAULT,
    age_restricted,
    base_url,
    bug_reports_message,
    clean_html,
    compiled_regex_type,
    determine_ext,
    determine_protocol,
    error_to_compat_str,
    ExtractorError,
    extract_attributes,
    fix_xml_ampersands,
    float_or_none,
    GeoRestrictedError,
    GeoUtils,
    int_or_none,
    js_to_json,
    mimetype2ext,
    orderedSet,
    parse_codecs,
    parse_duration,
    parse_iso8601,
    parse_m3u8_attributes,
    RegexNotFoundError,
    sanitized_Request,
    sanitize_filename,
    unescapeHTML,
    unified_strdate,
    unified_timestamp,
    update_Request,
    update_url_query,
    urljoin,
    url_basename,
    xpath_element,
    xpath_text,
    xpath_with_ns,
)


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the YoutubeDL. The YoutubeDL processes this
    information, possibly downloading the video to the file system, among
    other possible outcomes.

    The type field determines the type of the result.
    By far the most common value (and the default if _type is missing) is
    "video", which indicates a single video.

    For a video, the dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url          Mandatory. The URL of the video file
                    * manifest_url
                                   The URL of the manifest file in case of
                                   fragmented media (DASH, hls, hds)
                    * ext          Will be calculated from URL if missing
                    * format       A human-readable description of the format
                                   ("mp4 container with h264/opus").
                                   Calculated from the format_id, width, height,
                                   and format_note fields if missing.
                    * format_id    A short description of the format
                                   ("mp4_h264_opus" or "19").
                                   Technically optional, but strongly recommended.
                    * format_note  Additional info about the format
                                   ("3D" or "DASH video")
                    * width        Width of the video, if known
                    * height       Height of the video, if known
                    * resolution   Textual description of width and height
                    * tbr          Average bitrate of audio and video in KBit/s
                    * abr          Average audio bitrate in KBit/s
                    * acodec       Name of the audio codec in use
                    * asr          Audio sampling rate in Hertz
                    * vbr          Average video bitrate in KBit/s
                    * fps          Frame rate
                    * vcodec       Name of the video codec in use
                    * container    Name of the container format
                    * filesize     The number of bytes, if known in advance
                    * filesize_approx
                                   An estimate for the number of bytes
                    * player_url   SWF Player URL (used for rtmpdump).
                    * protocol     The protocol that will be used for the actual
                                   download, lower-case.
                                   "http", "https", "rtsp", "rtmp", "rtmpe",
                                   "m3u8", "m3u8_native" or "http_dash_segments".
                    * fragment_base_url
                                   Base URL for fragments. Each fragment's path
                                   value (if present) will be relative to
                                   this URL.
                    * fragments    A list of fragments of a fragmented media.
                                   Each fragment entry must contain either a url
                                   or a path. If a url is present it should be
                                   used by a client directly. Otherwise both
                                   path and fragment_base_url must be present.
                                   Here is the list of all potential fields:
                                   * "url" - fragment's URL
                                   * "path" - fragment's path relative to
                                              fragment_base_url
                                   * "duration" (optional, int or float)
                                   * "filesize" (optional, int)
                    * preference   Order number of this format. If this field is
                                   present and not None, the formats get sorted
                                   by this field, regardless of all other values.
                                   -1 for default (order by other properties),
                                   -2 or smaller for less than default.
                                   < -1000 to hide the format (if there is
                                   another one which is strictly better)
                    * language     Language code, e.g. "de" or "en-US".
                    * language_preference
                                   Is this in the language mentioned in
                                   the URL?
                                   10 if it's what the URL is about,
                                   -1 for default (don't know),
                                   -10 otherwise, other values reserved for now.
                    * quality      Order number of the video quality of this
                                   format, irrespective of the file format.
                                   -1 for default (order by other properties),
                                   -2 or smaller for less than default.
                    * source_preference
                                   Order number for this video source
                                   (quality takes higher priority)
                                   -1 for default (order by other properties),
                                   -2 or smaller for less than default.
                    * http_headers A dictionary of additional HTTP headers
                                   to add to the request.
                    * stretched_ratio
                                   If given and not 1, indicates that the
                                   video's pixels are not square.
                                   width : height ratio as float.
                    * no_resume    The server does not support resuming the
                                   (HTTP or RTMP) download. Boolean.

    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "id" (optional, string) - Thumbnail format ID
                        * "url"
                        * "preference" (optional, int) - quality of the image
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
                        * "filesize" (optional, int)
    thumbnail:      Full URL to a video thumbnail image.
    description:    Full video description.
    uploader:       Full name of the video uploader.
    license:        License name the video is licensed under.
    creator:        The creator of the video.
    release_date:   The date (YYYYMMDD) when the video was released.
    timestamp:      UNIX timestamp of the moment the video became available.
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    uploader_url:   Full URL to a personal webpage of the video uploader.
    location:       Physical location where the video was filmed.
    subtitles:      The available subtitles as a dictionary in the format
                    {tag: subformats}. "tag" is usually a language code, and
                    "subformats" is a list sorted from lower to higher
                    preference, each element is a dictionary with the "ext"
                    entry and one of:
                        * "data": The subtitles file contents
                        * "url": A URL pointing to the subtitles file
                    "ext" will be calculated from URL if missing
    automatic_captions: Like 'subtitles', used by the YoutubeIE for
                    automatically generated captions
    duration:       Length of the video in seconds, as an integer or float.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
    comment_count:  Number of comments on the video
    comments:       A list of comments, each with one or more of the following
                    properties (all optional, except that at least one of
                    "text" or "html" must be present):
                        * "author" - human-readable name of the comment author
                        * "author_id" - user ID of the comment author
                        * "id" - Comment ID
                        * "html" - Comment as HTML
                        * "text" - Plain text of the comment
                        * "timestamp" - UNIX timestamp of comment
                        * "parent" - ID of the comment this one is replying to.
                                     Set to "root" to indicate that this is a
                                     comment to the original video.
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage, which, if given to youtube-dl,
                    should allow getting the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]
    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
    is_live:        True, False, or None (=unknown). Whether this video is a
                    live stream that goes on instead of a fixed-length video.
    start_time:     Time in seconds where the reproduction should start, as
                    specified in the URL.
    end_time:       Time in seconds where the reproduction should end, as
                    specified in the URL.

    The following fields should only be used when the video belongs to some logical
    chapter or section:

    chapter:        Name or title of the chapter the video belongs to.
    chapter_number: Number of the chapter the video belongs to, as an integer.
    chapter_id:     Id of the chapter the video belongs to, as a unicode string.

    The following fields should only be used when the video is an episode of some
    series, programme or podcast:

    series:         Title of the series or programme the video episode belongs to.
    season:         Title of the season the video episode belongs to.
    season_number:  Number of the season the video episode belongs to, as an integer.
    season_id:      Id of the season the video episode belongs to, as a unicode string.
    episode:        Title of the video episode. Unlike the mandatory video title field,
                    this field should denote the exact title of the video episode
                    without any kind of decoration.
    episode_number: Number of the video episode within a season, as an integer.
    episode_id:     Id of the video episode, as a unicode string.

    The following fields should only be used when the media is a track or a part of
    a music album:

    track:          Title of the track.
    track_number:   Number of the track within an album or a disc, as an integer.
    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
                    as a unicode string.
    artist:         Artist(s) of the track.
    genre:          Genre(s) of the track.
    album:          Title of the album the track belongs to.
    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
    album_artist:   List of all artists appearing on the album (e.g.
                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
                    and compilations).
    disc_number:    Number of the disc or other physical medium the track belongs to,
                    as an integer.
    release_year:   Year (YYYY) when the album was released.

    Unless mentioned otherwise, the fields should be Unicode strings.

    Unless mentioned otherwise, None is equivalent to absence of information.
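
    A minimal single-video result might look like this (the values are
    illustrative only):

        {
            'id': '4234987',
            'title': 'Dancing naked mole rats',
            'url': 'https://example.com/video.mp4',
            'ext': 'mp4',
        }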


    _type "playlist" indicates multiple videos.
    There must be a key "entries", which is a list, an iterable, or a PagedList
    object, each element of which is a valid dictionary by this specification.

    Additionally, playlists can have "title", "description" and "id" attributes
    with the same semantics as videos (see above).


    _type "multi_video" indicates that there are multiple videos that
    form a single show, for example, multiple acts of an opera or TV episode.
    It must have an entries key like a playlist and contain all the keys
    required for a video at the same time.


    _type "url" indicates that the video must be extracted from another
    location, possibly by a different extractor. Its only required key is:
    "url" - the next URL to extract.
    The key "ie_key" can be set to the class name (minus the trailing "IE",
    e.g. "Youtube") if the extractor class is known in advance.
    Additionally, the dictionary may have any properties of the resolved entity
    known in advance, for example "title" if the title of the referred video is
    known ahead of time.


    _type "url_transparent" entities have the same specification as "url", but
    indicate that the given additional information is more precise than the one
    associated with the resolved URL.
    This is useful when a site employs a video service that hosts the video and
    its technical metadata, but that video service does not embed a useful
    title, description etc.


    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _GEO_BYPASS attribute may be set to False in order to disable
    geo restriction bypass mechanisms for a particular extractor.
    Note that it won't disable explicit geo restriction bypass based on
    country code provided with geo_bypass_country. (experimental)

    _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
    countries for this extractor. One of these countries will be used by
    the geo restriction bypass mechanism right away in order to bypass
    geo restriction, of course, if the mechanism is not disabled. (experimental)

    NB: both these geo attributes are experimental and may change in future
    or be completely removed.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _x_forwarded_for_ip = None
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self._x_forwarded_for_ip = None
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def _match_id(cls, url):
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        m = cls._VALID_URL_RE.match(url)
        assert m
        return m.group('id')
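
    # Illustrative sketch (extractor and URL invented): for a hypothetical
    # extractor with
    #     _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>[0-9]+)'
    # suitable('https://example.com/watch/123') is True and
    # _match_id('https://example.com/watch/123') returns '123'; note that
    # _match_id assumes _VALID_URL defines a named group called "id".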

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        self._initialize_geo_bypass(self._GEO_COUNTRIES)
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def _initialize_geo_bypass(self, countries):
        """
        Initialize geo restriction bypass mechanism.

        This method is used to initialize the geo bypass mechanism based on
        faking the X-Forwarded-For HTTP header. A random country from the
        provided country list is selected and a random IP belonging to this
        country is generated. This IP will be passed as X-Forwarded-For HTTP
        header in all subsequent HTTP requests.

        This method is called for initial geo bypass mechanism initialization
        during instance initialization with _GEO_COUNTRIES.

        You may also manually call it from an extractor's code if geo countries
        information is not available beforehand (e.g. obtained during
        extraction) or for some other reason.
        """
        if not self._x_forwarded_for_ip:
            country_code = self._downloader.params.get('geo_bypass_country', None)
            # If there is no explicit country for geo bypass specified and
            # the extractor is known to be geo restricted let's fake IP
            # as X-Forwarded-For right away.
            if (not country_code and
                    self._GEO_BYPASS and
                    self._downloader.params.get('geo_bypass', True) and
                    countries):
                country_code = random.choice(countries)
            if country_code:
                self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
                if self._downloader.params.get('verbose', False):
                    self._downloader.to_stdout(
                        '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
                        % (self._x_forwarded_for_ip, country_code.upper()))
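
    # For example, an extractor that only learns which countries are
    # unrestricted during extraction could call (country list invented):
    #     self._initialize_geo_bypass(['DE', 'AT', 'CH'])
    # before issuing the geo sensitive requests.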

    def extract(self, url):
        """Extracts URL information and returns it as a dict."""
        try:
            for _ in range(2):
                try:
                    self.initialize()
                    ie_result = self._real_extract(url)
                    if self._x_forwarded_for_ip:
                        ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
                    return ie_result
                except GeoRestrictedError as e:
                    if self.__maybe_fake_ip_and_retry(e.countries):
                        continue
                    raise
        except ExtractorError:
            raise
        except compat_http_client.IncompleteRead as e:
            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
        except (KeyError, StopIteration) as e:
            raise ExtractorError('An extractor error has occurred.', cause=e)

    def __maybe_fake_ip_and_retry(self, countries):
        if (not self._downloader.params.get('geo_bypass_country', None) and
                self._GEO_BYPASS and
                self._downloader.params.get('geo_bypass', True) and
                not self._x_forwarded_for_ip and
                countries):
            country_code = random.choice(countries)
            self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
            if self._x_forwarded_for_ip:
                self.report_warning(
                    'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
                    % (self._x_forwarded_for_ip, country_code.upper()))
                return True
        return False

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return compat_str(cls.__name__[:-2])

    @property
    def IE_NAME(self):
        return compat_str(type(self).__name__[:-2])

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen('%s' % (note,))
            else:
                self.to_screen('%s: %s' % (video_id, note))
        if isinstance(url_or_request, compat_urllib_request.Request):
            url_or_request = update_Request(
                url_or_request, data=data, headers=headers, query=query)
        else:
            if query:
                url_or_request = update_url_query(url_or_request, query)
            if data is not None or headers:
                url_or_request = sanitized_Request(url_or_request, data, headers)
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is False:
                return False
            if errnote is None:
                errnote = 'Unable to download webpage'

            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
        """ Returns a tuple (page content as string, URL handle) """
        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        # Some sites check the X-Forwarded-For HTTP header in order to figure
        # out the origin of the client behind a proxy. This allows bypassing
        # geo restriction by faking this header's value to an IP that belongs
        # to some geo unrestricted country. We will do so once we encounter
        # any geo restriction error.
        if self._x_forwarded_for_ip:
            if 'X-Forwarded-For' not in headers:
                headers['X-Forwarded-For'] = self._x_forwarded_for_ip

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
        if urlh is False:
            assert not fatal
            return False
        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
        return (content, urlh)

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'

        return encoding
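
    # E.g. a Content-Type of 'text/html; charset=iso-8859-1' yields
    # 'iso-8859-1'; with no charset in the header, a <meta charset="..."> tag
    # within the first 1024 bytes or a UTF-16 LE byte order mark is consulted
    # before falling back to 'utf-8'.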

    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if prefix is not None:
            webpage_bytes = prefix + webpage_bytes
        if not encoding:
            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen('Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            basen = '%s_%s' % (video_id, url)
            if len(basen) > 240:
                h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen('Saving request to ' + filename)
            # Working around MAX_PATH limitation on Windows (see
            # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
            if compat_os_name == 'nt':
                absfilepath = os.path.abspath(filename)
                if len(absfilepath) > 259:
                    filename = '\\\\?\\' + absfilepath
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        if ('<title>Access to this site is blocked</title>' in content and
                'Websense' in content[:512]):
            msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                'Websense information URL', default=None)
            if blocked_iframe:
                msg += ' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)
        if '<title>The URL you requested has been blocked</title>' in content[:512]:
            msg = (
                'Access to this webpage has been blocked by Indian censorship. '
                'Use a VPN or proxy server (with --proxy) to route around it.')
            block_msg = self._html_search_regex(
                r'</h1><p>(.*?)</p>',
                content, 'block message', default=None)
            if block_msg:
                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
            raise ExtractorError(msg, expected=True)

        return content

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
        """ Returns the data of the page as a string """
        success = False
        try_count = 0
        while success is False:
            try:
                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
                success = True
            except compat_http_client.IncompleteRead as e:
                try_count += 1
                if try_count >= tries:
                    raise e
                self._sleep(timeout, video_id)
        if res is False:
            return res
        else:
            content, _ = res
            return content

    def _download_xml(self, url_or_request, video_id,
                      note='Downloading XML', errnote='Unable to download XML',
                      transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
        """Return the xml as an xml.etree.ElementTree.Element"""
        xml_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
        if xml_string is False:
            return xml_string
        if transform_source:
            xml_string = transform_source(xml_string)
        return compat_etree_fromstring(xml_string.encode('utf-8'))

    def _download_json(self, url_or_request, video_id,
                       note='Downloading JSON metadata',
                       errnote='Unable to download JSON metadata',
                       transform_source=None,
                       fatal=True, encoding=None, data=None, headers={}, query={}):
        json_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal,
            encoding=encoding, data=data, headers=headers, query=query)
        if (not fatal) and json_string is False:
            return None
        return self._parse_json(
            json_string, video_id, transform_source=transform_source, fatal=fatal)

    def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
        if transform_source:
            json_string = transform_source(json_string)
        try:
            return json.loads(json_string)
        except ValueError as ve:
            errmsg = '%s: Failed to parse JSON ' % video_id
            if fatal:
                raise ExtractorError(errmsg, cause=ve)
            else:
                self.report_warning(errmsg + str(ve))

    def report_warning(self, msg, video_id=None):
        idstr = '' if video_id is None else '%s: ' % video_id
        self._downloader.report_warning(
            '[%s] %s%s' % (self.IE_NAME, idstr, msg))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen('%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen('%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen('Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen('Logging in')

    @staticmethod
    def raise_login_required(msg='This video is only available for registered users'):
        raise ExtractorError(
            '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
            expected=True)

    @staticmethod
    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
        raise GeoRestrictedError(msg, countries=countries)

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None, video_title=None):
        """Returns a URL that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        if video_title is not None:
            video_info['title'] = video_title
        return video_info

    def playlist_from_matches(self, matches, video_id, video_title, getter=None, ie=None):
        urlrs = orderedSet(
            self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
            for m in matches)
        return self.playlist_result(
            urlrs, playlist_id=video_id, playlist_title=video_title)

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        if playlist_description:
            video_info['description'] = playlist_description
        return video_info
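
    # Typical usage sketch (the URLs and IDs are invented): an extractor that
    # finds several embedded videos can return
    #     self.playlist_result(
    #         [self.url_result('https://example.com/v/1', ie='Example'),
    #          self.url_result('https://example.com/v/2', ie='Example')],
    #         playlist_id='chan1', playlist_title='Channel 1')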

    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns, returning the first matching group.
        In case of failure return a default value, emit a warning, or raise a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            else:
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        else:
            self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None
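
    # Usage sketch (pattern and webpage variable are illustrative):
    #     title = self._search_regex(
    #         r'<h1 class="title">([^<]+)</h1>', webpage, 'title',
    #         default=None)
    # With default given, a failed match returns the default instead of
    # raising RegexNotFoundError.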

    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_netrc_login_info(self, netrc_machine=None):
        username = None
        password = None
        netrc_machine = netrc_machine or self._NETRC_MACHINE

        if self._downloader.params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(netrc_machine)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError(
                        'No authenticators for %s' % netrc_machine)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(
                    'parsing .netrc: %s' % error_to_compat_str(err))

        return username, password

    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
        """
        Get the login info as (username, password)
        First look for the manually specified credentials using username_option
        and password_option as keys in the params dictionary. If no such
        credentials are available, look in the netrc file using the
        netrc_machine or _NETRC_MACHINE value.
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get(username_option) is not None:
            username = downloader_params[username_option]
            password = downloader_params[password_option]
        else:
            username, password = self._get_netrc_login_info(netrc_machine)

        return username, password

    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verify
        currently just uses the command line option
        If there's no info available, return None
        """
        if self._downloader is None:
            return None
        downloader_params = self._downloader.params

        if downloader_params.get('twofactor') is not None:
            return downloader_params['twofactor']

        return compat_getpass('Type %s and press [Return]: ' % note)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
        property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
                       % {'prop': re.escape(prop)})
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    @staticmethod
    def _meta_regex(prop):
        return r'''(?isx)<meta
                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        if not isinstance(prop, (list, tuple)):
            prop = [prop]
        if name is None:
            name = 'OpenGraph %s' % prop[0]
        og_regexes = []
        for p in prop:
            og_regexes.extend(self._og_regexes(p))
        escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        return self._og_search_property('url', html, **kargs)
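
    # These helpers wrap _og_regexes(); e.g. given a page containing
    #     <meta property="og:title" content="Some title">
    # self._og_search_title(webpage) returns 'Some title' (unescaped). The
    # thumbnail and description variants are non-fatal by default.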

    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        if not isinstance(name, (list, tuple)):
            name = [name]
        if display_name is None:
            display_name = name[0]
        return self._html_search_regex(
            [self._meta_regex(n) for n in name],
            html, display_name, fatal=fatal, group='content', **kwargs)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower())

    def _family_friendly_search(self, html):
        # See http://schema.org/VideoObject
        family_friendly = self._html_search_meta('isFamilyFriendly', html)

        if not family_friendly:
            return None

        RATING_TABLE = {
            '1': 0,
            'true': 0,
            '0': 18,
            'false': 18,
        }
        return RATING_TABLE.get(family_friendly.lower())

    def _twitter_search_player(self, html):
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')

    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
        json_ld = self._search_regex(
            r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
            html, 'JSON-LD', group='json_ld', **kwargs)
        default = kwargs.get('default', NO_DEFAULT)
        if not json_ld:
            return default if default is not NO_DEFAULT else {}
        # JSON-LD may be malformed and thus `fatal` should be respected.
        # At the same time `default` may be passed that assumes `fatal=False`
        # for _search_regex. Let's simulate the same behavior here as well.
        fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False
        return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)

    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if not isinstance(json_ld, (list, tuple, dict)):
            return info
        if isinstance(json_ld, dict):
            json_ld = [json_ld]
        for e in json_ld:
            if e.get('@context') == 'http://schema.org':
                item_type = e.get('@type')
                if expected_type is not None and expected_type != item_type:
                    return info
                if item_type == 'TVEpisode':
                    info.update({
                        'episode': unescapeHTML(e.get('name')),
                        'episode_number': int_or_none(e.get('episodeNumber')),
                        'description': unescapeHTML(e.get('description')),
                    })
                    part_of_season = e.get('partOfSeason')
                    if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
                        info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                    if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
                        info['series'] = unescapeHTML(part_of_series.get('name'))
                elif item_type == 'Article':
                    info.update({
                        'timestamp': parse_iso8601(e.get('datePublished')),
                        'title': unescapeHTML(e.get('headline')),
                        'description': unescapeHTML(e.get('articleBody')),
                    })
                elif item_type == 'VideoObject':
                    info.update({
                        'url': e.get('contentUrl'),
                        'title': unescapeHTML(e.get('name')),
                        'description': unescapeHTML(e.get('description')),
                        'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
                        'duration': parse_duration(e.get('duration')),
                        'timestamp': unified_timestamp(e.get('uploadDate')),
                        'filesize': float_or_none(e.get('contentSize')),
                        'tbr': int_or_none(e.get('bitrate')),
                        'width': int_or_none(e.get('width')),
                        'height': int_or_none(e.get('height')),
                    })
                break
        return dict((k, v) for k, v in info.items() if v is not None)
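
    # Illustrative input (contents invented): a page embedding
    #     <script type="application/ld+json">
    #     {"@context": "http://schema.org", "@type": "VideoObject",
    #      "name": "Clip", "uploadDate": "2017-01-01"}
    #     </script>
    # would yield a dict like {'title': 'Clip', 'timestamp': 1483228800}
    # from _search_json_ld; fields that resolve to None are dropped.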

    @staticmethod
    def _hidden_inputs(html):
        html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
        hidden_inputs = {}
        for input in re.findall(r'(?i)(<input[^>]+>)', html):
            attrs = extract_attributes(input)
            if not attrs:
                continue
            if attrs.get('type') not in ('hidden', 'submit'):
                continue
            name = attrs.get('name') or attrs.get('id')
            value = attrs.get('value')
            if name and value is not None:
                hidden_inputs[name] = value
        return hidden_inputs
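
    # E.g. for html containing (example values)
    #     <input type="hidden" name="csrf" value="abc123">
    # _hidden_inputs(html) returns {'csrf': 'abc123'}.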

    def _form_hidden_inputs(self, form_id, html):
        form = self._search_regex(
            r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
            html, '%s form' % form_id, group='form')
        return self._hidden_inputs(form)

    def _sort_formats(self, formats, field_preference=None):
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            if isinstance(field_preference, (list, tuple)):
                return tuple(
                    f.get(field)
                    if f.get(field) is not None
                    else ('' if field == 'format_id' else -1)
                    for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            protocol = f.get('protocol') or determine_protocol(f)
            proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)

            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
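
    # Extractors normally build the formats list and then call
    #     self._sort_formats(formats)
    # once, just before returning the info dict, so that the formats end up
    # ordered from worst to best quality as the docstring above requires.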

    def _check_formats(self, formats, video_id):
        if formats:
            formats[:] = filter(
                lambda f: self._is_valid_url(
                    f['url'], video_id,
                    item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
                formats)

    @staticmethod
    def _remove_duplicate_formats(formats):
        format_urls = set()
        unique_formats = []
        for f in formats:
            if f['url'] not in format_urls:
                format_urls.add(f['url'])
                unique_formats.append(f)
        formats[:] = unique_formats

    def _is_valid_url(self, url, video_id, item='video', headers={}):
        url = self._proto_relative_url(url, scheme='http:')
        # For now assume non-HTTP(S) URLs are always valid
        if not (url.startswith('http://') or url.startswith('https://')):
            return True
        try:
            self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
            return True
        except ExtractorError as e:
            if isinstance(e.cause, compat_urllib_error.URLError):
                self.to_screen(
                    '%s: %s URL is invalid, skipping' % (video_id, item))
                return False
            raise

    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
        return (
            'http:'
            if self._downloader.params.get('prefer_insecure', False)
            else 'https:')

    def _proto_relative_url(self, url, scheme=None):
        if url is None:
            return url
        if url.startswith('//'):
            if scheme is None:
                scheme = self.http_scheme()
            return scheme + url
        else:
            return url
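
    # E.g. _proto_relative_url('//example.com/v.mp4') returns
    # 'http://example.com/v.mp4' or 'https://example.com/v.mp4' depending on
    # the prefer_insecure setting; absolute URLs are returned unchanged.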

    def _sleep(self, timeout, video_id, msg_template=None):
        if msg_template is None:
            msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
        self.to_screen(msg)
        time.sleep(timeout)

    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
                             fatal=True, m3u8_id=None):
        manifest = self._download_xml(
            manifest_url, video_id, 'Downloading f4m manifest',
            'Unable to download f4m manifest',
            # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
            # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
            transform_source=transform_source,
            fatal=fatal)

        if manifest is False:
            return []

        return self._parse_f4m_formats(
            manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
            transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)

    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
                           fatal=True, m3u8_id=None):
        # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
        if akamai_pv is not None and ';' in akamai_pv.text:
            playerVerificationChallenge = akamai_pv.text.split(';')[0]
            if playerVerificationChallenge.strip() != '':
                return []

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        # Remove unsupported DRM protected media from final formats
        # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
        media_nodes = remove_encrypted_media(media_nodes)
        if not media_nodes:
            return formats
        base_url = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
            'base URL', default=None)
        if base_url:
            base_url = base_url.strip()

        bootstrap_info = xpath_element(
            manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
            'bootstrap info', default=None)

        vcodec = None
        mime_type = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
            'mime type', default=None)
        if mime_type and mime_type.startswith('audio/'):
            vcodec = 'none'

        for i, media_el in enumerate(media_nodes):
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            width = int_or_none(media_el.attrib.get('width'))
            height = int_or_none(media_el.attrib.get('height'))
            format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
            # If <bootstrapInfo> is present, the specified f4m is a
            # stream-level manifest, and only set-level manifests may refer to
            # external resources. See section 11.4 and section 4 of F4M spec
            if bootstrap_info is None:
                media_url = None
                # @href is introduced in 2.0, see section 11.6 of F4M spec
                if manifest_version == '2.0':
                    media_url = media_el.attrib.get('href')
                if media_url is None:
                    media_url = media_el.attrib.get('url')
                if not media_url:
                    continue
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in the parent manifest (this one) and the media_url
                # manifest may differ, leading to an inability to resolve the format
                # by requested bitrate in the f4m downloader
                ext = determine_ext(manifest_url)
                if ext == 'f4m':
                    f4m_formats = self._extract_f4m_formats(
                        manifest_url, video_id, preference=preference, f4m_id=f4m_id,
                        transform_source=transform_source, fatal=fatal)
                    # Sometimes stream-level manifest contains single media entry that
                    # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                    # At the same time parent's media entry in set-level manifest may
                    # contain it. We will copy it from parent in such cases.
                    if len(f4m_formats) == 1:
                        f = f4m_formats[0]
                        f.update({
                            'tbr': f.get('tbr') or tbr,
                            'width': f.get('width') or width,
                            'height': f.get('height') or height,
                            'format_id': f.get('format_id') if not tbr else format_id,
                            'vcodec': vcodec,
                        })
                    formats.extend(f4m_formats)
                    continue
                elif ext == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        manifest_url, video_id, 'mp4', preference=preference,
                        m3u8_id=m3u8_id, fatal=fatal))
                    continue
            formats.append({
                'format_id': format_id,
                'url': manifest_url,
                'manifest_url': manifest_url,
                'ext': 'flv' if bootstrap_info is not None else None,
                'tbr': tbr,
                'width': width,
                'height': height,
                'vcodec': vcodec,
                'preference': preference,
            })
        return formats

    def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
        return {
            'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
            'url': m3u8_url,
            'ext': ext,
            'protocol': 'm3u8',
            'preference': preference - 100 if preference else -100,
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',
        }

    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None,
                              m3u8_id=None, note=None, errnote=None,
                              fatal=True, live=False):

        res = self._download_webpage_handle(
            m3u8_url, video_id,
            note=note or 'Downloading m3u8 information',
            errnote=errnote or 'Failed to download m3u8 information',
            fatal=fatal)
        if res is False:
            return []
        m3u8_doc, urlh = res
        m3u8_url = urlh.geturl()

        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return []

        formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]

        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        # We should try extracting formats only from master playlists [1], i.e.
        # playlists that describe available qualities. On the other hand media
        # playlists [2] should be returned as is since they contain just the media
        # without qualities renditions.
        # Fortunately, a master playlist can be easily distinguished from a media
        # playlist based on particular tags availability. As of [1, 2] master
        # playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [3] the #EXT-X-TARGETDURATION tag is REQUIRED for every media
        # playlist and MUST NOT appear in a master playlist, thus we can clearly
        # detect a media playlist with this criterion.
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
        # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
        # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]
        audio_in_video_stream = {}
        last_info = {}
        last_media = {}
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_info = parse_m3u8_attributes(line)
            elif line.startswith('#EXT-X-MEDIA:'):
                media = parse_m3u8_attributes(line)
                media_type = media.get('TYPE')
                if media_type in ('VIDEO', 'AUDIO'):
                    group_id = media.get('GROUP-ID')
                    media_url = media.get('URI')
                    if media_url:
                        format_id = []
                        for v in (group_id, media.get('NAME')):
                            if v:
                                format_id.append(v)
                        f = {
                            'format_id': '-'.join(format_id),
                            'url': format_url(media_url),
                            'language': media.get('LANGUAGE'),
                            'ext': ext,
                            'protocol': entry_protocol,
                            'preference': preference,
                        }
                        if media_type == 'AUDIO':
                            f['vcodec'] = 'none'
                            if group_id and not audio_in_video_stream.get(group_id):
                                audio_in_video_stream[group_id] = False
                        formats.append(f)
                    else:
                        # When there is no URI in EXT-X-MEDIA let this tag's
                        # data be used by regular URI lines below
                        last_media = media
                        if media_type == 'AUDIO' and group_id:
                            audio_in_video_stream[group_id] = True
            elif line.startswith('#') or not line.strip():
                continue
            else:
                tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                # Although the specification does not mention a NAME attribute
                # for EXT-X-STREAM-INF, it still sometimes may be present
                stream_name = last_info.get('NAME') or last_media.get('NAME')
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                manifest_url = format_url(line.strip())
                f = {
                    'format_id': '-'.join(format_id),
                    'url': manifest_url,
                    'manifest_url': manifest_url,
                    'tbr': tbr,
                    'ext': ext,
                    'fps': float_or_none(last_info.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                resolution = last_info.get('RESOLUTION')
                if resolution:
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                    if mobj:
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform
                mobj = re.search(
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                if mobj:
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    f.update({
                        'vbr': vbr,
                        'abr': abr,
                    })
                f.update(parse_codecs(last_info.get('CODECS')))
                if audio_in_video_stream.get(last_info.get('AUDIO')) is False and f['vcodec'] != 'none':
                    # TODO: update acodec for audio only formats with the same GROUP-ID
                    f['acodec'] = 'none'
                formats.append(f)
                last_info = {}
                last_media = {}
        return formats
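
    # Sketch of a master playlist this method understands (contents invented):
    #     #EXTM3U
    #     #EXT-X-STREAM-INF:BANDWIDTH=1280000,RESOLUTION=640x360
    #     http://example.com/low.m3u8
    #     #EXT-X-STREAM-INF:BANDWIDTH=2560000,RESOLUTION=1280x720
    #     http://example.com/hi.m3u8
    # Each URI line becomes one format, with tbr taken from
    # AVERAGE-BANDWIDTH or BANDWIDTH (scaled to KBit/s) and width/height
    # parsed from RESOLUTION.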

    @staticmethod
    def _xpath_ns(path, namespace=None):
        if not namespace:
            return path
        out = []
        for c in path.split('/'):
            if not c or c == '.':
                out.append(c)
            else:
                out.append('{%s}%s' % (namespace, c))
        return '/'.join(out)
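
    # E.g. _xpath_ns('./head/meta', 'ns') returns './{ns}head/{ns}meta';
    # the leading '.' component is kept as-is.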
1435
1436 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1437 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1438
1439 if smil is False:
1440 assert not fatal
1441 return []
1442
1443 namespace = self._parse_smil_namespace(smil)
1444
1445 return self._parse_smil_formats(
1446 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1447
1448 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1449 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1450 if smil is False:
1451 return {}
1452 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1453
1454 def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1455 return self._download_xml(
1456 smil_url, video_id, 'Downloading SMIL file',
1457 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1458
1459 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1460 namespace = self._parse_smil_namespace(smil)
1461
1462 formats = self._parse_smil_formats(
1463 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1464 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1465
1466 video_id = os.path.splitext(url_basename(smil_url))[0]
1467 title = None
1468 description = None
1469 upload_date = None
1470 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1471 name = meta.attrib.get('name')
1472 content = meta.attrib.get('content')
1473 if not name or not content:
1474 continue
1475 if not title and name == 'title':
1476 title = content
1477 elif not description and name in ('description', 'abstract'):
1478 description = content
1479 elif not upload_date and name == 'date':
1480 upload_date = unified_strdate(content)
1481
1482 thumbnails = [{
1483 'id': image.get('type'),
1484 'url': image.get('src'),
1485 'width': int_or_none(image.get('width')),
1486 'height': int_or_none(image.get('height')),
1487 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1488
1489 return {
1490 'id': video_id,
1491 'title': title or video_id,
1492 'description': description,
1493 'upload_date': upload_date,
1494 'thumbnails': thumbnails,
1495 'formats': formats,
1496 'subtitles': subtitles,
1497 }
1498
1499 def _parse_smil_namespace(self, smil):
1500 return self._search_regex(
1501 r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)

    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        rtmp_count = 0
        http_count = 0
        m3u8_count = 0

        srcs = []
        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            if not src or src in srcs:
                continue
            srcs.append(src)

            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            src_ext = determine_ext(src)
            streamer = medium.get('streamer') or base

            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
                        'url': streamer,
                        'play_path': src,
                    })
                continue

            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()

            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                if len(m3u8_formats) == 1:
                    m3u8_count += 1
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                        'tbr': bitrate,
                        'width': width,
                        'height': height,
                    })
                formats.extend(m3u8_formats)
                continue

            if src_ext == 'f4m':
                f4m_url = src_url
                if not f4m_params:
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse_urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
                continue

            if src_url.startswith('http') and self._is_valid_url(src, video_id):
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                continue

        return formats

    def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
        urls = []
        subtitles = {}
        for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
            src = textstream.get('src')
            if not src or src in urls:
                continue
            urls.append(src)
            ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
            lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
            subtitles.setdefault(lang, []).append({
                'url': src,
                'ext': ext,
            })
        return subtitles

    def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
        xspf = self._download_xml(
            playlist_url, playlist_id, 'Downloading xspf playlist',
            'Unable to download xspf manifest', fatal=fatal)
        if xspf is False:
            return []
        return self._parse_xspf(xspf, playlist_id)

    def _parse_xspf(self, playlist, playlist_id):
        NS_MAP = {
            'xspf': 'http://xspf.org/ns/0/',
            's1': 'http://static.streamone.nl/player/ns/0',
        }

        entries = []
        for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
            title = xpath_text(
                track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
            description = xpath_text(
                track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
            thumbnail = xpath_text(
                track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
            duration = float_or_none(
                xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)

            formats = [{
                'url': location.text,
                'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
                'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
                'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
            } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
            self._sort_formats(formats)

            entries.append({
                'id': playlist_id,
                'title': title,
                'description': description,
                'thumbnail': thumbnail,
                'duration': duration,
                'formats': formats,
            })
        return entries

    def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
        res = self._download_webpage_handle(
            mpd_url, video_id,
            note=note or 'Downloading MPD manifest',
            errnote=errnote or 'Failed to download MPD manifest',
            fatal=fatal)
        if res is False:
            return []
        mpd, urlh = res
        mpd_base_url = base_url(urlh.geturl())

        return self._parse_mpd_formats(
            compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
            formats_dict=formats_dict, mpd_url=mpd_url)

    def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
        """
        Parse formats from MPD manifest.
        References:
         1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
            http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
         2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
        """
        if mpd_doc.get('type') == 'dynamic':
            return []

        namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)

        def _add_ns(path):
            return self._xpath_ns(path, namespace)

        def is_drm_protected(element):
            return element.find(_add_ns('ContentProtection')) is not None

        def extract_multisegment_info(element, ms_parent_info):
            ms_info = ms_parent_info.copy()

            # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
            # common attributes and elements. We only extract what is relevant
            # for us.
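            # For example (illustrative), <S t="0" d="4000" r="2"/> on a
            # SegmentTimeline describes 1 + r = 3 consecutive segments of
            # d / timescale seconds each.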
            def extract_common(source):
                segment_timeline = source.find(_add_ns('SegmentTimeline'))
                if segment_timeline is not None:
                    s_e = segment_timeline.findall(_add_ns('S'))
                    if s_e:
                        ms_info['total_number'] = 0
                        ms_info['s'] = []
                        for s in s_e:
                            r = int(s.get('r', 0))
                            ms_info['total_number'] += 1 + r
                            ms_info['s'].append({
                                't': int(s.get('t', 0)),
                                # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                                'd': int(s.attrib['d']),
                                'r': r,
                            })
                start_number = source.get('startNumber')
                if start_number:
                    ms_info['start_number'] = int(start_number)
                timescale = source.get('timescale')
                if timescale:
                    ms_info['timescale'] = int(timescale)
                segment_duration = source.get('duration')
                if segment_duration:
                    ms_info['segment_duration'] = int(segment_duration)

            def extract_Initialization(source):
                initialization = source.find(_add_ns('Initialization'))
                if initialization is not None:
                    ms_info['initialization_url'] = initialization.attrib['sourceURL']

            segment_list = element.find(_add_ns('SegmentList'))
            if segment_list is not None:
                extract_common(segment_list)
                extract_Initialization(segment_list)
                segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
                if segment_urls_e:
                    ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
            else:
                segment_template = element.find(_add_ns('SegmentTemplate'))
                if segment_template is not None:
                    extract_common(segment_template)
                    media = segment_template.get('media')
                    if media:
                        ms_info['media'] = media
                    initialization = segment_template.get('initialization')
                    if initialization:
                        ms_info['initialization'] = initialization
                    else:
                        extract_Initialization(segment_template)
            return ms_info

        mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
        formats = []
        for period in mpd_doc.findall(_add_ns('Period')):
            period_duration = parse_duration(period.get('duration')) or mpd_duration
            period_ms_info = extract_multisegment_info(period, {
                'start_number': 1,
                'timescale': 1,
            })
            for adaptation_set in period.findall(_add_ns('AdaptationSet')):
                if is_drm_protected(adaptation_set):
                    continue
                adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
                for representation in adaptation_set.findall(_add_ns('Representation')):
                    if is_drm_protected(representation):
                        continue
                    representation_attrib = adaptation_set.attrib.copy()
                    representation_attrib.update(representation.attrib)
                    # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
                    mime_type = representation_attrib['mimeType']
                    content_type = mime_type.split('/')[0]
                    if content_type == 'text':
                        # TODO implement WebVTT downloading
                        pass
                    elif content_type == 'video' or content_type == 'audio':
                        base_url = ''
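                        # BaseURL may appear on Representation, AdaptationSet,
                        # Period and MPD level; relative values are concatenated
                        # innermost-first until an absolute URL is obtained.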
                        for element in (representation, adaptation_set, period, mpd_doc):
                            base_url_e = element.find(_add_ns('BaseURL'))
                            if base_url_e is not None:
                                base_url = base_url_e.text + base_url
                                if re.match(r'^https?://', base_url):
                                    break
                        if mpd_base_url and not re.match(r'^https?://', base_url):
                            if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
                                mpd_base_url += '/'
                            base_url = mpd_base_url + base_url
                        representation_id = representation_attrib.get('id')
                        lang = representation_attrib.get('lang')
                        url_el = representation.find(_add_ns('BaseURL'))
                        filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                        bandwidth = int_or_none(representation_attrib.get('bandwidth'))
                        f = {
                            'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
                            'url': base_url,
                            'manifest_url': mpd_url,
                            'ext': mimetype2ext(mime_type),
                            'width': int_or_none(representation_attrib.get('width')),
                            'height': int_or_none(representation_attrib.get('height')),
                            'tbr': int_or_none(bandwidth, 1000),
                            'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                            'fps': int_or_none(representation_attrib.get('frameRate')),
                            'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                            'format_note': 'DASH %s' % content_type,
                            'filesize': filesize,
                        }
                        f.update(parse_codecs(representation_attrib.get('codecs')))
                        representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)

                        def prepare_template(template_name, identifiers):
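                            # Turn a DASH identifier template into a Python
                            # string-format template, e.g. (illustrative):
                            #   'seg-$Number%05d$.m4s' -> 'seg-%(Number)05d.m4s'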
                            t = representation_ms_info[template_name]
                            t = t.replace('$RepresentationID$', representation_id)
                            t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
                            t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
                            # str.replace returns a new string, so the result
                            # must be assigned back
                            t = t.replace('$$', '$')
                            return t

                        # @initialization is a regular template just like @media,
                        # so it should be handled the same way (see
                        # https://github.com/rg3/youtube-dl/issues/11605)
                        if 'initialization' in representation_ms_info:
                            initialization_template = prepare_template(
                                'initialization',
                                # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
                                # $Time$ shall not be included for @initialization thus
                                # only $Bandwidth$ remains
                                ('Bandwidth', ))
                            representation_ms_info['initialization_url'] = initialization_template % {
                                'Bandwidth': bandwidth,
                            }

                        if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:

                            media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))

                            # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                            # can't be used at the same time
                            if '%(Number' in media_template and 's' not in representation_ms_info:
                                segment_duration = None
                                # the bare string 'segment_duration' was always
                                # truthy; an explicit membership test is intended
                                if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
                                    segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
                                    representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                                representation_ms_info['fragments'] = [{
                                    'url': media_template % {
                                        'Number': segment_number,
                                        'Bandwidth': bandwidth,
                                    },
                                    'duration': segment_duration,
                                } for segment_number in range(
                                    representation_ms_info['start_number'],
                                    representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                            else:
                                # $Number*$ or $Time$ in media template with S list available
                                # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
                                # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
                                representation_ms_info['fragments'] = []
                                segment_time = 0
                                segment_d = None
                                segment_number = representation_ms_info['start_number']

                                def add_segment_url():
                                    segment_url = media_template % {
                                        'Time': segment_time,
                                        'Bandwidth': bandwidth,
                                        'Number': segment_number,
                                    }
                                    representation_ms_info['fragments'].append({
                                        'url': segment_url,
                                        'duration': float_or_none(segment_d, representation_ms_info['timescale']),
                                    })

                                for num, s in enumerate(representation_ms_info['s']):
                                    segment_time = s.get('t') or segment_time
                                    segment_d = s['d']
                                    add_segment_url()
                                    segment_number += 1
                                    for r in range(s.get('r', 0)):
                                        segment_time += segment_d
                                        add_segment_url()
                                        segment_number += 1
                                    segment_time += segment_d
                        elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
                            # No media template
                            # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
                            # or any YouTube dashsegments video
                            fragments = []
                            segment_index = 0
                            timescale = representation_ms_info['timescale']
                            for s in representation_ms_info['s']:
                                duration = float_or_none(s['d'], timescale)
                                for r in range(s.get('r', 0) + 1):
                                    fragments.append({
                                        'url': representation_ms_info['segment_urls'][segment_index],
                                        'duration': duration,
                                    })
                                    segment_index += 1
                            representation_ms_info['fragments'] = fragments
                        # NB: MPD manifest may contain direct URLs to unfragmented media.
                        # No fragments key is present in this case.
                        if 'fragments' in representation_ms_info:
                            f.update({
                                'fragments': [],
                                'protocol': 'http_dash_segments',
                            })
                            if 'initialization_url' in representation_ms_info:
                                initialization_url = representation_ms_info['initialization_url']
                                if not f.get('url'):
                                    f['url'] = initialization_url
                                f['fragments'].append({'url': initialization_url})
                            f['fragments'].extend(representation_ms_info['fragments'])
                            for fragment in f['fragments']:
                                fragment['url'] = urljoin(base_url, fragment['url'])
                        try:
                            existing_format = next(
                                fo for fo in formats
                                if fo['format_id'] == representation_id)
                        except StopIteration:
                            full_info = formats_dict.get(representation_id, {}).copy()
                            full_info.update(f)
                            formats.append(full_info)
                        else:
                            existing_format.update(f)
                    else:
                        self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
        return formats

    def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
        res = self._download_webpage_handle(
            ism_url, video_id,
            note=note or 'Downloading ISM manifest',
            errnote=errnote or 'Failed to download ISM manifest',
            fatal=fatal)
        if res is False:
            return []
        ism, urlh = res

        return self._parse_ism_formats(
            compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)

    def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
        if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
            return []

        duration = int(ism_doc.attrib['Duration'])
        timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000

        formats = []
        for stream in ism_doc.findall('StreamIndex'):
            stream_type = stream.get('Type')
            if stream_type not in ('video', 'audio'):
                continue
            url_pattern = stream.attrib['Url']
            stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
            stream_name = stream.get('Name')
            for track in stream.findall('QualityLevel'):
                fourcc = track.get('FourCC')
                # TODO: add support for WVC1 and WMAP
                if fourcc not in ('H264', 'AVC1', 'AACL'):
                    self.report_warning('%s is not a supported codec' % fourcc)
                    continue
                tbr = int(track.attrib['Bitrate']) // 1000
                width = int_or_none(track.get('MaxWidth'))
                height = int_or_none(track.get('MaxHeight'))
                sampling_rate = int_or_none(track.get('SamplingRate'))

                track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
                track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)

                fragments = []
                fragment_ctx = {
                    'time': 0,
                }
                stream_fragments = stream.findall('c')
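                # Each <c> element places fragments on the timeline, e.g.
                # (illustrative) <c t="0" d="20000000" r="2"/> with the default
                # timescale of 10000000 describes two 2-second fragments.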
                for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
                    fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
                    fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
                    fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
                    if not fragment_ctx['duration']:
                        try:
                            # look up the start time of the next <c> element in
                            # the stream, not a child of the current element
                            next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t'])
                        except IndexError:
                            next_fragment_time = duration
                        fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
                    for _ in range(fragment_repeat):
                        fragments.append({
                            'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
                            'duration': fragment_ctx['duration'] / stream_timescale,
                        })
                        fragment_ctx['time'] += fragment_ctx['duration']

                format_id = []
                if ism_id:
                    format_id.append(ism_id)
                if stream_name:
                    format_id.append(stream_name)
                format_id.append(compat_str(tbr))

                formats.append({
                    'format_id': '-'.join(format_id),
                    'url': ism_url,
                    'manifest_url': ism_url,
                    'ext': 'ismv' if stream_type == 'video' else 'isma',
                    'width': width,
                    'height': height,
                    'tbr': tbr,
                    'asr': sampling_rate,
                    'vcodec': 'none' if stream_type == 'audio' else fourcc,
                    'acodec': 'none' if stream_type == 'video' else fourcc,
                    'protocol': 'ism',
                    'fragments': fragments,
                    '_download_params': {
                        'duration': duration,
                        'timescale': stream_timescale,
                        'width': width or 0,
                        'height': height or 0,
                        'fourcc': fourcc,
                        'codec_private_data': track.get('CodecPrivateData'),
                        'sampling_rate': sampling_rate,
                        'channels': int_or_none(track.get('Channels', 2)),
                        'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
                        'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
                    },
                })
        return formats

    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
        def absolute_url(video_url):
            return compat_urlparse.urljoin(base_url, video_url)

        def parse_content_type(content_type):
            if not content_type:
                return {}
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
            if ctr:
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
                return f
            return {}
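        # For illustration, parse_content_type('video/mp4; codecs="avc1.4d401e, mp4a.40.2"')
        # would yield {'ext': 'mp4', 'vcodec': 'avc1.4d401e', 'acodec': 'mp4a.40.2'}.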

        def _media_formats(src, cur_media_type):
            full_url = absolute_url(src)
            ext = determine_ext(full_url)
            if ext == 'm3u8':
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference)
            elif ext == 'mpd':
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id)
            else:
                is_plain_url = True
                formats = [{
                    'url': full_url,
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
                }]
            return is_plain_url, formats

        entries = []
        media_tags = [(media_tag, media_type, '')
                      for media_tag, media_type
                      in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may cause a significant slowdown (see
            # https://github.com/rg3/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>video|audio)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
        for media_tag, media_type, media_content in media_tags:
            media_info = {
                'formats': [],
                'subtitles': {},
            }
            media_attributes = extract_attributes(media_tag)
            src = media_attributes.get('src')
            if src:
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = media_attributes.get('poster')
            if media_content:
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    source_attributes = extract_attributes(source_tag)
                    src = source_attributes.get('src')
                    if not src:
                        continue
                    is_plain_url, formats = _media_formats(src, media_type)
                    if is_plain_url:
                        f = parse_content_type(source_attributes.get('type'))
                        f.update(formats[0])
                        media_info['formats'].append(f)
                    else:
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = track_attributes.get('src')
                        if not src:
                            continue
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
                        })
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
        return entries

    def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
        formats = []
        hdcore_sign = 'hdcore=3.7.0'
        # [^/]+ matches the whole host part (the companion m3u8 substitution
        # below already used this form)
        f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
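        # e.g. (illustrative) 'https://example.akamaihd.net/i/foo/master.m3u8'
        # becomes 'https://example.akamaihd.net/z/foo/manifest.f4m'.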
        hds_host = hosts.get('hds')
        if hds_host:
            f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
        if 'hdcore=' not in f4m_url:
            f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
        f4m_formats = self._extract_f4m_formats(
            f4m_url, video_id, f4m_id='hds', fatal=False)
        for entry in f4m_formats:
            entry.update({'extra_param_to_segment_url': hdcore_sign})
        formats.extend(f4m_formats)
        m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
        hls_host = hosts.get('hls')
        if hls_host:
            m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
        formats.extend(self._extract_m3u8_formats(
            m3u8_url, video_id, 'mp4', 'm3u8_native',
            m3u8_id='hls', fatal=False))
        return formats

    def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
        url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
        url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url')
        http_base_url = 'http' + url_base
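        # e.g. (illustrative) url 'rtmp://example.com/vod/mp4:clip' gives
        # url_base '://example.com/vod/mp4:clip' and
        # http_base_url 'http://example.com/vod/mp4:clip'.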
        formats = []
        if 'm3u8' not in skip_protocols:
            formats.extend(self._extract_m3u8_formats(
                http_base_url + '/playlist.m3u8', video_id, 'mp4',
                m3u8_entry_protocol, m3u8_id='hls', fatal=False))
        if 'f4m' not in skip_protocols:
            formats.extend(self._extract_f4m_formats(
                http_base_url + '/manifest.f4m',
                video_id, f4m_id='hds', fatal=False))
        if 'dash' not in skip_protocols:
            formats.extend(self._extract_mpd_formats(
                http_base_url + '/manifest.mpd',
                video_id, mpd_id='dash', fatal=False))
        if re.search(r'(?:/smil:|\.smil)', url_base):
            if 'smil' not in skip_protocols:
                rtmp_formats = self._extract_smil_formats(
                    http_base_url + '/jwplayer.smil',
                    video_id, fatal=False)
                for rtmp_format in rtmp_formats:
                    rtsp_format = rtmp_format.copy()
                    rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
                    del rtsp_format['play_path']
                    del rtsp_format['ext']
                    rtsp_format.update({
                        'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
                        'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
                        'protocol': 'rtsp',
                    })
                    formats.extend([rtmp_format, rtsp_format])
        else:
            for protocol in ('rtmp', 'rtsp'):
                if protocol not in skip_protocols:
                    formats.append({
                        'url': protocol + url_base,
                        'format_id': protocol,
                        'protocol': protocol,
                    })
        return formats

    def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
        mobj = re.search(
            r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P<options>[^)]+)\)',
            webpage)
        if mobj:
            try:
                jwplayer_data = self._parse_json(mobj.group('options'),
                                                 video_id=video_id,
                                                 transform_source=transform_source)
            except ExtractorError:
                pass
            else:
                if isinstance(jwplayer_data, dict):
                    return jwplayer_data

    def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
        jwplayer_data = self._find_jwplayer_data(
            webpage, video_id, transform_source=js_to_json)
        return self._parse_jwplayer_data(
            jwplayer_data, video_id, *args, **kwargs)

    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        entries = []

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
            self._sort_formats(formats)

            subtitles = {}
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if track.get('kind') != 'captions':
                        continue
                    track_url = urljoin(base_url, track.get('file'))
                    if not track_url:
                        continue
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                    })

            entries.append({
                'id': this_video_id,
                'title': video_data['title'] if require_title else video_data.get('title'),
                'description': video_data.get('description'),
                'thumbnail': self._proto_relative_url(video_data.get('image')),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
                'formats': formats,
            })
        if len(entries) == 1:
            return entries[0]
        else:
            return self.playlist_result(entries)

    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        formats = []
        for source in jwplayer_sources_data:
            source_url = self._proto_relative_url(source['file'])
            if base_url:
                source_url = compat_urlparse.urljoin(base_url, source_url)
            source_type = source.get('type') or ''
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
            elif ext == 'smil':
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                formats.append({
                    'url': source_url,
                    'vcodec': 'none',
                    'ext': ext,
                })
            else:
                height = int_or_none(source.get('height'))
                if height is None:
                    # Often no height is provided but there is a label in a
                    # format like "1080p", "720p SD", or 1080.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
                        'height', default=None))
                a_format = {
                    'url': source_url,
                    'width': int_or_none(source.get('width')),
                    'height': height,
                    'tbr': int_or_none(source.get('bitrate')),
                    'ext': ext,
                }
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
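                    # e.g. (illustrative) 'rtmp://example.com/app/mp4:videos/clip.mp4'
                    # splits into url 'rtmp://example.com/app/' and
                    # play_path 'mp4:videos/clip.mp4'.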
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        rtmp_url, prefix, play_path = rtmp_url_parts
                        a_format.update({
                            'url': rtmp_url,
                            'play_path': prefix + play_path,
                        })
                    if rtmp_params:
                        a_format.update(rtmp_params)
                formats.append(a_format)
        return formats

    def _live_title(self, name):
        """ Generate the title for a live video """
        now = datetime.datetime.now()
        now_str = now.strftime('%Y-%m-%d %H:%M')
        return name + ' ' + now_str

    def _int(self, v, name, fatal=False, **kwargs):
        res = int_or_none(v, **kwargs)
        if res is None:
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
            if fatal:
                raise ExtractorError(msg)
            else:
                self._downloader.report_warning(msg)
        return res

    def _float(self, v, name, fatal=False, **kwargs):
        res = float_or_none(v, **kwargs)
        if res is None:
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
            if fatal:
                raise ExtractorError(msg)
            else:
                self._downloader.report_warning(msg)
        return res

    def _set_cookie(self, domain, name, value, expire_time=None):
        cookie = compat_cookiejar.Cookie(
            0, name, value, None, None, domain, None,
            None, '/', True, False, expire_time, '', None, None, None)
        self._downloader.cookiejar.set_cookie(cookie)

    def _get_cookies(self, url):
        """ Return a compat_cookies.SimpleCookie with the cookies for the url """
        req = sanitized_Request(url)
        self._downloader.cookiejar.add_cookie_header(req)
        return compat_cookies.SimpleCookie(req.get_header('Cookie'))

    def get_testcases(self, include_onlymatching=False):
        t = getattr(self, '_TEST', None)
        if t:
            assert not hasattr(self, '_TESTS'), \
                '%s has _TEST and _TESTS' % type(self).__name__
            tests = [t]
        else:
            tests = getattr(self, '_TESTS', [])
        for t in tests:
            if not include_onlymatching and t.get('only_matching', False):
                continue
            t['name'] = type(self).__name__[:-len('IE')]
            yield t

    def is_suitable(self, age_limit):
        """ Test whether the extractor is generally suitable for the given
        age limit (i.e. pornographic sites are not, all others usually are) """

        any_restricted = False
        for tc in self.get_testcases(include_onlymatching=False):
            if tc.get('playlist', []):
                tc = tc['playlist'][0]
            is_restricted = age_restricted(
                tc.get('info_dict', {}).get('age_limit'), age_limit)
            if not is_restricted:
                return True
            any_restricted = any_restricted or is_restricted
        return not any_restricted

    def extract_subtitles(self, *args, **kwargs):
        if (self._downloader.params.get('writesubtitles', False) or
                self._downloader.params.get('listsubtitles')):
            return self._get_subtitles(*args, **kwargs)
        return {}

    def _get_subtitles(self, *args, **kwargs):
        raise NotImplementedError('This method must be implemented by subclasses')

    @staticmethod
    def _merge_subtitle_items(subtitle_list1, subtitle_list2):
        """ Merge subtitle items for one language. Items with duplicated URLs
        will be dropped. """
        list1_urls = set([item['url'] for item in subtitle_list1])
        ret = list(subtitle_list1)
        ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
        return ret

    @classmethod
    def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
        """ Merge two subtitle dictionaries, language by language. """
        ret = dict(subtitle_dict1)
        for lang in subtitle_dict2:
            ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
        return ret

    def extract_automatic_captions(self, *args, **kwargs):
        if (self._downloader.params.get('writeautomaticsub', False) or
                self._downloader.params.get('listsubtitles')):
            return self._get_automatic_captions(*args, **kwargs)
        return {}

    def _get_automatic_captions(self, *args, **kwargs):
        raise NotImplementedError('This method must be implemented by subclasses')

    def mark_watched(self, *args, **kwargs):
        if (self._downloader.params.get('mark_watched', False) and
                (self._get_login_info()[0] is not None or
                 self._downloader.params.get('cookiefile') is not None)):
            self._mark_watched(*args, **kwargs)

    def _mark_watched(self, *args, **kwargs):
        raise NotImplementedError('This method must be implemented by subclasses')

    def geo_verification_headers(self):
        headers = {}
        geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
        if geo_verification_proxy:
            headers['Ytdl-request-proxy'] = geo_verification_proxy
        return headers

    def _generic_id(self, url):
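        # e.g. (illustrative) 'http://example.com/videos/some%20clip.mp4/' -> 'some clip'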
        return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])

    def _generic_title(self, url):
        return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])


class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search query extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
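        # e.g. (illustrative) with _SEARCH_KEY = 'ytsearch' this matches
        # 'ytsearch:cats', 'ytsearch5:cats' and 'ytsearchall:cats'.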
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY