]> Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/common.py
debian/control: Remove trailing whitespace at EOF.
[youtubedl] / youtube_dl / extractor / common.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import base64
5 import datetime
6 import hashlib
7 import json
8 import netrc
9 import os
10 import random
11 import re
12 import socket
13 import sys
14 import time
15 import math
16
17 from ..compat import (
18 compat_cookiejar,
19 compat_cookies,
20 compat_etree_fromstring,
21 compat_getpass,
22 compat_http_client,
23 compat_os_name,
24 compat_str,
25 compat_urllib_error,
26 compat_urllib_parse_unquote,
27 compat_urllib_parse_urlencode,
28 compat_urllib_request,
29 compat_urlparse,
30 compat_xml_parse_error,
31 )
32 from ..downloader.f4m import (
33 get_base_url,
34 remove_encrypted_media,
35 )
36 from ..utils import (
37 NO_DEFAULT,
38 age_restricted,
39 base_url,
40 bug_reports_message,
41 clean_html,
42 compiled_regex_type,
43 determine_ext,
44 determine_protocol,
45 error_to_compat_str,
46 ExtractorError,
47 extract_attributes,
48 fix_xml_ampersands,
49 float_or_none,
50 GeoRestrictedError,
51 GeoUtils,
52 int_or_none,
53 js_to_json,
54 mimetype2ext,
55 orderedSet,
56 parse_codecs,
57 parse_duration,
58 parse_iso8601,
59 parse_m3u8_attributes,
60 RegexNotFoundError,
61 sanitized_Request,
62 sanitize_filename,
63 unescapeHTML,
64 unified_strdate,
65 unified_timestamp,
66 update_Request,
67 update_url_query,
68 urljoin,
69 url_basename,
70 xpath_element,
71 xpath_text,
72 xpath_with_ns,
73 )
74
75
76 class InfoExtractor(object):
77 """Information Extractor class.
78
79 Information extractors are the classes that, given a URL, extract
80 information about the video (or videos) the URL refers to. This
81 information includes the real video URL, the video title, author and
82 others. The information is stored in a dictionary which is then
83 passed to the YoutubeDL. The YoutubeDL processes this
84 information possibly downloading the video to the file system, among
85 other possible outcomes.
86
87 The type field determines the type of the result.
88 By far the most common value (and the default if _type is missing) is
89 "video", which indicates a single video.
90
91 For a video, the dictionaries must include the following fields:
92
93 id: Video identifier.
94 title: Video title, unescaped.
95
96 Additionally, it must contain either a formats entry or a url one:
97
98 formats: A list of dictionaries for each format available, ordered
99 from worst to best quality.
100
101 Potential fields:
102 * url Mandatory. The URL of the video file
103 * manifest_url
104 The URL of the manifest file in case of
105 fragmented media (DASH, hls, hds)
106 * ext Will be calculated from URL if missing
107 * format A human-readable description of the format
108 ("mp4 container with h264/opus").
109 Calculated from the format_id, width, height,
110 and format_note fields if missing.
111 * format_id A short description of the format
112 ("mp4_h264_opus" or "19").
113 Technically optional, but strongly recommended.
114 * format_note Additional info about the format
115 ("3D" or "DASH video")
116 * width Width of the video, if known
117 * height Height of the video, if known
118 * resolution Textual description of width and height
119 * tbr Average bitrate of audio and video in KBit/s
120 * abr Average audio bitrate in KBit/s
121 * acodec Name of the audio codec in use
122 * asr Audio sampling rate in Hertz
123 * vbr Average video bitrate in KBit/s
124 * fps Frame rate
125 * vcodec Name of the video codec in use
126 * container Name of the container format
127 * filesize The number of bytes, if known in advance
128 * filesize_approx An estimate for the number of bytes
129 * player_url SWF Player URL (used for rtmpdump).
130 * protocol The protocol that will be used for the actual
131 download, lower-case.
132 "http", "https", "rtsp", "rtmp", "rtmpe",
133 "m3u8", "m3u8_native" or "http_dash_segments".
134 * fragment_base_url
135 Base URL for fragments. Each fragment's path
136 value (if present) will be relative to
137 this URL.
138 * fragments A list of fragments of a fragmented media.
139 Each fragment entry must contain either an url
140 or a path. If an url is present it should be
141 considered by a client. Otherwise both path and
142 fragment_base_url must be present. Here is
143 the list of all potential fields:
144 * "url" - fragment's URL
145 * "path" - fragment's path relative to
146 fragment_base_url
147 * "duration" (optional, int or float)
148 * "filesize" (optional, int)
149 * preference Order number of this format. If this field is
150 present and not None, the formats get sorted
151 by this field, regardless of all other values.
152 -1 for default (order by other properties),
153 -2 or smaller for less than default.
154 < -1000 to hide the format (if there is
155 another one which is strictly better)
156 * language Language code, e.g. "de" or "en-US".
157 * language_preference Is this in the language mentioned in
158 the URL?
159 10 if it's what the URL is about,
160 -1 for default (don't know),
161 -10 otherwise, other values reserved for now.
162 * quality Order number of the video quality of this
163 format, irrespective of the file format.
164 -1 for default (order by other properties),
165 -2 or smaller for less than default.
166 * source_preference Order number for this video source
167 (quality takes higher priority)
168 -1 for default (order by other properties),
169 -2 or smaller for less than default.
170 * http_headers A dictionary of additional HTTP headers
171 to add to the request.
172 * stretched_ratio If given and not 1, indicates that the
173 video's pixels are not square.
174 width : height ratio as float.
175 * no_resume The server does not support resuming the
176 (HTTP or RTMP) download. Boolean.
177 * downloader_options A dictionary of downloader options as
178 described in FileDownloader
179
180 url: Final video URL.
181 ext: Video filename extension.
182 format: The video format, defaults to ext (used for --get-format)
183 player_url: SWF Player URL (used for rtmpdump).
184
185 The following fields are optional:
186
187 alt_title: A secondary title of the video.
188 display_id An alternative identifier for the video, not necessarily
189 unique, but available before title. Typically, id is
190 something like "4234987", title "Dancing naked mole rats",
191 and display_id "dancing-naked-mole-rats"
192 thumbnails: A list of dictionaries, with the following entries:
193 * "id" (optional, string) - Thumbnail format ID
194 * "url"
195 * "preference" (optional, int) - quality of the image
196 * "width" (optional, int)
197 * "height" (optional, int)
198 * "resolution" (optional, string "{width}x{height}",
199 deprecated)
200 * "filesize" (optional, int)
201 thumbnail: Full URL to a video thumbnail image.
202 description: Full video description.
203 uploader: Full name of the video uploader.
204 license: License name the video is licensed under.
205 creator: The creator of the video.
206 release_date: The date (YYYYMMDD) when the video was released.
207 timestamp: UNIX timestamp of the moment the video became available.
208 upload_date: Video upload date (YYYYMMDD).
209 If not explicitly set, calculated from timestamp.
210 uploader_id: Nickname or id of the video uploader.
211 uploader_url: Full URL to a personal webpage of the video uploader.
212 location: Physical location where the video was filmed.
213 subtitles: The available subtitles as a dictionary in the format
214 {tag: subformats}. "tag" is usually a language code, and
215 "subformats" is a list sorted from lower to higher
216 preference, each element is a dictionary with the "ext"
217 entry and one of:
218 * "data": The subtitles file contents
219 * "url": A URL pointing to the subtitles file
220 "ext" will be calculated from URL if missing
221 automatic_captions: Like 'subtitles', used by the YoutubeIE for
222 automatically generated captions
223 duration: Length of the video in seconds, as an integer or float.
224 view_count: How many users have watched the video on the platform.
225 like_count: Number of positive ratings of the video
226 dislike_count: Number of negative ratings of the video
227 repost_count: Number of reposts of the video
228 average_rating: Average rating given by users, the scale used depends on the webpage
229 comment_count: Number of comments on the video
230 comments: A list of comments, each with one or more of the following
231 properties (all but one of text or html optional):
232 * "author" - human-readable name of the comment author
233 * "author_id" - user ID of the comment author
234 * "id" - Comment ID
235 * "html" - Comment as HTML
236 * "text" - Plain text of the comment
237 * "timestamp" - UNIX timestamp of comment
238 * "parent" - ID of the comment this one is replying to.
239 Set to "root" to indicate that this is a
240 comment to the original video.
241 age_limit: Age restriction for the video, as an integer (years)
242 webpage_url: The URL to the video webpage, if given to youtube-dl it
243 should allow to get the same result again. (It will be set
244 by YoutubeDL if it's missing)
245 categories: A list of categories that the video falls in, for example
246 ["Sports", "Berlin"]
247 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
248 is_live: True, False, or None (=unknown). Whether this video is a
249 live stream that goes on instead of a fixed-length video.
250 start_time: Time in seconds where the reproduction should start, as
251 specified in the URL.
252 end_time: Time in seconds where the reproduction should end, as
253 specified in the URL.
254 chapters: A list of dictionaries, with the following entries:
255 * "start_time" - The start time of the chapter in seconds
256 * "end_time" - The end time of the chapter in seconds
257 * "title" (optional, string)
258
259 The following fields should only be used when the video belongs to some logical
260 chapter or section:
261
262 chapter: Name or title of the chapter the video belongs to.
263 chapter_number: Number of the chapter the video belongs to, as an integer.
264 chapter_id: Id of the chapter the video belongs to, as a unicode string.
265
266 The following fields should only be used when the video is an episode of some
267 series, programme or podcast:
268
269 series: Title of the series or programme the video episode belongs to.
270 season: Title of the season the video episode belongs to.
271 season_number: Number of the season the video episode belongs to, as an integer.
272 season_id: Id of the season the video episode belongs to, as a unicode string.
273 episode: Title of the video episode. Unlike mandatory video title field,
274 this field should denote the exact title of the video episode
275 without any kind of decoration.
276 episode_number: Number of the video episode within a season, as an integer.
277 episode_id: Id of the video episode, as a unicode string.
278
279 The following fields should only be used when the media is a track or a part of
280 a music album:
281
282 track: Title of the track.
283 track_number: Number of the track within an album or a disc, as an integer.
284 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
285 as a unicode string.
286 artist: Artist(s) of the track.
287 genre: Genre(s) of the track.
288 album: Title of the album the track belongs to.
289 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
290 album_artist: List of all artists that appeared on the album (e.g.
291 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
292 and compilations).
293 disc_number: Number of the disc or other physical medium the track belongs to,
294 as an integer.
295 release_year: Year (YYYY) when the album was released.
296
297 Unless mentioned otherwise, the fields should be Unicode strings.
298
299 Unless mentioned otherwise, None is equivalent to absence of information.
300
301
302 _type "playlist" indicates multiple videos.
303 There must be a key "entries", which is a list, an iterable, or a PagedList
304 object, each element of which is a valid dictionary by this specification.
305
306 Additionally, playlists can have "id", "title", "description", "uploader",
307 "uploader_id", "uploader_url" attributes with the same semantics as videos
308 (see above).
309
310
311 _type "multi_video" indicates that there are multiple videos that
312 form a single show, for example multiple acts of an opera or TV episode.
313 It must have an entries key like a playlist and contain all the keys
314 required for a video at the same time.
315
316
317 _type "url" indicates that the video must be extracted from another
318 location, possibly by a different extractor. Its only required key is:
319 "url" - the next URL to extract.
320 The key "ie_key" can be set to the class name (minus the trailing "IE",
321 e.g. "Youtube") if the extractor class is known in advance.
322 Additionally, the dictionary may have any properties of the resolved entity
323 known in advance, for example "title" if the title of the referred video is
324 known ahead of time.
325
326
327 _type "url_transparent" entities have the same specification as "url", but
328 indicate that the given additional information is more precise than the one
329 associated with the resolved URL.
330 This is useful when a site employs a video service that hosts the video and
331 its technical metadata, but that video service does not embed a useful
332 title, description etc.
333
334
335 Subclasses of this one should re-define the _real_initialize() and
336 _real_extract() methods and define a _VALID_URL regexp.
337 Probably, they should also be added to the list of extractors.
338
339 _GEO_BYPASS attribute may be set to False in order to disable
340 geo restriction bypass mechanisms for a particular extractor.
341 Though it won't disable explicit geo restriction bypass based on
342 country code provided with geo_bypass_country.
343
344 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
345 countries for this extractor. One of these countries will be used by
346 geo restriction bypass mechanism right away in order to bypass
347 geo restriction, of course, if the mechanism is not disabled.
348
349 _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
350 IP blocks in CIDR notation for this extractor. One of these IP blocks
351 will be used by geo restriction bypass mechanism similarly
352 to _GEO_COUNTRIES.
353
354 Finally, the _WORKING attribute should be set to False for broken IEs
355 in order to warn the users and skip the tests.
356 """
357
# Becomes True once initialize() has run _real_initialize()
_ready = False
# Downloader object, assigned through set_downloader()
_downloader = None
# Fake IP sent as the X-Forwarded-For header; None while no geo bypass is active
_x_forwarded_for_ip = None
# Set to False in a subclass to disable the geo restriction bypass mechanisms
_GEO_BYPASS = True
# Optional list of presumably geo unrestricted countries for this extractor
_GEO_COUNTRIES = None
# Optional list of presumably geo unrestricted IP blocks in CIDR notation
_GEO_IP_BLOCKS = None
# Set to False for broken IEs
_WORKING = True
365
def __init__(self, downloader=None):
    """Create the extractor and attach the (optional) downloader.

    A fresh instance is not initialized yet and has no fake
    X-Forwarded-For IP assigned.
    """
    self._x_forwarded_for_ip = None
    self._ready = False
    self.set_downloader(downloader)
371
@classmethod
def suitable(cls, url):
    """Return True if url matches this IE's _VALID_URL, False otherwise."""

    # cls.__dict__ is inspected on purpose instead of hasattr/getattr: the
    # compiled regexp has to be cached on *this* class, and attribute lookup
    # would also find the one cached on a superclass.
    if '_VALID_URL_RE' in cls.__dict__:
        url_re = cls._VALID_URL_RE
    else:
        url_re = cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    return url_re.match(url) is not None
382
@classmethod
def _match_id(cls, url):
    """Return the 'id' group of _VALID_URL matched against url.

    The compiled regexp is cached on the class itself (not on a
    superclass). The URL is asserted to match.
    """
    if '_VALID_URL_RE' not in cls.__dict__:
        cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    mobj = cls._VALID_URL_RE.match(url)
    assert mobj
    video_id = mobj.group('id')
    return compat_str(video_id)
390
@classmethod
def working(cls):
    """Return the value of the class' _WORKING flag."""
    is_working = cls._WORKING
    return is_working
395
def initialize(self):
    """Prepare the instance for extraction (authentication, etc).

    Geo bypass is (re)initialized from _GEO_COUNTRIES and _GEO_IP_BLOCKS
    on every call; _real_initialize() runs only the first time.
    """
    geo_context = {
        'countries': self._GEO_COUNTRIES,
        'ip_blocks': self._GEO_IP_BLOCKS,
    }
    self._initialize_geo_bypass(geo_context)
    if self._ready:
        return
    self._real_initialize()
    self._ready = True
405
406 def _initialize_geo_bypass(self, geo_bypass_context):
407 """
408 Initialize geo restriction bypass mechanism.
409
410 This method is used to initialize geo bypass mechanism based on faking
411 X-Forwarded-For HTTP header. A random country from provided country list
412 is selected and a random IP belonging to this country is generated. This
413 IP will be passed as X-Forwarded-For HTTP header in all subsequent
414 HTTP requests.
415
416 This method will be used for initial geo bypass mechanism initialization
417 during the instance initialization with _GEO_COUNTRIES and
418 _GEO_IP_BLOCKS.
419
420 You may also manually call it from extractor's code if geo bypass
421 information is not available beforehand (e.g. obtained during
422 extraction) or due to some other reason. In this case you should pass
423 this information in geo bypass context passed as first argument. It may
424 contain following fields:
425
426 countries: List of geo unrestricted countries (similar
427 to _GEO_COUNTRIES)
428 ip_blocks: List of geo unrestricted IP blocks in CIDR notation
429 (similar to _GEO_IP_BLOCKS)
430
431 """
432 if not self._x_forwarded_for_ip:
433
434 # Geo bypass mechanism is explicitly disabled by user
435 if not self._downloader.params.get('geo_bypass', True):
436 return
437
438 if not geo_bypass_context:
439 geo_bypass_context = {}
440
441 # Backward compatibility: previously _initialize_geo_bypass
442 # expected a list of countries, some 3rd party code may still use
443 # it this way
444 if isinstance(geo_bypass_context, (list, tuple)):
445 geo_bypass_context = {
446 'countries': geo_bypass_context,
447 }
448
449 # The whole point of geo bypass mechanism is to fake IP
450 # as X-Forwarded-For HTTP header based on some IP block or
451 # country code.
452
453 # Path 1: bypassing based on IP block in CIDR notation
454
455 # Explicit IP block specified by user, use it right away
456 # regardless of whether extractor is geo bypassable or not
457 ip_block = self._downloader.params.get('geo_bypass_ip_block', None)
458
459 # Otherwise use random IP block from geo bypass context but only
460 # if extractor is known as geo bypassable
461 if not ip_block:
462 ip_blocks = geo_bypass_context.get('ip_blocks')
463 if self._GEO_BYPASS and ip_blocks:
464 ip_block = random.choice(ip_blocks)
465
466 if ip_block:
467 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
468 if self._downloader.params.get('verbose', False):
469 self._downloader.to_screen(
470 '[debug] Using fake IP %s as X-Forwarded-For.'
471 % self._x_forwarded_for_ip)
472 return
473
474 # Path 2: bypassing based on country code
475
476 # Explicit country code specified by user, use it right away
477 # regardless of whether extractor is geo bypassable or not
478 country = self._downloader.params.get('geo_bypass_country', None)
479
480 # Otherwise use random country code from geo bypass context but
481 # only if extractor is known as geo bypassable
482 if not country:
483 countries = geo_bypass_context.get('countries')
484 if self._GEO_BYPASS and countries:
485 country = random.choice(countries)
486
487 if country:
488 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
489 if self._downloader.params.get('verbose', False):
490 self._downloader.to_screen(
491 '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
492 % (self._x_forwarded_for_ip, country.upper()))
493
def extract(self, url):
    """Initialize the extractor, run _real_extract(url) and return its result.

    If a fake X-Forwarded-For IP is in use it is recorded in the result
    under '__x_forwarded_for_ip'. On a GeoRestrictedError the extraction
    is retried once when a fake IP could be set up for one of the
    error's countries. IncompleteRead, KeyError and StopIteration are
    re-raised as ExtractorError.
    """
    try:
        for _attempt in range(2):
            try:
                self.initialize()
                info = self._real_extract(url)
                fake_ip = self._x_forwarded_for_ip
                if fake_ip:
                    info['__x_forwarded_for_ip'] = fake_ip
                return info
            except GeoRestrictedError as geo_err:
                # Retry only if a fake IP has just been put in place
                if not self.__maybe_fake_ip_and_retry(geo_err.countries):
                    raise
    except ExtractorError:
        raise
    except compat_http_client.IncompleteRead as e:
        raise ExtractorError('A network error has occurred.', cause=e, expected=True)
    except (KeyError, StopIteration) as e:
        raise ExtractorError('An extractor error has occurred.', cause=e)
514
def __maybe_fake_ip_and_retry(self, countries):
    """Try to set up a fake X-Forwarded-For IP for one of the countries.

    Returns True (after emitting a warning) when a fake IP was generated
    and extraction should be retried, False otherwise. Nothing is
    attempted when the user gave an explicit geo_bypass_country, when geo
    bypass is disabled for the extractor or by the user, when a fake IP
    is already set, or when countries is empty.
    """
    params = self._downloader.params
    bypass_allowed = (
        not params.get('geo_bypass_country', None) and
        self._GEO_BYPASS and
        params.get('geo_bypass', True) and
        not self._x_forwarded_for_ip and
        countries)
    if not bypass_allowed:
        return False

    country_code = random.choice(countries)
    fake_ip = GeoUtils.random_ipv4(country_code)
    self._x_forwarded_for_ip = fake_ip
    if not fake_ip:
        return False

    self.report_warning(
        'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
        % (fake_ip, country_code.upper()))
    return True
529
def set_downloader(self, downloader):
    """Attach downloader to this IE (stored as self._downloader)."""
    setattr(self, '_downloader', downloader)
533
534 def _real_initialize(self):
535 """Real initialization process. Redefine in subclasses."""
536 pass
537
538 def _real_extract(self, url):
539 """Real extraction process. Redefine in subclasses."""
540 pass
541
@classmethod
def ie_key(cls):
    """Return the key for looking this IE up with get_info_extractor.

    It is the class name with its last two characters removed.
    """
    class_name = cls.__name__
    return compat_str(class_name[:-2])
546
@property
def IE_NAME(self):
    """Name of the IE: the instance's class name minus its last two characters."""
    class_name = type(self).__name__
    return compat_str(class_name[:-2])
550
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
    """
    Open url_or_request and return the response handle.

    note:    Progress message. None prints the default "Downloading
             webpage" report, False prints nothing.
    errnote: Error message prefix. None selects a default message, False
             makes a failed request return False silently.
    fatal:   When true a failed request raises ExtractorError, otherwise a
             warning is reported and False is returned.
    data, headers, query: applied to the request before it is opened.

    The passed headers mapping is never modified.
    """
    if note is None:
        self.report_download_webpage(video_id)
    elif note is not False:
        if video_id is None:
            self.to_screen('%s' % (note,))
        else:
            self.to_screen('%s: %s' % (video_id, note))

    # Some sites check X-Forwarded-For HTTP header in order to figure out
    # the origin of the client behind proxy. This allows bypassing geo
    # restriction by faking this header's value to IP that belongs to some
    # geo unrestricted country. We will do so once we encounter any
    # geo restriction error.
    if self._x_forwarded_for_ip:
        # Work on a copy: `headers` may be the shared default dict or a
        # dict owned by the caller, and writing the fake IP into it would
        # leak the header into unrelated requests and extractors.
        headers = (headers or {}).copy()
        headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip)

    if isinstance(url_or_request, compat_urllib_request.Request):
        url_or_request = update_Request(
            url_or_request, data=data, headers=headers, query=query)
    else:
        if query:
            url_or_request = update_url_query(url_or_request, query)
        if data is not None or headers:
            url_or_request = sanitized_Request(url_or_request, data, headers)
    try:
        return self._downloader.urlopen(url_or_request)
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        if errnote is False:
            return False
        if errnote is None:
            errnote = 'Unable to download webpage'

        errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
        if fatal:
            raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
        else:
            self._downloader.report_warning(errmsg)
            return False
592
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
    """
    Download a page and return a tuple (page content as string, URL handle).

    False is returned instead when the request failed non-fatally.
    """
    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, (compat_str, str)):
        url_or_request = url_or_request.split('#', 1)[0]

    urlh = self._request_webpage(
        url_or_request, video_id, note, errnote, fatal,
        data=data, headers=headers, query=query)
    if urlh is False:
        assert not fatal
        return False

    content = self._webpage_read_content(
        urlh, url_or_request, video_id, note, errnote, fatal,
        encoding=encoding)
    return (content, urlh)
605
606 @staticmethod
607 def _guess_encoding_from_content(content_type, webpage_bytes):
608 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
609 if m:
610 encoding = m.group(1)
611 else:
612 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
613 webpage_bytes[:1024])
614 if m:
615 encoding = m.group(1).decode('ascii')
616 elif webpage_bytes.startswith(b'\xff\xfe'):
617 encoding = 'utf-16'
618 else:
619 encoding = 'utf-8'
620
621 return encoding
622
def __check_blocked(self, content):
    """
    Raise an expected ExtractorError if content is a known block page.

    Three kinds of pages are recognized: the Websense filtering notice,
    the Indian censorship notice and the Russian TTK notice referring to
    blocklist.rkn.gov.ru. Nothing happens for any other content.
    """
    first_block = content[:512]
    if ('<title>Access to this site is blocked</title>' in content and
            'Websense' in first_block):
        msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
        blocked_iframe = self._html_search_regex(
            r'<iframe src="([^"]+)"', content,
            'Websense information URL', default=None)
        if blocked_iframe:
            msg += ' Visit %s for more details' % blocked_iframe
        raise ExtractorError(msg, expected=True)
    if '<title>The URL you requested has been blocked</title>' in first_block:
        msg = (
            'Access to this webpage has been blocked by Indian censorship. '
            'Use a VPN or proxy server (with --proxy) to route around it.')
        block_msg = self._html_search_regex(
            r'</h1><p>(.*?)</p>',
            content, 'block message', default=None)
        if block_msg:
            msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
        raise ExtractorError(msg, expected=True)
    # The page title is Russian for "Access to the resource is restricted".
    # It must be the real Cyrillic text: a mis-encoded literal can never
    # match the decoded page content.
    if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
            'blocklist.rkn.gov.ru' in content):
        raise ExtractorError(
            'Access to this webpage has been blocked by decision of the Russian government. '
            'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
            expected=True)
650
def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
    """
    Read the response body from urlh and return it decoded to a string.

    prefix, if given, is prepended to the bytes read. When no encoding is
    passed it is guessed from the Content-Type header and the body. With
    the 'dump_intermediate_pages' option the body is printed base64
    encoded, with 'write_pages' it is saved to a .dump file. The decoded
    content is checked for known block pages before being returned.
    """
    content_type = urlh.headers.get('Content-Type', '')
    raw = urlh.read()
    if prefix is not None:
        raw = prefix + raw
    encoding = encoding or self._guess_encoding_from_content(content_type, raw)

    params = self._downloader.params
    if params.get('dump_intermediate_pages', False):
        self.to_screen('Dumping request to ' + urlh.geturl())
        self._downloader.to_screen(base64.b64encode(raw).decode('ascii'))
    if params.get('write_pages', False):
        dump_name = '%s_%s' % (video_id, urlh.geturl())
        if len(dump_name) > 240:
            # Keep the name short, an md5 suffix keeps it distinguishable
            digest = '___' + hashlib.md5(dump_name.encode('utf-8')).hexdigest()
            dump_name = dump_name[:240 - len(digest)] + digest
        filename = sanitize_filename(dump_name + '.dump', restricted=True)
        self.to_screen('Saving request to ' + filename)
        # Working around MAX_PATH limitation on Windows (see
        # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
        if compat_os_name == 'nt':
            absfilepath = os.path.abspath(filename)
            if len(absfilepath) > 259:
                filename = '\\\\?\\' + absfilepath
        with open(filename, 'wb') as outf:
            outf.write(raw)

    try:
        content = raw.decode(encoding, 'replace')
    except LookupError:
        # Unknown encoding name, fall back to UTF-8
        content = raw.decode('utf-8', 'replace')

    self.__check_blocked(content)

    return content
687
688 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
689 """ Returns the data of the page as a string """
690 success = False
691 try_count = 0
692 while success is False:
693 try:
694 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
695 success = True
696 except compat_http_client.IncompleteRead as e:
697 try_count += 1
698 if try_count >= tries:
699 raise e
700 self._sleep(timeout, video_id)
701 if res is False:
702 return res
703 else:
704 content, _ = res
705 return content
706
707 def _download_xml_handle(
708 self, url_or_request, video_id, note='Downloading XML',
709 errnote='Unable to download XML', transform_source=None,
710 fatal=True, encoding=None, data=None, headers={}, query={}):
711 """Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle)"""
712 res = self._download_webpage_handle(
713 url_or_request, video_id, note, errnote, fatal=fatal,
714 encoding=encoding, data=data, headers=headers, query=query)
715 if res is False:
716 return res
717 xml_string, urlh = res
718 return self._parse_xml(
719 xml_string, video_id, transform_source=transform_source,
720 fatal=fatal), urlh
721
722 def _download_xml(self, url_or_request, video_id,
723 note='Downloading XML', errnote='Unable to download XML',
724 transform_source=None, fatal=True, encoding=None,
725 data=None, headers={}, query={}):
726 """Return the xml as an xml.etree.ElementTree.Element"""
727 res = self._download_xml_handle(
728 url_or_request, video_id, note=note, errnote=errnote,
729 transform_source=transform_source, fatal=fatal, encoding=encoding,
730 data=data, headers=headers, query=query)
731 return res if res is False else res[0]
732
def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
    """
    Parse xml_string and return the resulting element.

    transform_source, if given, is applied to the string first. On a
    parse error an ExtractorError is raised when fatal is true; otherwise
    a warning is reported and None is returned.
    """
    source = transform_source(xml_string) if transform_source else xml_string
    try:
        return compat_etree_fromstring(source.encode('utf-8'))
    except compat_xml_parse_error as parse_err:
        errmsg = '%s: Failed to parse XML ' % video_id
        if not fatal:
            self.report_warning(errmsg + str(parse_err))
            return None
        raise ExtractorError(errmsg, cause=parse_err)
744
745 def _download_json_handle(
746 self, url_or_request, video_id, note='Downloading JSON metadata',
747 errnote='Unable to download JSON metadata', transform_source=None,
748 fatal=True, encoding=None, data=None, headers={}, query={}):
749 """Return a tuple (JSON object, URL handle)"""
750 res = self._download_webpage_handle(
751 url_or_request, video_id, note, errnote, fatal=fatal,
752 encoding=encoding, data=data, headers=headers, query=query)
753 if res is False:
754 return res
755 json_string, urlh = res
756 return self._parse_json(
757 json_string, video_id, transform_source=transform_source,
758 fatal=fatal), urlh
759
760 def _download_json(
761 self, url_or_request, video_id, note='Downloading JSON metadata',
762 errnote='Unable to download JSON metadata', transform_source=None,
763 fatal=True, encoding=None, data=None, headers={}, query={}):
764 res = self._download_json_handle(
765 url_or_request, video_id, note=note, errnote=errnote,
766 transform_source=transform_source, fatal=fatal, encoding=encoding,
767 data=data, headers=headers, query=query)
768 return res if res is False else res[0]
769
770 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
771 if transform_source:
772 json_string = transform_source(json_string)
773 try:
774 return json.loads(json_string)
775 except ValueError as ve:
776 errmsg = '%s: Failed to parse JSON ' % video_id
777 if fatal:
778 raise ExtractorError(errmsg, cause=ve)
779 else:
780 self.report_warning(errmsg + str(ve))
781
def report_warning(self, msg, video_id=None):
    """Pass a warning to the downloader, prefixed with '[ie_name]' and,
    if video_id is given, with 'video_id: '."""
    if video_id is None:
        idstr = ''
    else:
        idstr = '%s: ' % video_id
    self._downloader.report_warning(
        '[%s] %s%s' % (self.IE_NAME, idstr, msg))
786
def to_screen(self, msg):
    """Hand msg to the downloader's to_screen(), prefixed with '[ie_name]'."""
    printer = self._downloader.to_screen
    printer('[%s] %s' % (self.IE_NAME, msg))
790
def report_extraction(self, id_or_name):
    """Announce on screen that information is being extracted."""
    message = '%s: Extracting information' % id_or_name
    self.to_screen(message)
794
def report_download_webpage(self, video_id):
    """Announce on screen that the webpage is being downloaded."""
    message = '%s: Downloading webpage' % video_id
    self.to_screen(message)
798
def report_age_confirmation(self):
    """Announce on screen the attempt to confirm age."""
    message = 'Confirming age'
    self.to_screen(message)
802
def report_login(self):
    """Announce on screen the attempt to log in."""
    message = 'Logging in'
    self.to_screen(message)
806
@staticmethod
def raise_login_required(msg='This video is only available for registered users'):
    """Raise an expected ExtractorError telling the user to provide
    account credentials; msg is put in front of that hint."""
    message = (
        '%s. Use --username and --password or --netrc to provide account credentials.'
        % msg)
    raise ExtractorError(message, expected=True)
812
@staticmethod
def raise_geo_restricted(
        msg='This video is not available from your location due to geo restriction',
        countries=None):
    """Raise a GeoRestrictedError carrying msg and the given countries value."""
    error = GeoRestrictedError(msg, countries=countries)
    raise error
816
817 # Methods for following #608
@staticmethod
def url_result(url, ie=None, video_id=None, video_title=None):
    """Returns a URL that points to a page that should be processed"""
    # TODO: ie should be the class used for getting the info
    result = {'_type': 'url', 'url': url, 'ie_key': ie}
    # 'id' and 'title' are only included when a value was supplied
    for key, value in (('id', video_id), ('title', video_title)):
        if value is not None:
            result[key] = value
    return result
830
def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
    """
    Build a playlist result out of matches.

    Each match (or getter(match) when a getter is given) is passed through
    _proto_relative_url and turned into a url_result for ie; the entries
    are then de-duplicated with orderedSet.
    """
    def to_entry(match):
        target = getter(match) if getter else match
        return self.url_result(self._proto_relative_url(target), ie)

    entries = orderedSet(to_entry(m) for m in matches)
    return self.playlist_result(
        entries, playlist_id=playlist_id, playlist_title=playlist_title)
837
@staticmethod
def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
    """Returns a playlist"""
    playlist = {'_type': 'playlist', 'entries': entries}
    # Optional fields are only set for truthy values
    optional_fields = (
        ('id', playlist_id),
        ('title', playlist_title),
        ('description', playlist_description),
    )
    for key, value in optional_fields:
        if value:
            playlist[key] = value
    return playlist
850
def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.

    When group is None the first capture group that is not None is
    returned, otherwise mobj.group(group). An empty collection of patterns
    is handled like a failed search.
    """
    # Start from "no match" so that an empty collection of patterns falls
    # through to the default/fatal handling below instead of leaving mobj
    # unbound.
    mobj = None
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        mobj = re.search(pattern, string, flags)
    else:
        for p in pattern:
            mobj = re.search(p, string, flags)
            if mobj:
                break

    # The field name is wrapped in ANSI colour codes only when colours are
    # not disabled, the OS is not 'nt' and stderr is a terminal
    if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
        _name = '\033[0;34m%s\033[0m' % name
    else:
        _name = name

    if mobj:
        if group is None:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        else:
            return mobj.group(group)
    elif default is not NO_DEFAULT:
        return default
    elif fatal:
        raise RegexNotFoundError('Unable to extract %s' % _name)
    else:
        self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
        return None
884
885 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
886 """
887 Like _search_regex, but strips HTML tags and unescapes entities.
888 """
889 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
890 if res:
891 return clean_html(res).strip()
892 else:
893 return res
894
def _get_netrc_login_info(self, netrc_machine=None):
    """
    Look up (username, password) for netrc_machine (falling back to
    _NETRC_MACHINE) in the user's netrc data.

    Returns (None, None) when the 'usenetrc' param is not set. A missing
    entry, an IOError or a netrc parse error is reported as a warning and
    also yields (None, None).
    """
    netrc_machine = netrc_machine or self._NETRC_MACHINE
    username = password = None

    if not self._downloader.params.get('usenetrc', False):
        return username, password

    try:
        info = netrc.netrc().authenticators(netrc_machine)
        if info is None:
            # Raised on purpose so that the handler below reports it
            raise netrc.NetrcParseError(
                'No authenticators for %s' % netrc_machine)
        username, password = info[0], info[2]
    except (IOError, netrc.NetrcParseError) as err:
        self._downloader.report_warning(
            'parsing .netrc: %s' % error_to_compat_str(err))

    return username, password
914
def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
    """
    Get the login info as (username, password)
    First look for the manually specified credentials using username_option
    and password_option as keys in params dictionary. If no such credentials
    available look in the netrc file using the netrc_machine or _NETRC_MACHINE
    value.
    If there's no info available, return (None, None)

    If a username is present in params but password_option is missing,
    the password is returned as None.
    """
    if self._downloader is None:
        return (None, None)

    downloader_params = self._downloader.params

    # Attempt to use provided username and password or .netrc data
    username = downloader_params.get(username_option)
    if username is not None:
        # .get() instead of indexing: params may carry a username without
        # any password entry, which must not end in a KeyError
        password = downloader_params.get(password_option)
    else:
        username, password = self._get_netrc_login_info(netrc_machine)

    return username, password
937
def _get_tfa_info(self, note='two-factor verification code'):
    """
    Get the two-factor authentication info
    TODO - asking the user will be required for sms/phone verify
    currently just uses the command line option
    If there's no info available, return None
    """
    downloader = self._downloader
    if downloader is None:
        return None

    twofactor = downloader.params.get('twofactor')
    if twofactor is not None:
        return twofactor

    # No code supplied through params: prompt for it
    return compat_getpass('Type %s and press [Return]: ' % note)
953
954 # Helper functions for extracting OpenGraph info
@staticmethod
def _og_regexes(prop):
    """
    Return two regexes matching a <meta> tag for the OpenGraph property
    prop: one with the property attribute before the content attribute
    and one with the reverse order.
    """
    content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
    property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
                   % {'prop': re.escape(prop)})
    template = r'<meta[^>]+?%s[^>]+?%s'
    attribute_orders = (
        (property_re, content_re),
        (content_re, property_re),
    )
    return [template % order for order in attribute_orders]
965
@staticmethod
def _meta_regex(prop):
    """
    Return a verbose, case-insensitive regex matching a <meta> tag whose
    itemprop, name, property, id or http-equiv attribute equals prop; the
    value of its content attribute is captured in the group 'content'.
    """
    escaped_prop = re.escape(prop)
    return r'''(?isx)<meta
                (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
                [^>]+?content=(["\'])(?P<content>.*?)\2''' % escaped_prop
971
def _og_search_property(self, prop, html, name=None, **kargs):
    """
    Search html for the OpenGraph property prop (a single name or a
    list/tuple of alternatives) and return its HTML-unescaped content.

    name defaults to 'OpenGraph <first prop>'. Extra keyword arguments are
    passed to _search_regex; a None result of that search is returned as
    None.
    """
    props = prop if isinstance(prop, (list, tuple)) else [prop]
    if name is None:
        name = 'OpenGraph %s' % props[0]
    og_regexes = []
    for candidate in props:
        og_regexes += self._og_regexes(candidate)
    escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
    return None if escaped is None else unescapeHTML(escaped)
984
def _og_search_thumbnail(self, html, **kargs):
    """Non-fatal search for the OpenGraph 'image' property, reported as 'thumbnail URL'."""
    return self._og_search_property(
        'image', html, 'thumbnail URL',
        fatal=False, **kargs)
987
def _og_search_description(self, html, **kargs):
    """Non-fatal search for the OpenGraph 'description' property."""
    return self._og_search_property(
        'description', html,
        fatal=False, **kargs)
990
def _og_search_title(self, html, **kargs):
    """Search for the OpenGraph 'title' property; keyword arguments go to _og_search_property."""
    og_property = 'title'
    return self._og_search_property(og_property, html, **kargs)
993
def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
    """
    Search html for an OpenGraph video URL.

    The properties 'video' and 'video:url' are tried in that order; with
    secure set, 'video:secure_url' is tried before them. The lookup goes
    through _html_search_regex.
    """
    props = ['video', 'video:url']
    if secure:
        props.insert(0, 'video:secure_url')
    regexes = []
    for prop in props:
        regexes += self._og_regexes(prop)
    return self._html_search_regex(regexes, html, name, **kargs)
999
def _og_search_url(self, html, **kargs):
    """Search for the OpenGraph 'url' property; keyword arguments go to _og_search_property."""
    og_property = 'url'
    return self._og_search_property(og_property, html, **kargs)
1002
def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
    """
    Return the content of the <meta> tag identified by name (a single name
    or a list/tuple of alternatives), looked up via _html_search_regex.

    display_name defaults to the first name. Non-fatal by default.
    """
    names = name if isinstance(name, (list, tuple)) else [name]
    if display_name is None:
        display_name = names[0]
    patterns = [self._meta_regex(candidate) for candidate in names]
    return self._html_search_regex(
        patterns, html, display_name,
        fatal=fatal, group='content', **kwargs)
1011
def _dc_search_uploader(self, html):
    """Return the content of the 'dc.creator' meta tag, reported as 'uploader'."""
    meta_name, display_name = 'dc.creator', 'uploader'
    return self._html_search_meta(meta_name, html, display_name)
1014
def _rta_search(self, html):
    """Return 18 if html carries the RTA rating meta label, 0 otherwise."""
    # See http://www.rtalabel.org/index.php?content=howtofaq#single
    rta_label_re = (
        r'(?ix)<meta\s+name="rating"\s+'
        r' content="RTA-5042-1996-1400-1577-RTA"')
    return 18 if re.search(rta_label_re, html) else 0
1022
def _media_rating_search(self, html):
    """
    Map the content of the 'rating' meta tag to a number.

    Returns None when the tag is missing/empty or its lower-cased value is
    not one of the known ratings.
    """
    # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
    rating = self._html_search_meta('rating', html)
    if not rating:
        return None

    age_by_rating = {
        'safe for kids': 0,
        'general': 8,
        '14 years': 14,
        'mature': 17,
        'restricted': 19,
    }
    return age_by_rating.get(rating.lower())
1038
def _family_friendly_search(self, html):
    """
    Map the 'isFamilyFriendly' meta tag to a number: 0 for '1'/'true',
    18 for '0'/'false' (case-insensitive), None when missing or unknown.
    """
    # See http://schema.org/VideoObject
    family_friendly = self._html_search_meta(
        'isFamilyFriendly', html, default=None)
    if not family_friendly:
        return None

    age_by_flag = {
        '1': 0,
        'true': 0,
        '0': 18,
        'false': 18,
    }
    return age_by_flag.get(family_friendly.lower())
1054
def _twitter_search_player(self, html):
    """Return the content of the 'twitter:player' meta tag, reported as 'twitter card player'."""
    meta_name, display_name = 'twitter:player', 'twitter card player'
    return self._html_search_meta(meta_name, html, display_name)
1058
def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
    """
    Find the first <script type="application/ld+json"> block in html and
    hand its content to _json_ld.

    Keyword arguments are passed to _search_regex. When nothing is found,
    the supplied default is returned, or {} if no default was given.
    """
    json_ld = self._search_regex(
        r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
        html, 'JSON-LD', group='json_ld', **kwargs)
    default = kwargs.get('default', NO_DEFAULT)
    if not json_ld:
        if default is not NO_DEFAULT:
            return default
        return {}
    # JSON-LD may be malformed and thus `fatal` should be respected.
    # At the same time `default` may be passed that assumes `fatal=False`
    # for _search_regex. Let's simulate the same behavior here as well.
    if default == NO_DEFAULT:
        fatal = kwargs.get('fatal', True)
    else:
        fatal = False
    return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1071
def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
    """
    Build an info dict from JSON-LD data.

    json_ld is either a string (decoded here with _parse_json, honouring
    fatal) or already decoded data. A single dict is treated as a one
    element list; anything that is not a dict, list or tuple yields {}.
    Only items whose @context is a schema.org URL are examined.
    If expected_type is given and such an item has a different @type, the
    info collected so far is returned immediately.
    On the regular return path keys whose value is None are dropped.
    """
    if isinstance(json_ld, compat_str):
        json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
    if not json_ld:
        return {}
    info = {}
    if not isinstance(json_ld, (list, tuple, dict)):
        return info
    if isinstance(json_ld, dict):
        json_ld = [json_ld]

    # Last path component of a schema.org interactionType -> prefix of the
    # '<prefix>_count' info key
    INTERACTION_TYPE_MAP = {
        'CommentAction': 'comment',
        'AgreeAction': 'like',
        'DisagreeAction': 'dislike',
        'LikeAction': 'like',
        'DislikeAction': 'dislike',
        'ListenAction': 'view',
        'WatchAction': 'view',
        'ViewAction': 'view',
    }

    def extract_interaction_statistic(e):
        # Fill '<kind>_count' keys of info from e['interactionStatistic'];
        # malformed entries and unknown interaction types are skipped
        interaction_statistic = e.get('interactionStatistic')
        if not isinstance(interaction_statistic, list):
            return
        for is_e in interaction_statistic:
            if not isinstance(is_e, dict):
                continue
            if is_e.get('@type') != 'InteractionCounter':
                continue
            interaction_type = is_e.get('interactionType')
            if not isinstance(interaction_type, compat_str):
                continue
            interaction_count = int_or_none(is_e.get('userInteractionCount'))
            if interaction_count is None:
                continue
            count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
            if not count_kind:
                continue
            count_key = '%s_count' % count_kind
            # A count that is already set (not None) is not overwritten
            if info.get(count_key) is not None:
                continue
            info[count_key] = interaction_count

    def extract_video_object(e):
        # Copy the fields of a VideoObject item into info
        assert e['@type'] == 'VideoObject'
        info.update({
            'url': e.get('contentUrl'),
            'title': unescapeHTML(e.get('name')),
            'description': unescapeHTML(e.get('description')),
            'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
            'duration': parse_duration(e.get('duration')),
            'timestamp': unified_timestamp(e.get('uploadDate')),
            'filesize': float_or_none(e.get('contentSize')),
            'tbr': int_or_none(e.get('bitrate')),
            'width': int_or_none(e.get('width')),
            'height': int_or_none(e.get('height')),
            'view_count': int_or_none(e.get('interactionCount')),
        })
        extract_interaction_statistic(e)

    for e in json_ld:
        if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')):
            item_type = e.get('@type')
            if expected_type is not None and expected_type != item_type:
                return info
            if item_type in ('TVEpisode', 'Episode'):
                info.update({
                    'episode': unescapeHTML(e.get('name')),
                    'episode_number': int_or_none(e.get('episodeNumber')),
                    'description': unescapeHTML(e.get('description')),
                })
                part_of_season = e.get('partOfSeason')
                if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
                    info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
                part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
                if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
                    info['series'] = unescapeHTML(part_of_series.get('name'))
            elif item_type in ('Article', 'NewsArticle'):
                info.update({
                    'timestamp': parse_iso8601(e.get('datePublished')),
                    'title': unescapeHTML(e.get('headline')),
                    'description': unescapeHTML(e.get('articleBody')),
                })
            elif item_type == 'VideoObject':
                extract_video_object(e)
                # A top-level VideoObject does not end the scan: move on to
                # the next item
                continue
            # Any other schema.org item may embed a VideoObject under
            # 'video'; the scan stops after the first such item
            video = e.get('video')
            if isinstance(video, dict) and video.get('@type') == 'VideoObject':
                extract_video_object(video)
            break
    return dict((k, v) for k, v in info.items() if v is not None)
1165
@staticmethod
def _hidden_inputs(html):
    """
    Return a dict of the <input> tags of type 'hidden' or 'submit' in html.

    HTML comments are removed first (single-line ones only, as the pattern
    is applied without DOTALL). Each entry is keyed by the tag's name
    attribute, or its id when there is no name; tags without a key or
    without a value attribute are skipped.
    """
    html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
    hidden_inputs = {}
    for input_tag in re.findall(r'(?i)(<input[^>]+>)', html):
        attrs = extract_attributes(input_tag)
        # Check the parsed attributes: the matched tag text itself can
        # never be empty, so testing it was a no-op
        if not attrs:
            continue
        if attrs.get('type') not in ('hidden', 'submit'):
            continue
        name = attrs.get('name') or attrs.get('id')
        value = attrs.get('value')
        if name and value is not None:
            hidden_inputs[name] = value
    return hidden_inputs
1181
def _form_hidden_inputs(self, form_id, html):
    """
    Return the _hidden_inputs of the <form> whose id attribute matches
    form_id. form_id is inserted into the regex as is.
    """
    form_re = r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id
    form_html = self._search_regex(
        form_re, html, '%s form' % form_id, group='form')
    return self._hidden_inputs(form_html)
1187
def _sort_formats(self, formats, field_preference=None):
    """
    Sort the list of format dicts in place, in ascending order of the key
    computed below.

    Raises ExtractorError when formats is empty. When field_preference is
    a list or tuple, only these fields are compared, in the given order.
    As a side effect a missing 'tbr' may be filled in from abr + vbr and a
    missing 'ext' from the format URL.
    """
    if not formats:
        raise ExtractorError('No video formats found')

    for f in formats:
        # Automatically determine tbr when missing based on abr and vbr (improves
        # formats sorting in some cases)
        if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
            f['tbr'] = f['abr'] + f['vbr']

    def _formats_key(f):
        # TODO remove the following workaround
        from ..utils import determine_ext
        if not f.get('ext') and 'url' in f:
            f['ext'] = determine_ext(f['url'])

        if isinstance(field_preference, (list, tuple)):
            # Missing values sort lowest: '' for format_id, -1 otherwise
            return tuple(
                f.get(field)
                if f.get(field) is not None
                else ('' if field == 'format_id' else -1)
                for field in field_preference)

        preference = f.get('preference')
        if preference is None:
            preference = 0
            if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                preference -= 0.5

        protocol = f.get('protocol') or determine_protocol(f)
        proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)

        # The position in ORDER is used as the extension preference, so
        # later entries sort higher; extensions not listed get -1
        if f.get('vcodec') == 'none':  # audio only
            preference -= 50
            if self._downloader.params.get('prefer_free_formats'):
                ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
            else:
                ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
            ext_preference = 0
            try:
                audio_ext_preference = ORDER.index(f['ext'])
            except ValueError:
                audio_ext_preference = -1
        else:
            if f.get('acodec') == 'none':  # video only
                preference -= 40
            if self._downloader.params.get('prefer_free_formats'):
                ORDER = ['flv', 'mp4', 'webm']
            else:
                ORDER = ['webm', 'flv', 'mp4']
            try:
                ext_preference = ORDER.index(f['ext'])
            except ValueError:
                ext_preference = -1
            audio_ext_preference = 0

        # Compared element by element, so earlier entries take precedence;
        # missing numeric fields count as -1
        return (
            preference,
            f.get('language_preference') if f.get('language_preference') is not None else -1,
            f.get('quality') if f.get('quality') is not None else -1,
            f.get('tbr') if f.get('tbr') is not None else -1,
            f.get('filesize') if f.get('filesize') is not None else -1,
            f.get('vbr') if f.get('vbr') is not None else -1,
            f.get('height') if f.get('height') is not None else -1,
            f.get('width') if f.get('width') is not None else -1,
            proto_preference,
            ext_preference,
            f.get('abr') if f.get('abr') is not None else -1,
            audio_ext_preference,
            f.get('fps') if f.get('fps') is not None else -1,
            f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
            f.get('source_preference') if f.get('source_preference') is not None else -1,
            f.get('format_id') if f.get('format_id') is not None else '',
        )
    formats.sort(key=_formats_key)
1263
def _check_formats(self, formats, video_id):
    """
    Remove, in place, every format whose URL does not pass _is_valid_url.
    Does nothing when formats is empty or None.
    """
    if not formats:
        return

    def is_available(f):
        format_id = f.get('format_id')
        item = '%s video format' % format_id if format_id else 'video'
        return self._is_valid_url(f['url'], video_id, item=item)

    formats[:] = [f for f in formats if is_available(f)]
1271
@staticmethod
def _remove_duplicate_formats(formats):
    """Drop, in place, every format whose 'url' was already seen; the first occurrence is kept."""
    seen_urls = set()
    deduped = []
    for fmt in formats:
        url = fmt['url']
        if url in seen_urls:
            continue
        seen_urls.add(url)
        deduped.append(fmt)
    formats[:] = deduped
1281
def _is_valid_url(self, url, video_id, item='video', headers={}):
    """
    Check whether url can be requested.

    Protocol-relative URLs are completed with 'http:'. Returns True for
    non HTTP(S) URLs without checking and for successful requests, False
    (after a screen message) when the request fails with an
    ExtractorError caused by a URLError. Other ExtractorErrors propagate.
    """
    url = self._proto_relative_url(url, scheme='http:')
    # For now assume non HTTP(S) URLs always valid
    if not url.startswith(('http://', 'https://')):
        return True
    try:
        self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
    except ExtractorError as e:
        if not isinstance(e.cause, compat_urllib_error.URLError):
            raise
        self.to_screen(
            '%s: %s URL is invalid, skipping' % (video_id, item))
        return False
    return True
1296
def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    if self._downloader.params.get('prefer_insecure', False):
        return 'http:'
    return 'https:'
1303
def _proto_relative_url(self, url, scheme=None):
    """
    Prefix a protocol-relative url ('//...') with scheme, which defaults
    to http_scheme(). None and all other URLs are returned unchanged.
    """
    if url is None or not url.startswith('//'):
        return url
    if scheme is None:
        scheme = self.http_scheme()
    return scheme + url
1313
def _sleep(self, timeout, video_id, msg_template=None):
    """
    Print a waiting message and sleep for timeout seconds.

    msg_template is %-formatted with the keys 'video_id' and 'timeout'.
    """
    if msg_template is None:
        template = '%(video_id)s: Waiting for %(timeout)s seconds'
    else:
        template = msg_template
    self.to_screen(template % {'video_id': video_id, 'timeout': timeout})
    time.sleep(timeout)
1320
def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
                         transform_source=lambda s: fix_xml_ampersands(s).strip(),
                         fatal=True, m3u8_id=None):
    """
    Download the f4m manifest at manifest_url and return the formats that
    _parse_f4m_formats builds from it. Returns [] when the download
    yields False.
    """
    # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
    # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244),
    # hence the transform_source applied to the downloaded document
    manifest = self._download_xml(
        manifest_url, video_id, 'Downloading f4m manifest',
        'Unable to download f4m manifest',
        transform_source=transform_source, fatal=fatal)
    if manifest is False:
        return []

    parse_options = {
        'preference': preference,
        'f4m_id': f4m_id,
        'transform_source': transform_source,
        'fatal': fatal,
        'm3u8_id': m3u8_id,
    }
    return self._parse_f4m_formats(manifest, manifest_url, video_id, **parse_options)
1338
def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                       transform_source=lambda s: fix_xml_ampersands(s).strip(),
                       fatal=True, m3u8_id=None):
    """
    Build a list of format dicts from an already parsed f4m manifest.

    Returns [] when the manifest carries a non-empty Akamai player
    verification challenge or when no media nodes remain after
    remove_encrypted_media. Media entries that point to another f4m or to
    an m3u8 manifest are expanded through _extract_f4m_formats and
    _extract_m3u8_formats respectively.
    """
    # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
    akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
    if akamai_pv is not None and ';' in akamai_pv.text:
        playerVerificationChallenge = akamai_pv.text.split(';')[0]
        if playerVerificationChallenge.strip() != '':
            return []

    formats = []
    # Media nodes are looked up in the 1.0 namespace first and in the 2.0
    # namespace only if none were found
    manifest_version = '1.0'
    media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
    if not media_nodes:
        manifest_version = '2.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
    # Remove unsupported DRM protected media from final formats
    # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
    media_nodes = remove_encrypted_media(media_nodes)
    if not media_nodes:
        return formats

    manifest_base_url = get_base_url(manifest)

    bootstrap_info = xpath_element(
        manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
        'bootstrap info', default=None)

    # An audio/* mime type on the manifest marks all formats as audio only
    vcodec = None
    mime_type = xpath_text(
        manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
        'base URL', default=None)
    if mime_type and mime_type.startswith('audio/'):
        vcodec = 'none'

    for i, media_el in enumerate(media_nodes):
        tbr = int_or_none(media_el.attrib.get('bitrate'))
        width = int_or_none(media_el.attrib.get('width'))
        height = int_or_none(media_el.attrib.get('height'))
        # The bitrate, or the node index when there is no bitrate, makes up
        # the format id, prefixed with f4m_id when given
        format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
        # If <bootstrapInfo> is present, the specified f4m is a
        # stream-level manifest, and only set-level manifests may refer to
        # external resources. See section 11.4 and section 4 of F4M spec
        if bootstrap_info is None:
            media_url = None
            # @href is introduced in 2.0, see section 11.6 of F4M spec
            if manifest_version == '2.0':
                media_url = media_el.attrib.get('href')
            if media_url is None:
                media_url = media_el.attrib.get('url')
            if not media_url:
                continue
            # NOTE: manifest_url is rebound here, so later iterations
            # resolve relative media URLs against the previous entry's URL
            # when the manifest has no base URL
            manifest_url = (
                media_url if media_url.startswith('http://') or media_url.startswith('https://')
                else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
            # If media_url is itself a f4m manifest do the recursive extraction
            # since bitrates in parent manifest (this one) and media_url manifest
            # may differ leading to inability to resolve the format by requested
            # bitrate in f4m downloader
            ext = determine_ext(manifest_url)
            if ext == 'f4m':
                f4m_formats = self._extract_f4m_formats(
                    manifest_url, video_id, preference=preference, f4m_id=f4m_id,
                    transform_source=transform_source, fatal=fatal)
                # Sometimes stream-level manifest contains single media entry that
                # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
                # At the same time parent's media entry in set-level manifest may
                # contain it. We will copy it from parent in such cases.
                if len(f4m_formats) == 1:
                    f = f4m_formats[0]
                    f.update({
                        'tbr': f.get('tbr') or tbr,
                        'width': f.get('width') or width,
                        'height': f.get('height') or height,
                        'format_id': f.get('format_id') if not tbr else format_id,
                        'vcodec': vcodec,
                    })
                formats.extend(f4m_formats)
                continue
            elif ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    manifest_url, video_id, 'mp4', preference=preference,
                    m3u8_id=m3u8_id, fatal=fatal))
                continue
        formats.append({
            'format_id': format_id,
            'url': manifest_url,
            'manifest_url': manifest_url,
            'ext': 'flv' if bootstrap_info is not None else None,
            'protocol': 'f4m',
            'tbr': tbr,
            'width': width,
            'height': height,
            'vcodec': vcodec,
            'preference': preference,
        })
    return formats
1436
def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
    """
    Return a format dict for the m3u8 URL itself ('Quality selection URL').

    Its preference is the given one lowered by 100, or -100 when
    preference is falsy; its format id is 'meta', prefixed with m3u8_id
    when given.
    """
    id_parts = [part for part in (m3u8_id, 'meta') if part]
    if preference:
        meta_preference = preference - 100
    else:
        meta_preference = -100
    return {
        'format_id': '-'.join(id_parts),
        'url': m3u8_url,
        'ext': ext,
        'protocol': 'm3u8',
        'preference': meta_preference,
        'resolution': 'multiple',
        'format_note': 'Quality selection URL',
    }
1447
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                          entry_protocol='m3u8', preference=None,
                          m3u8_id=None, note=None, errnote=None,
                          fatal=True, live=False):
    """
    Download the m3u8 playlist at m3u8_url and return the formats that
    _parse_m3u8_formats builds from it, using the URL reported by the
    response handle (geturl()) as the playlist URL. Returns [] when the
    download yields False.
    """
    handle_result = self._download_webpage_handle(
        m3u8_url, video_id,
        note=note or 'Downloading m3u8 information',
        errnote=errnote or 'Failed to download m3u8 information',
        fatal=fatal)
    if handle_result is False:
        return []

    m3u8_doc, urlh = handle_result
    return self._parse_m3u8_formats(
        m3u8_doc, urlh.geturl(), ext=ext, entry_protocol=entry_protocol,
        preference=preference, m3u8_id=m3u8_id, live=live)
1467
def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
                        entry_protocol='m3u8', preference=None,
                        m3u8_id=None, live=False):
    """
    Build a list of format dicts from the text of an m3u8 playlist.

    Returns [] for playlists carrying an Adobe Flash Access tag or an
    Apple FairPlay session key. A media playlist (one containing
    #EXT-X-TARGETDURATION) is returned as a single format pointing at
    m3u8_url. For a master playlist one format is produced per
    #EXT-X-MEDIA rendition of type VIDEO or AUDIO that has a URI and one
    per variant stream URI line.
    """
    if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
        return []

    if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
        return []

    formats = []

    # Absolute http(s) URLs are kept, everything else is resolved against
    # the playlist URL
    format_url = lambda u: (
        u
        if re.match(r'^https?://', u)
        else compat_urlparse.urljoin(m3u8_url, u))

    # References:
    # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
    # 2. https://github.com/rg3/youtube-dl/issues/12211

    # We should try extracting formats only from master playlists [1, 4.3.4],
    # i.e. playlists that describe available qualities. On the other hand
    # media playlists [1, 4.3.3] should be returned as is since they contain
    # just the media without qualities renditions.
    # Fortunately, master playlist can be easily distinguished from media
    # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
    # master playlist tags MUST NOT appear in a media playlist and vice versa.
    # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
    # media playlist and MUST NOT appear in master playlist thus we can
    # clearly detect media playlist with this criterion.

    if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
        return [{
            'url': m3u8_url,
            'format_id': m3u8_id,
            'ext': ext,
            'protocol': entry_protocol,
            'preference': preference,
        }]

    # GROUP-ID -> list of the #EXT-X-MEDIA attribute dicts of that group
    groups = {}
    # Attributes of the most recent #EXT-X-STREAM-INF tag; reset after the
    # URI line following it has been processed
    last_stream_inf = {}

    def extract_media(x_media_line):
        media = parse_m3u8_attributes(x_media_line)
        # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
        media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
        if not (media_type and group_id and name):
            return
        groups.setdefault(group_id, []).append(media)
        if media_type not in ('VIDEO', 'AUDIO'):
            return
        media_url = media.get('URI')
        if media_url:
            format_id = []
            for v in (m3u8_id, group_id, name):
                if v:
                    format_id.append(v)
            f = {
                'format_id': '-'.join(format_id),
                'url': format_url(media_url),
                'manifest_url': m3u8_url,
                'language': media.get('LANGUAGE'),
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }
            if media_type == 'AUDIO':
                f['vcodec'] = 'none'
            formats.append(f)

    def build_stream_name():
        # Despite specification does not mention NAME attribute for
        # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
        # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
        # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
        stream_name = last_stream_inf.get('NAME')
        if stream_name:
            return stream_name
        # If there is no NAME in EXT-X-STREAM-INF it will be obtained
        # from corresponding rendition group
        stream_group_id = last_stream_inf.get('VIDEO')
        if not stream_group_id:
            return
        stream_group = groups.get(stream_group_id)
        if not stream_group:
            return stream_group_id
        rendition = stream_group[0]
        return rendition.get('NAME') or stream_group_id

    for line in m3u8_doc.splitlines():
        if line.startswith('#EXT-X-STREAM-INF:'):
            last_stream_inf = parse_m3u8_attributes(line)
        elif line.startswith('#EXT-X-MEDIA:'):
            extract_media(line)
        elif line.startswith('#') or not line.strip():
            continue
        else:
            # Any other non-empty line is taken as the URI of a variant
            # stream described by the preceding #EXT-X-STREAM-INF
            tbr = float_or_none(
                last_stream_inf.get('AVERAGE-BANDWIDTH') or
                last_stream_inf.get('BANDWIDTH'), scale=1000)
            format_id = []
            if m3u8_id:
                format_id.append(m3u8_id)
            stream_name = build_stream_name()
            # Bandwidth of live streams may differ over time thus making
            # format_id unpredictable. So it's better to keep provided
            # format_id intact.
            if not live:
                format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
            manifest_url = format_url(line.strip())
            f = {
                'format_id': '-'.join(format_id),
                'url': manifest_url,
                'manifest_url': m3u8_url,
                'tbr': tbr,
                'ext': ext,
                'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                'protocol': entry_protocol,
                'preference': preference,
            }
            resolution = last_stream_inf.get('RESOLUTION')
            if resolution:
                mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                if mobj:
                    f['width'] = int(mobj.group('width'))
                    f['height'] = int(mobj.group('height'))
            # Unified Streaming Platform
            mobj = re.search(
                r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
            if mobj:
                abr, vbr = mobj.groups()
                abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                f.update({
                    'vbr': vbr,
                    'abr': abr,
                })
            codecs = parse_codecs(last_stream_inf.get('CODECS'))
            f.update(codecs)
            audio_group_id = last_stream_inf.get('AUDIO')
            # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
            # references a rendition group MUST have a CODECS attribute.
            # However, this is not always respected, for example, [2]
            # contains EXT-X-STREAM-INF tag which references AUDIO
            # rendition group but does not have CODECS and despite
            # referencing an audio group, it represents
            # a complete (with audio and video) format. So, for such cases
            # we will ignore references to rendition groups and treat them
            # as complete formats.
            if audio_group_id and codecs and f.get('vcodec') != 'none':
                audio_group = groups.get(audio_group_id)
                if audio_group and audio_group[0].get('URI'):
                    # TODO: update acodec for audio only formats with
                    # the same GROUP-ID
                    f['acodec'] = 'none'
            formats.append(f)
            last_stream_inf = {}
    return formats
1626
@staticmethod
def _xpath_ns(path, namespace=None):
    """
    Qualify every step of path with '{namespace}'. Empty steps and '.'
    are left alone; without a namespace the path is returned unchanged.
    """
    if not namespace:
        return path
    return '/'.join(
        step if not step or step == '.' else '{%s}%s' % (namespace, step)
        for step in path.split('/'))
1638
def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
    """
    Download the SMIL file at smil_url and return the formats that
    _parse_smil_formats builds from it. Returns [] when the download
    yields False.
    """
    smil = self._download_smil(
        smil_url, video_id, fatal=fatal, transform_source=transform_source)
    if smil is False:
        assert not fatal
        return []

    return self._parse_smil_formats(
        smil, smil_url, video_id,
        namespace=self._parse_smil_namespace(smil), f4m_params=f4m_params)
1650
def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
    """
    Download the SMIL file at smil_url and return the info dict that
    _parse_smil builds from it, or {} when the download yields False.
    """
    smil = self._download_smil(smil_url, video_id, fatal=fatal)
    if smil is not False:
        return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
    return {}
1656
def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
    """Download the SMIL file at smil_url via _download_xml and return its result."""
    note = 'Downloading SMIL file'
    errnote = 'Unable to download SMIL file'
    return self._download_xml(
        smil_url, video_id, note, errnote,
        fatal=fatal, transform_source=transform_source)
1661
def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
    """
    Build an info dict (id, title, description, upload_date, thumbnails,
    formats, subtitles) from a parsed SMIL document.

    The passed video_id is only used for format parsing; the returned id
    is the base name of smil_url without its extension, which also serves
    as the title when the document provides none.
    """
    namespace = self._parse_smil_namespace(smil)

    formats = self._parse_smil_formats(
        smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
    subtitles = self._parse_smil_subtitles(smil, namespace=namespace)

    video_id = os.path.splitext(url_basename(smil_url))[0]

    # For each field the first matching <meta> with a non-empty content wins
    title = description = upload_date = None
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        name, content = meta.attrib.get('name'), meta.attrib.get('content')
        if not (name and content):
            continue
        if name == 'title' and not title:
            title = content
        elif name in ('description', 'abstract') and not description:
            description = content
        elif name == 'date' and not upload_date:
            upload_date = unified_strdate(content)

    thumbnails = []
    for image in smil.findall(self._xpath_ns('.//image', namespace)):
        if not image.get('src'):
            continue
        thumbnails.append({
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        })

    return {
        'id': video_id,
        'title': title or video_id,
        'description': description,
        'upload_date': upload_date,
        'thumbnails': thumbnails,
        'formats': formats,
        'subtitles': subtitles,
    }
1701
def _parse_smil_namespace(self, smil):
    """Look up the namespace URI embedded in the root tag ('{uri}smil').

    The lookup is delegated to _search_regex with default=None for the
    case where the tag does not match.
    """
    root_tag_re = r'(?i)^{([^}]+)?}smil$'
    return self._search_regex(
        root_tag_re, smil.tag, 'namespace', default=None)
1705
def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
    """Return the list of formats described by the media elements of a SMIL tree.

    Every <video> and <audio> element with a not yet seen src is turned
    into one or more formats depending on its kind:
      * RTMP (proto == 'rtmp' or the streamer URL starts with 'rtmp'):
        a single flv format, optionally rewritten by transform_rtmp_url,
        a callable taking (streamer, src) and returning the new pair;
      * HLS (proto or extension 'm3u8'): formats from the m3u8 manifest;
      * HDS (extension 'f4m'): formats from the f4m manifest, requested
        with f4m_params (a default set is used when none is given);
      * anything else reachable over HTTP that passes _is_valid_url.

    The base used to resolve relative src values is taken from the first
    ./head/meta element carrying a 'base' or 'httpBase' attribute and
    falls back to smil_url.
    """
    base = smil_url
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        b = meta.get('base') or meta.get('httpBase')
        if b:
            base = b
            break

    formats = []
    # Per-protocol counters used as format_id suffix when no bitrate is known
    rtmp_count = 0
    http_count = 0
    m3u8_count = 0

    seen_srcs = set()
    media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
    for medium in media:
        src = medium.get('src')
        if not src or src in seen_srcs:
            continue
        seen_srcs.add(src)

        bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
        filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
        width = int_or_none(medium.get('width'))
        height = int_or_none(medium.get('height'))
        proto = medium.get('proto')
        ext = medium.get('ext')
        src_ext = determine_ext(src)
        streamer = medium.get('streamer') or base

        if proto == 'rtmp' or streamer.startswith('rtmp'):
            rtmp_count += 1
            formats.append({
                'url': streamer,
                'play_path': src,
                'ext': 'flv',
                'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })
            if transform_rtmp_url:
                streamer, src = transform_rtmp_url(streamer, src)
                formats[-1].update({
                    'url': streamer,
                    'play_path': src,
                })
            continue

        src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
        src_url = src_url.strip()

        if proto == 'm3u8' or src_ext == 'm3u8':
            m3u8_formats = self._extract_m3u8_formats(
                src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
            # A single-variant playlist carries no metadata of its own, so
            # the attributes of the SMIL element are applied to it
            if len(m3u8_formats) == 1:
                m3u8_count += 1
                m3u8_formats[0].update({
                    'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'width': width,
                    'height': height,
                })
            formats.extend(m3u8_formats)
            continue

        if src_ext == 'f4m':
            f4m_url = src_url
            if not f4m_params:
                f4m_params = {
                    'hdcore': '3.2.0',
                    'plugin': 'flowplayer-3.2.0.1',
                }
            f4m_url += '&' if '?' in f4m_url else '?'
            f4m_url += compat_urllib_parse_urlencode(f4m_params)
            formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
            continue

        # Validate the resolved URL, i.e. the very URL stored in the format;
        # the raw src may be relative to base and is not what gets downloaded
        if src_url.startswith('http') and self._is_valid_url(src_url, video_id):
            http_count += 1
            formats.append({
                'url': src_url,
                'ext': ext or src_ext or 'flv',
                'format_id': 'http-%d' % (bitrate or http_count),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })
            continue

    return formats
1799
def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
    """Collect subtitles from the <textstream> elements of a SMIL tree.

    Returns a dict mapping a language to a list of {'url', 'ext'} dicts.
    Each distinct src is reported once. The language is read from the
    systemLanguage, systemLanguageName or lang attribute (in that order)
    and falls back to subtitles_lang.
    """
    seen_srcs = []
    subtitles = {}
    for textstream in smil.findall(self._xpath_ns('.//textstream', namespace)):
        src = textstream.get('src')
        if not src or src in seen_srcs:
            continue
        seen_srcs.append(src)
        ext = (
            textstream.get('ext')
            or mimetype2ext(textstream.get('type'))
            or determine_ext(src))
        lang = (
            textstream.get('systemLanguage')
            or textstream.get('systemLanguageName')
            or textstream.get('lang')
            or subtitles_lang)
        track = {
            'url': src,
            'ext': ext,
        }
        subtitles.setdefault(lang, []).append(track)
    return subtitles
1815
def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
    """Download an XSPF playlist and return the entries parsed from it.

    An empty list is returned when the download yields False.
    """
    xspf_doc = self._download_xml(
        xspf_url, playlist_id, 'Downloading xpsf playlist',
        'Unable to download xspf manifest', fatal=fatal)
    if xspf_doc is not False:
        return self._parse_xspf(
            xspf_doc, playlist_id, xspf_url=xspf_url,
            xspf_base_url=base_url(xspf_url))
    return []
1825
def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
    """Turn every track of an XSPF document into an entry dict.

    Each <location> of a track becomes one format whose URL is resolved
    against xspf_base_url; locations that do not yield a URL are
    skipped. All entries share playlist_id as their 'id', and it is
    also the default title of a track.
    """
    NS_MAP = {
        'xspf': 'http://xspf.org/ns/0/',
        's1': 'http://static.streamone.nl/player/ns/0',
    }

    def ns(path):
        # Expand the 'xspf:'/'s1:' prefixes using NS_MAP
        return xpath_with_ns(path, NS_MAP)

    entries = []
    for track in xspf_doc.findall(ns('./xspf:trackList/xspf:track')):
        title = xpath_text(track, ns('./xspf:title'), 'title', default=playlist_id)
        description = xpath_text(track, ns('./xspf:annotation'), 'description')
        thumbnail = xpath_text(track, ns('./xspf:image'), 'thumbnail')
        duration = float_or_none(
            xpath_text(track, ns('./xspf:duration'), 'duration'), 1000)

        formats = []
        for location in track.findall(ns('./xspf:location')):
            format_url = urljoin(xspf_base_url, location.text)
            if format_url:
                formats.append({
                    'url': format_url,
                    'manifest_url': xspf_url,
                    'format_id': location.get(ns('s1:label')),
                    'width': int_or_none(location.get(ns('s1:width'))),
                    'height': int_or_none(location.get(ns('s1:height'))),
                })
        self._sort_formats(formats)

        entries.append({
            'id': playlist_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
        })
    return entries
1866
def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
    """Download an MPD manifest and return the formats parsed from it.

    The base URL handed to the parser is computed from the URL reported
    by the response handle (urlh.geturl()), not from mpd_url itself.
    An empty list is returned when the download yields False.
    """
    download_note = note or 'Downloading MPD manifest'
    download_errnote = errnote or 'Failed to download MPD manifest'
    result = self._download_xml_handle(
        mpd_url, video_id,
        note=download_note, errnote=download_errnote, fatal=fatal)
    if result is False:
        return []
    mpd_doc, urlh = result

    return self._parse_mpd_formats(
        mpd_doc, mpd_id=mpd_id, mpd_base_url=base_url(urlh.geturl()),
        formats_dict=formats_dict, mpd_url=mpd_url)
1881
1882 def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1883 """
1884 Parse formats from MPD manifest.
1885 References:
1886 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1887 http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1888 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1889 """
1890 if mpd_doc.get('type') == 'dynamic':
1891 return []
1892
1893 namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1894
1895 def _add_ns(path):
1896 return self._xpath_ns(path, namespace)
1897
1898 def is_drm_protected(element):
1899 return element.find(_add_ns('ContentProtection')) is not None
1900
1901 def extract_multisegment_info(element, ms_parent_info):
1902 ms_info = ms_parent_info.copy()
1903
1904 # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
1905 # common attributes and elements. We will only extract relevant
1906 # for us.
1907 def extract_common(source):
1908 segment_timeline = source.find(_add_ns('SegmentTimeline'))
1909 if segment_timeline is not None:
1910 s_e = segment_timeline.findall(_add_ns('S'))
1911 if s_e:
1912 ms_info['total_number'] = 0
1913 ms_info['s'] = []
1914 for s in s_e:
1915 r = int(s.get('r', 0))
1916 ms_info['total_number'] += 1 + r
1917 ms_info['s'].append({
1918 't': int(s.get('t', 0)),
1919 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
1920 'd': int(s.attrib['d']),
1921 'r': r,
1922 })
1923 start_number = source.get('startNumber')
1924 if start_number:
1925 ms_info['start_number'] = int(start_number)
1926 timescale = source.get('timescale')
1927 if timescale:
1928 ms_info['timescale'] = int(timescale)
1929 segment_duration = source.get('duration')
1930 if segment_duration:
1931 ms_info['segment_duration'] = float(segment_duration)
1932
1933 def extract_Initialization(source):
1934 initialization = source.find(_add_ns('Initialization'))
1935 if initialization is not None:
1936 ms_info['initialization_url'] = initialization.attrib['sourceURL']
1937
1938 segment_list = element.find(_add_ns('SegmentList'))
1939 if segment_list is not None:
1940 extract_common(segment_list)
1941 extract_Initialization(segment_list)
1942 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1943 if segment_urls_e:
1944 ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1945 else:
1946 segment_template = element.find(_add_ns('SegmentTemplate'))
1947 if segment_template is not None:
1948 extract_common(segment_template)
1949 media = segment_template.get('media')
1950 if media:
1951 ms_info['media'] = media
1952 initialization = segment_template.get('initialization')
1953 if initialization:
1954 ms_info['initialization'] = initialization
1955 else:
1956 extract_Initialization(segment_template)
1957 return ms_info
1958
1959 mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1960 formats = []
1961 for period in mpd_doc.findall(_add_ns('Period')):
1962 period_duration = parse_duration(period.get('duration')) or mpd_duration
1963 period_ms_info = extract_multisegment_info(period, {
1964 'start_number': 1,
1965 'timescale': 1,
1966 })
1967 for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1968 if is_drm_protected(adaptation_set):
1969 continue
1970 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1971 for representation in adaptation_set.findall(_add_ns('Representation')):
1972 if is_drm_protected(representation):
1973 continue
1974 representation_attrib = adaptation_set.attrib.copy()
1975 representation_attrib.update(representation.attrib)
1976 # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
1977 mime_type = representation_attrib['mimeType']
1978 content_type = mime_type.split('/')[0]
1979 if content_type == 'text':
1980 # TODO implement WebVTT downloading
1981 pass
1982 elif content_type in ('video', 'audio'):
1983 base_url = ''
1984 for element in (representation, adaptation_set, period, mpd_doc):
1985 base_url_e = element.find(_add_ns('BaseURL'))
1986 if base_url_e is not None:
1987 base_url = base_url_e.text + base_url
1988 if re.match(r'^https?://', base_url):
1989 break
1990 if mpd_base_url and not re.match(r'^https?://', base_url):
1991 if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1992 mpd_base_url += '/'
1993 base_url = mpd_base_url + base_url
1994 representation_id = representation_attrib.get('id')
1995 lang = representation_attrib.get('lang')
1996 url_el = representation.find(_add_ns('BaseURL'))
1997 filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1998 bandwidth = int_or_none(representation_attrib.get('bandwidth'))
1999 f = {
2000 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
2001 'url': base_url,
2002 'manifest_url': mpd_url,
2003 'ext': mimetype2ext(mime_type),
2004 'width': int_or_none(representation_attrib.get('width')),
2005 'height': int_or_none(representation_attrib.get('height')),
2006 'tbr': float_or_none(bandwidth, 1000),
2007 'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2008 'fps': int_or_none(representation_attrib.get('frameRate')),
2009 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2010 'format_note': 'DASH %s' % content_type,
2011 'filesize': filesize,
2012 'container': mimetype2ext(mime_type) + '_dash',
2013 }
2014 f.update(parse_codecs(representation_attrib.get('codecs')))
2015 representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2016
2017 def prepare_template(template_name, identifiers):
2018 t = representation_ms_info[template_name]
2019 t = t.replace('$RepresentationID$', representation_id)
2020 t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2021 t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2022 t.replace('$$', '$')
2023 return t
2024
2025 # @initialization is a regular template like @media one
2026 # so it should be handled just the same way (see
2027 # https://github.com/rg3/youtube-dl/issues/11605)
2028 if 'initialization' in representation_ms_info:
2029 initialization_template = prepare_template(
2030 'initialization',
2031 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2032 # $Time$ shall not be included for @initialization thus
2033 # only $Bandwidth$ remains
2034 ('Bandwidth', ))
2035 representation_ms_info['initialization_url'] = initialization_template % {
2036 'Bandwidth': bandwidth,
2037 }
2038
2039 def location_key(location):
2040 return 'url' if re.match(r'^https?://', location) else 'path'
2041
2042 if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2043
2044 media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2045 media_location_key = location_key(media_template)
2046
2047 # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2048 # can't be used at the same time
2049 if '%(Number' in media_template and 's' not in representation_ms_info:
2050 segment_duration = None
2051 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2052 segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2053 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2054 representation_ms_info['fragments'] = [{
2055 media_location_key: media_template % {
2056 'Number': segment_number,
2057 'Bandwidth': bandwidth,
2058 },
2059 'duration': segment_duration,
2060 } for segment_number in range(
2061 representation_ms_info['start_number'],
2062 representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2063 else:
2064 # $Number*$ or $Time$ in media template with S list available
2065 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2066 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2067 representation_ms_info['fragments'] = []
2068 segment_time = 0
2069 segment_d = None
2070 segment_number = representation_ms_info['start_number']
2071
2072 def add_segment_url():
2073 segment_url = media_template % {
2074 'Time': segment_time,
2075 'Bandwidth': bandwidth,
2076 'Number': segment_number,
2077 }
2078 representation_ms_info['fragments'].append({
2079 media_location_key: segment_url,
2080 'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2081 })
2082
2083 for num, s in enumerate(representation_ms_info['s']):
2084 segment_time = s.get('t') or segment_time
2085 segment_d = s['d']
2086 add_segment_url()
2087 segment_number += 1
2088 for r in range(s.get('r', 0)):
2089 segment_time += segment_d
2090 add_segment_url()
2091 segment_number += 1
2092 segment_time += segment_d
2093 elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2094 # No media template
2095 # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2096 # or any YouTube dashsegments video
2097 fragments = []
2098 segment_index = 0
2099 timescale = representation_ms_info['timescale']
2100 for s in representation_ms_info['s']:
2101 duration = float_or_none(s['d'], timescale)
2102 for r in range(s.get('r', 0) + 1):
2103 segment_uri = representation_ms_info['segment_urls'][segment_index]
2104 fragments.append({
2105 location_key(segment_uri): segment_uri,
2106 'duration': duration,
2107 })
2108 segment_index += 1
2109 representation_ms_info['fragments'] = fragments
2110 elif 'segment_urls' in representation_ms_info:
2111 # Segment URLs with no SegmentTimeline
2112 # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2113 # https://github.com/rg3/youtube-dl/pull/14844
2114 fragments = []
2115 segment_duration = float_or_none(
2116 representation_ms_info['segment_duration'],
2117 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2118 for segment_url in representation_ms_info['segment_urls']:
2119 fragment = {
2120 location_key(segment_url): segment_url,
2121 }
2122 if segment_duration:
2123 fragment['duration'] = segment_duration
2124 fragments.append(fragment)
2125 representation_ms_info['fragments'] = fragments
2126 # NB: MPD manifest may contain direct URLs to unfragmented media.
2127 # No fragments key is present in this case.
2128 if 'fragments' in representation_ms_info:
2129 f.update({
2130 'fragment_base_url': base_url,
2131 'fragments': [],
2132 'protocol': 'http_dash_segments',
2133 })
2134 if 'initialization_url' in representation_ms_info:
2135 initialization_url = representation_ms_info['initialization_url']
2136 if not f.get('url'):
2137 f['url'] = initialization_url
2138 f['fragments'].append({location_key(initialization_url): initialization_url})
2139 f['fragments'].extend(representation_ms_info['fragments'])
2140 # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
2141 # is not necessarily unique within a Period thus formats with
2142 # the same `format_id` are quite possible. There are numerous examples
2143 # of such manifests (see https://github.com/rg3/youtube-dl/issues/15111,
2144 # https://github.com/rg3/youtube-dl/issues/13919)
2145 full_info = formats_dict.get(representation_id, {}).copy()
2146 full_info.update(f)
2147 formats.append(full_info)
2148 else:
2149 self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2150 return formats
2151
def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
    """Download an ISM manifest and return the formats parsed from it.

    The parser receives the URL reported by the response handle
    (urlh.geturl()) rather than ism_url. An empty list is returned when
    the download yields False.
    """
    download_note = note or 'Downloading ISM manifest'
    download_errnote = errnote or 'Failed to download ISM manifest'
    result = self._download_xml_handle(
        ism_url, video_id,
        note=download_note, errnote=download_errnote, fatal=fatal)
    if result is False:
        return []
    ism_doc, urlh = result

    return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
2163
2164 def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2165 """
2166 Parse formats from ISM manifest.
2167 References:
2168 1. [MS-SSTR]: Smooth Streaming Protocol,
2169 https://msdn.microsoft.com/en-us/library/ff469518.aspx
2170 """
2171 if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
2172 return []
2173
2174 duration = int(ism_doc.attrib['Duration'])
2175 timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2176
2177 formats = []
2178 for stream in ism_doc.findall('StreamIndex'):
2179 stream_type = stream.get('Type')
2180 if stream_type not in ('video', 'audio'):
2181 continue
2182 url_pattern = stream.attrib['Url']
2183 stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2184 stream_name = stream.get('Name')
2185 for track in stream.findall('QualityLevel'):
2186 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
2187 # TODO: add support for WVC1 and WMAP
2188 if fourcc not in ('H264', 'AVC1', 'AACL'):
2189 self.report_warning('%s is not a supported codec' % fourcc)
2190 continue
2191 tbr = int(track.attrib['Bitrate']) // 1000
2192 # [1] does not mention Width and Height attributes. However,
2193 # they're often present while MaxWidth and MaxHeight are
2194 # missing, so should be used as fallbacks
2195 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2196 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2197 sampling_rate = int_or_none(track.get('SamplingRate'))
2198
2199 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2200 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2201
2202 fragments = []
2203 fragment_ctx = {
2204 'time': 0,
2205 }
2206 stream_fragments = stream.findall('c')
2207 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2208 fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2209 fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2210 fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2211 if not fragment_ctx['duration']:
2212 try:
2213 next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2214 except IndexError:
2215 next_fragment_time = duration
2216 fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2217 for _ in range(fragment_repeat):
2218 fragments.append({
2219 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2220 'duration': fragment_ctx['duration'] / stream_timescale,
2221 })
2222 fragment_ctx['time'] += fragment_ctx['duration']
2223
2224 format_id = []
2225 if ism_id:
2226 format_id.append(ism_id)
2227 if stream_name:
2228 format_id.append(stream_name)
2229 format_id.append(compat_str(tbr))
2230
2231 formats.append({
2232 'format_id': '-'.join(format_id),
2233 'url': ism_url,
2234 'manifest_url': ism_url,
2235 'ext': 'ismv' if stream_type == 'video' else 'isma',
2236 'width': width,
2237 'height': height,
2238 'tbr': tbr,
2239 'asr': sampling_rate,
2240 'vcodec': 'none' if stream_type == 'audio' else fourcc,
2241 'acodec': 'none' if stream_type == 'video' else fourcc,
2242 'protocol': 'ism',
2243 'fragments': fragments,
2244 '_download_params': {
2245 'duration': duration,
2246 'timescale': stream_timescale,
2247 'width': width or 0,
2248 'height': height or 0,
2249 'fourcc': fourcc,
2250 'codec_private_data': track.get('CodecPrivateData'),
2251 'sampling_rate': sampling_rate,
2252 'channels': int_or_none(track.get('Channels', 2)),
2253 'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2254 'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2255 },
2256 })
2257 return formats
2258
2259 def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
2260 def absolute_url(item_url):
2261 return urljoin(base_url, item_url)
2262
2263 def parse_content_type(content_type):
2264 if not content_type:
2265 return {}
2266 ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2267 if ctr:
2268 mimetype, codecs = ctr.groups()
2269 f = parse_codecs(codecs)
2270 f['ext'] = mimetype2ext(mimetype)
2271 return f
2272 return {}
2273
2274 def _media_formats(src, cur_media_type, type_info={}):
2275 full_url = absolute_url(src)
2276 ext = type_info.get('ext') or determine_ext(full_url)
2277 if ext == 'm3u8':
2278 is_plain_url = False
2279 formats = self._extract_m3u8_formats(
2280 full_url, video_id, ext='mp4',
2281 entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2282 preference=preference, fatal=False)
2283 elif ext == 'mpd':
2284 is_plain_url = False
2285 formats = self._extract_mpd_formats(
2286 full_url, video_id, mpd_id=mpd_id, fatal=False)
2287 else:
2288 is_plain_url = True
2289 formats = [{
2290 'url': full_url,
2291 'vcodec': 'none' if cur_media_type == 'audio' else None,
2292 }]
2293 return is_plain_url, formats
2294
2295 entries = []
2296 # amp-video and amp-audio are very similar to their HTML5 counterparts
2297 # so we wll include them right here (see
2298 # https://www.ampproject.org/docs/reference/components/amp-video)
2299 media_tags = [(media_tag, media_type, '')
2300 for media_tag, media_type
2301 in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
2302 media_tags.extend(re.findall(
2303 # We only allow video|audio followed by a whitespace or '>'.
2304 # Allowing more characters may end up in significant slow down (see
2305 # https://github.com/rg3/youtube-dl/issues/11979, example URL:
2306 # http://www.porntrex.com/maps/videositemap.xml).
2307 r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
2308 for media_tag, media_type, media_content in media_tags:
2309 media_info = {
2310 'formats': [],
2311 'subtitles': {},
2312 }
2313 media_attributes = extract_attributes(media_tag)
2314 src = media_attributes.get('src')
2315 if src:
2316 _, formats = _media_formats(src, media_type)
2317 media_info['formats'].extend(formats)
2318 media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
2319 if media_content:
2320 for source_tag in re.findall(r'<source[^>]+>', media_content):
2321 source_attributes = extract_attributes(source_tag)
2322 src = source_attributes.get('src')
2323 if not src:
2324 continue
2325 f = parse_content_type(source_attributes.get('type'))
2326 is_plain_url, formats = _media_formats(src, media_type, f)
2327 if is_plain_url:
2328 # res attribute is not standard but seen several times
2329 # in the wild
2330 f.update({
2331 'height': int_or_none(source_attributes.get('res')),
2332 'format_id': source_attributes.get('label'),
2333 })
2334 f.update(formats[0])
2335 media_info['formats'].append(f)
2336 else:
2337 media_info['formats'].extend(formats)
2338 for track_tag in re.findall(r'<track[^>]+>', media_content):
2339 track_attributes = extract_attributes(track_tag)
2340 kind = track_attributes.get('kind')
2341 if not kind or kind in ('subtitles', 'captions'):
2342 src = track_attributes.get('src')
2343 if not src:
2344 continue
2345 lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2346 media_info['subtitles'].setdefault(lang, []).append({
2347 'url': absolute_url(src),
2348 })
2349 if media_info['formats'] or media_info['subtitles']:
2350 entries.append(media_info)
2351 return entries
2352
def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
    """Extract both the HDS and the HLS formats of an Akamai hosted stream.

    manifest_url may point to either flavour; the URL of the other one
    is derived by swapping the '/i/' and '/z/' path markers and the
    manifest file name. hosts may provide replacement host names under
    the 'hds' and 'hls' keys. HDS formats are listed before HLS ones.
    """
    hdcore_sign = 'hdcore=3.7.0'
    host_re = r'(https?://)[^/]+'

    # HDS (f4m) manifest
    f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url)
    f4m_url = f4m_url.replace('/master.m3u8', '/manifest.f4m')
    hds_host = hosts.get('hds')
    if hds_host:
        f4m_url = re.sub(host_re, r'\1' + hds_host, f4m_url)
    if 'hdcore=' not in f4m_url:
        separator = '&' if '?' in f4m_url else '?'
        f4m_url += separator + hdcore_sign
    f4m_formats = self._extract_f4m_formats(
        f4m_url, video_id, f4m_id='hds', fatal=False)
    for f4m_format in f4m_formats:
        f4m_format.update({'extra_param_to_segment_url': hdcore_sign})

    formats = []
    formats.extend(f4m_formats)

    # HLS (m3u8) manifest
    m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url)
    m3u8_url = m3u8_url.replace('/manifest.f4m', '/master.m3u8')
    hls_host = hosts.get('hls')
    if hls_host:
        m3u8_url = re.sub(host_re, r'\1' + hls_host, m3u8_url)
    formats.extend(self._extract_m3u8_formats(
        m3u8_url, video_id, 'mp4', 'm3u8_native',
        m3u8_id='hls', fatal=False))
    return formats
2375
def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
    """Extract the formats a Wowza stream offers over its various protocols.

    The manifest file name is stripped from url and the remaining base
    is used to request the HLS, HDS and DASH manifests. For SMIL based
    URLs the formats of jwplayer.smil are listed, each one followed by
    an RTSP twin derived from it; otherwise plain rtmp and rtsp formats
    are appended. Protocols named in skip_protocols ('m3u8', 'f4m',
    'dash', 'smil', 'rtmp', 'rtsp') are left out.
    """
    query = compat_urlparse.urlparse(url).query
    url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
    mobj = re.search(
        r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
    url_base = mobj.group('url')
    # Keep the secure flavour (trailing 's') of the original scheme, if any
    http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)

    def manifest_url(manifest):
        result = '%s/%s' % (http_base_url, manifest)
        return result + '?%s' % query if query else result

    formats = []
    if 'm3u8' not in skip_protocols:
        formats.extend(self._extract_m3u8_formats(
            manifest_url('playlist.m3u8'), video_id, 'mp4',
            m3u8_entry_protocol, m3u8_id='hls', fatal=False))
    if 'f4m' not in skip_protocols:
        formats.extend(self._extract_f4m_formats(
            manifest_url('manifest.f4m'),
            video_id, f4m_id='hds', fatal=False))
    if 'dash' not in skip_protocols:
        formats.extend(self._extract_mpd_formats(
            manifest_url('manifest.mpd'),
            video_id, mpd_id='dash', fatal=False))

    if not re.search(r'(?:/smil:|\.smil)', url_base):
        for protocol in ('rtmp', 'rtsp'):
            if protocol in skip_protocols:
                continue
            formats.append({
                'url': '%s:%s' % (protocol, url_base),
                'format_id': protocol,
                'protocol': protocol,
            })
        return formats

    if 'smil' not in skip_protocols:
        rtmp_formats = self._extract_smil_formats(
            manifest_url('jwplayer.smil'),
            video_id, fatal=False)
        for rtmp_format in rtmp_formats:
            # The RTSP twin addresses the play path directly in its URL
            rtsp_format = rtmp_format.copy()
            rtsp_url = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
            for key in ('play_path', 'ext'):
                del rtsp_format[key]
            rtsp_format['url'] = rtsp_url.replace('rtmp://', 'rtsp://')
            rtsp_format['format_id'] = rtmp_format['format_id'].replace('rtmp', 'rtsp')
            rtsp_format['protocol'] = 'rtsp'
            formats.append(rtmp_format)
            formats.append(rtsp_format)
    return formats
2428
def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
    """Locate the options passed to jwplayer(...).setup(...) in a webpage.

    Returns the parsed options when they form a dict. None is returned
    when no setup call is found, when parsing raises ExtractorError or
    when the parsed value is not a dict.
    """
    mobj = re.search(
        r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
        webpage)
    if not mobj:
        return None
    try:
        options = self._parse_json(
            mobj.group('options'), video_id=video_id,
            transform_source=transform_source)
    except ExtractorError:
        return None
    return options if isinstance(options, dict) else None
2443
def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
    """Find the JWPlayer setup options in webpage and parse them.

    Extra positional and keyword arguments are forwarded to
    _parse_jwplayer_data.
    """
    setup_options = self._find_jwplayer_data(
        webpage, video_id, transform_source=js_to_json)
    return self._parse_jwplayer_data(
        setup_options, video_id, *args, **kwargs)
2449
def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                         m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
    """Convert JWPlayer setup options into an info dict or a playlist.

    A single playlist item is returned as a plain entry; any other
    number of items is wrapped with playlist_result. An item whose only
    format is a YouTube URL becomes a 'url_transparent' entry pointing
    to that URL. With require_title the 'title' key of every item is
    mandatory.

    Note that playlist items lacking 'sources' (and a non-list
    'playlist' value) are normalised in place on the passed data.
    """
    # JWPlayer backward compatibility: flattened playlists
    # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
    if 'playlist' not in jwplayer_data:
        jwplayer_data = {'playlist': [jwplayer_data]}

    # JWPlayer backward compatibility: single playlist item
    # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
    if not isinstance(jwplayer_data['playlist'], list):
        jwplayer_data['playlist'] = [jwplayer_data['playlist']]

    entries = []
    for video_data in jwplayer_data['playlist']:
        # JWPlayer backward compatibility: flattened sources
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
        if 'sources' not in video_data:
            video_data['sources'] = [video_data]

        this_video_id = video_id or video_data['mediaid']

        formats = self._parse_jwplayer_formats(
            video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
            mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

        subtitles = {}
        tracks = video_data.get('tracks')
        if tracks and isinstance(tracks, list):
            for track in tracks:
                if not isinstance(track, dict):
                    continue
                track_kind = track.get('kind')
                is_subtitle_track = (
                    track_kind
                    and isinstance(track_kind, compat_str)
                    and track_kind.lower() in ('captions', 'subtitles'))
                if not is_subtitle_track:
                    continue
                track_url = urljoin(base_url, track.get('file'))
                if not track_url:
                    continue
                lang = track.get('label') or 'en'
                subtitles.setdefault(lang, []).append({
                    'url': self._proto_relative_url(track_url)
                })

        if require_title:
            raw_title = video_data['title']
        else:
            raw_title = video_data.get('title')
        entry = {
            'id': this_video_id,
            'title': unescapeHTML(raw_title),
            'description': video_data.get('description'),
            'thumbnail': self._proto_relative_url(video_data.get('image')),
            'timestamp': int_or_none(video_data.get('pubdate')),
            'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
            'subtitles': subtitles,
        }
        # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
        is_youtube = len(formats) == 1 and re.search(
            r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url'])
        if is_youtube:
            entry.update({
                '_type': 'url_transparent',
                'url': formats[0]['url'],
            })
        else:
            self._sort_formats(formats)
            entry['formats'] = formats
        entries.append(entry)

    if len(entries) == 1:
        return entries[0]
    return self.playlist_result(entries)
2517
def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                            m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
    """Build a list of format dicts from JWPlayer 'sources' entries.

    Non-dict sources, sources without a 'file' and repeated URLs are
    skipped. HLS, DASH and SMIL sources are expanded through the
    corresponding manifest extractors (non-fatally); audio sources get
    'vcodec': 'none'; everything else becomes a plain format, with extra
    handling for rtmp URLs (to which rtmp_params is applied when given).
    """
    seen_urls = set()
    formats = []
    for source in jwplayer_sources_data:
        if not isinstance(source, dict):
            continue
        source_url = self._proto_relative_url(source.get('file'))
        if not source_url:
            continue
        if base_url:
            source_url = compat_urlparse.urljoin(base_url, source_url)
        if source_url in seen_urls:
            continue
        seen_urls.add(source_url)

        source_type = source.get('type') or ''
        ext = mimetype2ext(source_type) or determine_ext(source_url)

        if source_type == 'hls' or ext == 'm3u8':
            formats.extend(self._extract_m3u8_formats(
                source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                m3u8_id=m3u8_id, fatal=False))
            continue

        if source_type == 'dash' or ext == 'mpd':
            formats.extend(self._extract_mpd_formats(
                source_url, video_id, mpd_id=mpd_id, fatal=False))
            continue

        if ext == 'smil':
            formats.extend(self._extract_smil_formats(
                source_url, video_id, fatal=False))
            continue

        # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
        if source_type.startswith('audio') or ext in (
                'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
            formats.append({
                'url': source_url,
                'vcodec': 'none',
                'ext': ext,
            })
            continue

        height = int_or_none(source.get('height'))
        if height is None:
            # Often no height is provided but there is a label in
            # format like "1080p", "720p SD", or 1080.
            label = compat_str(source.get('label') or '')
            height = int_or_none(self._search_regex(
                r'^(\d{3,4})[pP]?(?:\b|$)', label,
                'height', default=None))

        fmt = {
            'url': source_url,
            'width': int_or_none(source.get('width')),
            'height': height,
            'tbr': int_or_none(source.get('bitrate')),
            'ext': ext,
        }
        if source_url.startswith('rtmp'):
            fmt['ext'] = 'flv'
            # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
            # of jwplayer.flash.swf
            pieces = re.split(
                r'((?:mp4|mp3|flv):)', source_url, 1)
            if len(pieces) == 3:
                rtmp_url, prefix, play_path = pieces
                fmt.update({
                    'url': rtmp_url,
                    'play_path': prefix + play_path,
                })
            if rtmp_params:
                fmt.update(rtmp_params)
        formats.append(fmt)
    return formats
2584
2585 def _live_title(self, name):
2586 """ Generate the title for a live video """
2587 now = datetime.datetime.now()
2588 now_str = now.strftime('%Y-%m-%d %H:%M')
2589 return name + ' ' + now_str
2590
def _int(self, v, name, fatal=False, **kwargs):
    """Convert v to an int with int_or_none, reporting failures.

    name is only used in the failure message; extra keyword arguments are
    forwarded to int_or_none.

    Returns the converted value. When conversion yields None, raises
    ExtractorError if fatal is true, otherwise reports a warning through
    the downloader and returns None.
    """
    # A leftover debug print of getattr(v, kwargs['get_attr']) used to live
    # here: it wrote to stdout and raised AttributeError whenever v was None
    # or lacked the attribute, defeating the graceful handling below.
    res = int_or_none(v, **kwargs)
    if res is None:
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
        if fatal:
            raise ExtractorError(msg)
        else:
            self._downloader.report_warning(msg)
    return res
2602
def _float(self, v, name, fatal=False, **kwargs):
    """Convert v to a float with float_or_none, reporting failures.

    name is only used in the failure message; extra keyword arguments are
    forwarded to float_or_none. When conversion yields None, raises
    ExtractorError if fatal is true, otherwise reports a warning through
    the downloader and returns None.
    """
    res = float_or_none(v, **kwargs)
    if res is not None:
        return res
    msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
    if fatal:
        raise ExtractorError(msg)
    self._downloader.report_warning(msg)
    return res
2612
def _set_cookie(self, domain, name, value, expire_time=None, port=None,
                path='/', secure=False, discard=False, rest={}, **kwargs):
    """Create a cookie and store it in the downloader's cookie jar.

    The port counts as specified only when it is not None, and the domain
    is flagged as having an initial dot when it starts with '.'. rest is
    passed through to the cookie untouched; additional keyword arguments
    are accepted but not used here.
    """
    port_specified = port is not None
    domain_initial_dot = domain.startswith('.')
    new_cookie = compat_cookiejar.Cookie(
        0, name, value, port, port_specified, domain, True,
        domain_initial_dot, path, True, secure, expire_time,
        discard, None, None, rest)
    self._downloader.cookiejar.set_cookie(new_cookie)
2620
def _get_cookies(self, url):
    """Return a compat_cookies.SimpleCookie with the cookies for the url.

    A throwaway request for url is built so the downloader's cookie jar can
    attach its 'Cookie' header, which is then parsed.
    """
    request = sanitized_Request(url)
    self._downloader.cookiejar.add_cookie_header(request)
    cookie_header = request.get_header('Cookie')
    return compat_cookies.SimpleCookie(cookie_header)
2626
2627 def get_testcases(self, include_onlymatching=False):
2628 t = getattr(self, '_TEST', None)
2629 if t:
2630 assert not hasattr(self, '_TESTS'), \
2631 '%s has _TEST and _TESTS' % type(self).__name__
2632 tests = [t]
2633 else:
2634 tests = getattr(self, '_TESTS', [])
2635 for t in tests:
2636 if not include_onlymatching and t.get('only_matching', False):
2637 continue
2638 t['name'] = type(self).__name__[:-len('IE')]
2639 yield t
2640
2641 def is_suitable(self, age_limit):
2642 """ Test whether the extractor is generally suitable for the given
2643 age limit (i.e. pornographic sites are not, all others usually are) """
2644
2645 any_restricted = False
2646 for tc in self.get_testcases(include_onlymatching=False):
2647 if tc.get('playlist', []):
2648 tc = tc['playlist'][0]
2649 is_restricted = age_restricted(
2650 tc.get('info_dict', {}).get('age_limit'), age_limit)
2651 if not is_restricted:
2652 return True
2653 any_restricted = any_restricted or is_restricted
2654 return not any_restricted
2655
2656 def extract_subtitles(self, *args, **kwargs):
2657 if (self._downloader.params.get('writesubtitles', False) or
2658 self._downloader.params.get('listsubtitles')):
2659 return self._get_subtitles(*args, **kwargs)
2660 return {}
2661
2662 def _get_subtitles(self, *args, **kwargs):
2663 raise NotImplementedError('This method must be implemented by subclasses')
2664
2665 @staticmethod
2666 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2667 """ Merge subtitle items for one language. Items with duplicated URLs
2668 will be dropped. """
2669 list1_urls = set([item['url'] for item in subtitle_list1])
2670 ret = list(subtitle_list1)
2671 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2672 return ret
2673
2674 @classmethod
2675 def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2676 """ Merge two subtitle dictionaries, language by language. """
2677 ret = dict(subtitle_dict1)
2678 for lang in subtitle_dict2:
2679 ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2680 return ret
2681
2682 def extract_automatic_captions(self, *args, **kwargs):
2683 if (self._downloader.params.get('writeautomaticsub', False) or
2684 self._downloader.params.get('listsubtitles')):
2685 return self._get_automatic_captions(*args, **kwargs)
2686 return {}
2687
2688 def _get_automatic_captions(self, *args, **kwargs):
2689 raise NotImplementedError('This method must be implemented by subclasses')
2690
2691 def mark_watched(self, *args, **kwargs):
2692 if (self._downloader.params.get('mark_watched', False) and
2693 (self._get_login_info()[0] is not None or
2694 self._downloader.params.get('cookiefile') is not None)):
2695 self._mark_watched(*args, **kwargs)
2696
2697 def _mark_watched(self, *args, **kwargs):
2698 raise NotImplementedError('This method must be implemented by subclasses')
2699
2700 def geo_verification_headers(self):
2701 headers = {}
2702 geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2703 if geo_verification_proxy:
2704 headers['Ytdl-request-proxy'] = geo_verification_proxy
2705 return headers
2706
def _generic_id(self, url):
    """Derive an id from a URL.

    Takes the last '/'-separated segment of url (trailing slashes ignored),
    drops its file extension and URL-unquotes the remainder.
    """
    last_segment = url.rstrip('/').split('/')[-1]
    stem = os.path.splitext(last_segment)[0]
    return compat_urllib_parse_unquote(stem)
2709
def _generic_title(self, url):
    """Derive a title from a URL: its url_basename without the file extension, URL-unquoted."""
    basename = url_basename(url)
    stem = os.path.splitext(basename)[0]
    return compat_urllib_parse_unquote(stem)
2712
2713
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex source matching this extractor's search pseudo-URLs."""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search pseudo-URL for this extractor."""
        pattern = cls._make_valid_url()
        return re.match(pattern, url) is not None

    def _real_extract(self, query):
        """Parse the search pseudo-URL and fetch the requested number of results.

        An empty prefix asks for one result, 'all' for _MAX_RESULTS, and a
        number for that many results (capped at _MAX_RESULTS with a
        warning). Raises ExtractorError for an unparsable query or a
        non-positive count.
        """
        parsed = re.match(self._make_valid_url(), query)
        if parsed is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = parsed.group('prefix')
        query = parsed.group('query')

        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """Read-only access to the class's _SEARCH_KEY."""
        return self._SEARCH_KEY