2 from __future__
import unicode_literals
17 from ..compat
import (
20 compat_etree_fromstring
,
26 compat_urllib_parse_unquote
,
27 compat_urllib_parse_urlencode
,
28 compat_urllib_request
,
30 compat_xml_parse_error
,
32 from ..downloader
.f4m
import (
34 remove_encrypted_media
,
59 parse_m3u8_attributes
,
76 class InfoExtractor(object):
77 """Information Extractor class.
79 Information extractors are the classes that, given a URL, extract
80 information about the video (or videos) the URL refers to. This
81 information includes the real video URL, the video title, author and
82 others. The information is stored in a dictionary which is then
83 passed to the YoutubeDL. The YoutubeDL processes this
84 information possibly downloading the video to the file system, among
85 other possible outcomes.
87 The type field determines the type of the result.
88 By far the most common value (and the default if _type is missing) is
89 "video", which indicates a single video.
91 For a video, the dictionaries must include the following fields:
94 title: Video title, unescaped.
96 Additionally, it must contain either a formats entry or a url one:
98 formats: A list of dictionaries for each format available, ordered
99 from worst to best quality.
102 * url Mandatory. The URL of the video file
104 The URL of the manifest file in case of
105 fragmented media (DASH, hls, hds)
106 * ext Will be calculated from URL if missing
107 * format A human-readable description of the format
108 ("mp4 container with h264/opus").
109 Calculated from the format_id, width, height.
110 and format_note fields if missing.
111 * format_id A short description of the format
112 ("mp4_h264_opus" or "19").
113 Technically optional, but strongly recommended.
114 * format_note Additional info about the format
115 ("3D" or "DASH video")
116 * width Width of the video, if known
117 * height Height of the video, if known
118 * resolution Textual description of width and height
119 * tbr Average bitrate of audio and video in KBit/s
120 * abr Average audio bitrate in KBit/s
121 * acodec Name of the audio codec in use
122 * asr Audio sampling rate in Hertz
123 * vbr Average video bitrate in KBit/s
125 * vcodec Name of the video codec in use
126 * container Name of the container format
127 * filesize The number of bytes, if known in advance
128 * filesize_approx An estimate for the number of bytes
129 * player_url SWF Player URL (used for rtmpdump).
130 * protocol The protocol that will be used for the actual
131 download, lower-case.
132 "http", "https", "rtsp", "rtmp", "rtmpe",
133 "m3u8", "m3u8_native" or "http_dash_segments".
135 Base URL for fragments. Each fragment's path
136 value (if present) will be relative to
138 * fragments A list of fragments of a fragmented media.
139 Each fragment entry must contain either an url
140 or a path. If an url is present it should be
141 considered by a client. Otherwise both path and
142 fragment_base_url must be present. Here is
143 the list of all potential fields:
144 * "url" - fragment's URL
145 * "path" - fragment's path relative to
147 * "duration" (optional, int or float)
148 * "filesize" (optional, int)
149 * preference Order number of this format. If this field is
150 present and not None, the formats get sorted
151 by this field, regardless of all other values.
152 -1 for default (order by other properties),
153 -2 or smaller for less than default.
154 < -1000 to hide the format (if there is
155 another one which is strictly better)
156 * language Language code, e.g. "de" or "en-US".
157 * language_preference Is this in the language mentioned in
159 10 if it's what the URL is about,
160 -1 for default (don't know),
161 -10 otherwise, other values reserved for now.
162 * quality Order number of the video quality of this
163 format, irrespective of the file format.
164 -1 for default (order by other properties),
165 -2 or smaller for less than default.
166 * source_preference Order number for this video source
167 (quality takes higher priority)
168 -1 for default (order by other properties),
169 -2 or smaller for less than default.
170 * http_headers A dictionary of additional HTTP headers
171 to add to the request.
172 * stretched_ratio If given and not 1, indicates that the
173 video's pixels are not square.
174 width : height ratio as float.
175 * no_resume The server does not support resuming the
176 (HTTP or RTMP) download. Boolean.
177 * downloader_options A dictionary of downloader options as
178 described in FileDownloader
180 url: Final video URL.
181 ext: Video filename extension.
182 format: The video format, defaults to ext (used for --get-format)
183 player_url: SWF Player URL (used for rtmpdump).
185 The following fields are optional:
187 alt_title: A secondary title of the video.
188 display_id An alternative identifier for the video, not necessarily
189 unique, but available before title. Typically, id is
190 something like "4234987", title "Dancing naked mole rats",
191 and display_id "dancing-naked-mole-rats"
192 thumbnails: A list of dictionaries, with the following entries:
193 * "id" (optional, string) - Thumbnail format ID
195 * "preference" (optional, int) - quality of the image
196 * "width" (optional, int)
197 * "height" (optional, int)
198 * "resolution" (optional, string "{width}x{height"},
200 * "filesize" (optional, int)
201 thumbnail: Full URL to a video thumbnail image.
202 description: Full video description.
203 uploader: Full name of the video uploader.
204 license: License name the video is licensed under.
205 creator: The creator of the video.
206 release_date: The date (YYYYMMDD) when the video was released.
207 timestamp: UNIX timestamp of the moment the video became available.
208 upload_date: Video upload date (YYYYMMDD).
209 If not explicitly set, calculated from timestamp.
210 uploader_id: Nickname or id of the video uploader.
211 uploader_url: Full URL to a personal webpage of the video uploader.
212 location: Physical location where the video was filmed.
213 subtitles: The available subtitles as a dictionary in the format
214 {tag: subformats}. "tag" is usually a language code, and
215 "subformats" is a list sorted from lower to higher
216 preference, each element is a dictionary with the "ext"
218 * "data": The subtitles file contents
219 * "url": A URL pointing to the subtitles file
220 "ext" will be calculated from URL if missing
221 automatic_captions: Like 'subtitles', used by the YoutubeIE for
222 automatically generated captions
223 duration: Length of the video in seconds, as an integer or float.
224 view_count: How many users have watched the video on the platform.
225 like_count: Number of positive ratings of the video
226 dislike_count: Number of negative ratings of the video
227 repost_count: Number of reposts of the video
228     average_rating: Average rating given by users, the scale used depends on the webpage
229 comment_count: Number of comments on the video
230 comments: A list of comments, each with one or more of the following
231 properties (all but one of text or html optional):
232 * "author" - human-readable name of the comment author
233 * "author_id" - user ID of the comment author
235 * "html" - Comment as HTML
236 * "text" - Plain text of the comment
237 * "timestamp" - UNIX timestamp of comment
238 * "parent" - ID of the comment this one is replying to.
239 Set to "root" to indicate that this is a
240 comment to the original video.
241 age_limit: Age restriction for the video, as an integer (years)
242 webpage_url: The URL to the video webpage, if given to youtube-dl it
243 should allow to get the same result again. (It will be set
244 by YoutubeDL if it's missing)
245 categories: A list of categories that the video falls in, for example
247 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
248 is_live: True, False, or None (=unknown). Whether this video is a
249 live stream that goes on instead of a fixed-length video.
250 start_time: Time in seconds where the reproduction should start, as
251 specified in the URL.
252 end_time: Time in seconds where the reproduction should end, as
253 specified in the URL.
254 chapters: A list of dictionaries, with the following entries:
255 * "start_time" - The start time of the chapter in seconds
256 * "end_time" - The end time of the chapter in seconds
257 * "title" (optional, string)
259 The following fields should only be used when the video belongs to some logical
262 chapter: Name or title of the chapter the video belongs to.
263 chapter_number: Number of the chapter the video belongs to, as an integer.
264 chapter_id: Id of the chapter the video belongs to, as a unicode string.
266 The following fields should only be used when the video is an episode of some
267 series, programme or podcast:
269 series: Title of the series or programme the video episode belongs to.
270 season: Title of the season the video episode belongs to.
271 season_number: Number of the season the video episode belongs to, as an integer.
272 season_id: Id of the season the video episode belongs to, as a unicode string.
273 episode: Title of the video episode. Unlike mandatory video title field,
274 this field should denote the exact title of the video episode
275 without any kind of decoration.
276 episode_number: Number of the video episode within a season, as an integer.
277 episode_id: Id of the video episode, as a unicode string.
279 The following fields should only be used when the media is a track or a part of
282 track: Title of the track.
283 track_number: Number of the track within an album or a disc, as an integer.
284 track_id: Id of the track (useful in case of custom indexing, e.g. 6.iii),
286 artist: Artist(s) of the track.
287 genre: Genre(s) of the track.
288 album: Title of the album the track belongs to.
289 album_type: Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
290 album_artist: List of all artists appeared on the album (e.g.
291 "Ash Borer / Fell Voices" or "Various Artists", useful for splits
293 disc_number: Number of the disc or other physical medium the track belongs to,
295 release_year: Year (YYYY) when the album was released.
297 Unless mentioned otherwise, the fields should be Unicode strings.
299 Unless mentioned otherwise, None is equivalent to absence of information.
302 _type "playlist" indicates multiple videos.
303 There must be a key "entries", which is a list, an iterable, or a PagedList
304 object, each element of which is a valid dictionary by this specification.
306 Additionally, playlists can have "id", "title", "description", "uploader",
307 "uploader_id", "uploader_url" attributes with the same semantics as videos
311 _type "multi_video" indicates that there are multiple videos that
312 form a single show, for examples multiple acts of an opera or TV episode.
313 It must have an entries key like a playlist and contain all the keys
314 required for a video at the same time.
317 _type "url" indicates that the video must be extracted from another
318 location, possibly by a different extractor. Its only required key is:
319 "url" - the next URL to extract.
320 The key "ie_key" can be set to the class name (minus the trailing "IE",
321 e.g. "Youtube") if the extractor class is known in advance.
322 Additionally, the dictionary may have any properties of the resolved entity
323 known in advance, for example "title" if the title of the referred video is
327 _type "url_transparent" entities have the same specification as "url", but
328 indicate that the given additional information is more precise than the one
329 associated with the resolved URL.
330 This is useful when a site employs a video service that hosts the video and
331 its technical metadata, but that video service does not embed a useful
332 title, description etc.
335 Subclasses of this one should re-define the _real_initialize() and
336 _real_extract() methods and define a _VALID_URL regexp.
337 Probably, they should also be added to the list of extractors.
339 _GEO_BYPASS attribute may be set to False in order to disable
340 geo restriction bypass mechanisms for a particular extractor.
341 Though it won't disable explicit geo restriction bypass based on
342 country code provided with geo_bypass_country.
344 _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
345 countries for this extractor. One of these countries will be used by
346 geo restriction bypass mechanism right away in order to bypass
347 geo restriction, of course, if the mechanism is not disabled.
349 _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
350 IP blocks in CIDR notation for this extractor. One of these IP blocks
351 will be used by geo restriction bypass mechanism similarly
354 Finally, the _WORKING attribute should be set to False for broken IEs
355 in order to warn the users and skip the tests.
    # Fake source IP passed via the X-Forwarded-For header once the geo
    # bypass mechanism is active; None means no faking.
    _x_forwarded_for_ip = None
    # Presumably geo-unrestricted country codes for this extractor (or None);
    # consumed by _initialize_geo_bypass via initialize().
    _GEO_COUNTRIES = None
    # Presumably geo-unrestricted IP blocks in CIDR notation (or None).
    _GEO_IP_BLOCKS = None
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # No fake X-Forwarded-For IP until geo bypass is initialized.
        self._x_forwarded_for_ip = None
        self.set_downloader(downloader)
373 def suitable(cls
, url
):
374 """Receives a URL and returns True if suitable for this IE."""
376 # This does not use has/getattr intentionally - we want to know whether
377 # we have cached the regexp for *this* class, whereas getattr would also
378 # match the superclass
379 if '_VALID_URL_RE' not in cls
.__dict
__:
380 cls
._VALID
_URL
_RE
= re
.compile(cls
._VALID
_URL
)
381 return cls
._VALID
_URL
_RE
.match(url
) is not None
384 def _match_id(cls
, url
):
385 if '_VALID_URL_RE' not in cls
.__dict
__:
386 cls
._VALID
_URL
_RE
= re
.compile(cls
._VALID
_URL
)
387 m
= cls
._VALID
_URL
_RE
.match(url
)
389 return compat_str(m
.group('id'))
393 """Getter method for _WORKING."""
396 def initialize(self
):
397 """Initializes an instance (authentication, etc)."""
398 self
._initialize
_geo
_bypass
({
399 'countries': self
._GEO
_COUNTRIES
,
400 'ip_blocks': self
._GEO
_IP
_BLOCKS
,
403 self
._real
_initialize
()
406 def _initialize_geo_bypass(self
, geo_bypass_context
):
408 Initialize geo restriction bypass mechanism.
410 This method is used to initialize geo bypass mechanism based on faking
411 X-Forwarded-For HTTP header. A random country from provided country list
412 is selected and a random IP belonging to this country is generated. This
413 IP will be passed as X-Forwarded-For HTTP header in all subsequent
416 This method will be used for initial geo bypass mechanism initialization
417 during the instance initialization with _GEO_COUNTRIES and
420 You may also manually call it from extractor's code if geo bypass
421 information is not available beforehand (e.g. obtained during
422 extraction) or due to some other reason. In this case you should pass
423 this information in geo bypass context passed as first argument. It may
424 contain following fields:
426 countries: List of geo unrestricted countries (similar
428 ip_blocks: List of geo unrestricted IP blocks in CIDR notation
429 (similar to _GEO_IP_BLOCKS)
432 if not self
._x
_forwarded
_for
_ip
:
434 # Geo bypass mechanism is explicitly disabled by user
435 if not self
._downloader
.params
.get('geo_bypass', True):
438 if not geo_bypass_context
:
439 geo_bypass_context
= {}
441 # Backward compatibility: previously _initialize_geo_bypass
442 # expected a list of countries, some 3rd party code may still use
444 if isinstance(geo_bypass_context
, (list, tuple)):
445 geo_bypass_context
= {
446 'countries': geo_bypass_context
,
449 # The whole point of geo bypass mechanism is to fake IP
450 # as X-Forwarded-For HTTP header based on some IP block or
453 # Path 1: bypassing based on IP block in CIDR notation
455 # Explicit IP block specified by user, use it right away
456 # regardless of whether extractor is geo bypassable or not
457 ip_block
= self
._downloader
.params
.get('geo_bypass_ip_block', None)
459 # Otherwise use random IP block from geo bypass context but only
460 # if extractor is known as geo bypassable
462 ip_blocks
= geo_bypass_context
.get('ip_blocks')
463 if self
._GEO
_BYPASS
and ip_blocks
:
464 ip_block
= random
.choice(ip_blocks
)
467 self
._x
_forwarded
_for
_ip
= GeoUtils
.random_ipv4(ip_block
)
468 if self
._downloader
.params
.get('verbose', False):
469 self
._downloader
.to_screen(
470 '[debug] Using fake IP %s as X-Forwarded-For.'
471 % self
._x
_forwarded
_for
_ip
)
474 # Path 2: bypassing based on country code
476 # Explicit country code specified by user, use it right away
477 # regardless of whether extractor is geo bypassable or not
478 country
= self
._downloader
.params
.get('geo_bypass_country', None)
480 # Otherwise use random country code from geo bypass context but
481 # only if extractor is known as geo bypassable
483 countries
= geo_bypass_context
.get('countries')
484 if self
._GEO
_BYPASS
and countries
:
485 country
= random
.choice(countries
)
488 self
._x
_forwarded
_for
_ip
= GeoUtils
.random_ipv4(country
)
489 if self
._downloader
.params
.get('verbose', False):
490 self
._downloader
.to_screen(
491 '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
492 % (self
._x
_forwarded
_for
_ip
, country
.upper()))
494 def extract(self
, url
):
495 """Extracts URL information and returns it in list of dicts."""
500 ie_result
= self
._real
_extract
(url
)
501 if self
._x
_forwarded
_for
_ip
:
502 ie_result
['__x_forwarded_for_ip'] = self
._x
_forwarded
_for
_ip
504 except GeoRestrictedError
as e
:
505 if self
.__maybe
_fake
_ip
_and
_retry
(e
.countries
):
508 except ExtractorError
:
510 except compat_http_client
.IncompleteRead
as e
:
511 raise ExtractorError('A network error has occurred.', cause
=e
, expected
=True)
512 except (KeyError, StopIteration) as e
:
513 raise ExtractorError('An extractor error has occurred.', cause
=e
)
515 def __maybe_fake_ip_and_retry(self
, countries
):
516 if (not self
._downloader
.params
.get('geo_bypass_country', None) and
518 self
._downloader
.params
.get('geo_bypass', True) and
519 not self
._x
_forwarded
_for
_ip
and
521 country_code
= random
.choice(countries
)
522 self
._x
_forwarded
_for
_ip
= GeoUtils
.random_ipv4(country_code
)
523 if self
._x
_forwarded
_for
_ip
:
525 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
526 % (self
._x
_forwarded
_for
_ip
, country_code
.upper()))
    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        # The owning YoutubeDL instance; helper methods rely on its params,
        # urlopen, to_screen and report_warning facilities.
        self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # Intentionally a no-op in the base class.
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # Intentionally a no-op in the base class.
544 """A string for getting the InfoExtractor with get_info_extractor"""
545 return compat_str(cls
.__name
__[:-2])
549 return compat_str(type(self
).__name
__[:-2])
551 def _request_webpage(self
, url_or_request
, video_id
, note
=None, errnote
=None, fatal
=True, data
=None, headers
={}, query
={}):
552 """ Returns the response handle """
554 self
.report_download_webpage(video_id
)
555 elif note
is not False:
557 self
.to_screen('%s' % (note
,))
559 self
.to_screen('%s: %s' % (video_id
, note
))
561 # Some sites check X-Forwarded-For HTTP header in order to figure out
562 # the origin of the client behind proxy. This allows bypassing geo
563 # restriction by faking this header's value to IP that belongs to some
564 # geo unrestricted country. We will do so once we encounter any
565 # geo restriction error.
566 if self
._x
_forwarded
_for
_ip
:
567 if 'X-Forwarded-For' not in headers
:
568 headers
['X-Forwarded-For'] = self
._x
_forwarded
_for
_ip
570 if isinstance(url_or_request
, compat_urllib_request
.Request
):
571 url_or_request
= update_Request(
572 url_or_request
, data
=data
, headers
=headers
, query
=query
)
575 url_or_request
= update_url_query(url_or_request
, query
)
576 if data
is not None or headers
:
577 url_or_request
= sanitized_Request(url_or_request
, data
, headers
)
579 return self
._downloader
.urlopen(url_or_request
)
580 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
584 errnote
= 'Unable to download webpage'
586 errmsg
= '%s: %s' % (errnote
, error_to_compat_str(err
))
588 raise ExtractorError(errmsg
, sys
.exc_info()[2], cause
=err
)
590 self
._downloader
.report_warning(errmsg
)
593 def _download_webpage_handle(self
, url_or_request
, video_id
, note
=None, errnote
=None, fatal
=True, encoding
=None, data
=None, headers
={}, query
={}):
594 """ Returns a tuple (page content as string, URL handle) """
595 # Strip hashes from the URL (#1038)
596 if isinstance(url_or_request
, (compat_str
, str)):
597 url_or_request
= url_or_request
.partition('#')[0]
599 urlh
= self
._request
_webpage
(url_or_request
, video_id
, note
, errnote
, fatal
, data
=data
, headers
=headers
, query
=query
)
603 content
= self
._webpage
_read
_content
(urlh
, url_or_request
, video_id
, note
, errnote
, fatal
, encoding
=encoding
)
604 return (content
, urlh
)
607 def _guess_encoding_from_content(content_type
, webpage_bytes
):
608 m
= re
.match(r
'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type
)
610 encoding
= m
.group(1)
612 m
= re
.search(br
'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
613 webpage_bytes[:1024])
615 encoding = m.group(1).decode('ascii')
616 elif webpage_bytes.startswith(b'\xff\xfe'):
623 def __check_blocked(self, content):
624 first_block = content[:512]
625 if ('<title>Access to this site is blocked</title>' in content and
626 'Websense' in first_block):
627 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
628 blocked_iframe = self._html_search_regex(
629 r'<iframe src="([^
"]+)"', content,
630 'Websense information URL
', default=None)
632 msg += ' Visit
%s for more details
' % blocked_iframe
633 raise ExtractorError(msg, expected=True)
634 if '<title
>The URL you requested has been blocked
</title
>' in first_block:
636 'Access to this webpage has been blocked by Indian censorship
. '
637 'Use a VPN
or proxy
server (with --proxy
) to route around it
.')
638 block_msg = self._html_search_regex(
639 r'</h1
><p
>(.*?
)</p
>',
640 content, 'block message
', default=None)
642 msg += ' (Message
: "%s")' % block_msg.replace('\n', ' ')
643 raise ExtractorError(msg, expected=True)
644 if ('<title
>TTK
:: ŠŠ¾ŃŃŃŠæ Šŗ ŃŠµŃŃŃŃŃ Š¾Š³ŃŠ°Š½ŠøŃŠµŠ½
</title
>' in content and
645 'blocklist
.rkn
.gov
.ru
' in content):
646 raise ExtractorError(
647 'Access to this webpage has been blocked by decision of the Russian government
. '
648 'Visit http
://blocklist
.rkn
.gov
.ru
/ for a block reason
.',
651 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
652 content_type = urlh.headers.get('Content
-Type
', '')
653 webpage_bytes = urlh.read()
654 if prefix is not None:
655 webpage_bytes = prefix + webpage_bytes
657 encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
658 if self._downloader.params.get('dump_intermediate_pages
', False):
659 self.to_screen('Dumping request to
' + urlh.geturl())
660 dump = base64.b64encode(webpage_bytes).decode('ascii
')
661 self._downloader.to_screen(dump)
662 if self._downloader.params.get('write_pages
', False):
663 basen = '%s_%s' % (video_id, urlh.geturl())
665 h = '___
' + hashlib.md5(basen.encode('utf
-8')).hexdigest()
666 basen = basen[:240 - len(h)] + h
667 raw_filename = basen + '.dump
'
668 filename = sanitize_filename(raw_filename, restricted=True)
669 self.to_screen('Saving request to
' + filename)
670 # Working around MAX_PATH limitation on Windows (see
671 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
672 if compat_os_name == 'nt
':
673 absfilepath = os.path.abspath(filename)
674 if len(absfilepath) > 259:
675 filename = '\\\\?
\\' + absfilepath
676 with open(filename, 'wb
') as outf:
677 outf.write(webpage_bytes)
680 content = webpage_bytes.decode(encoding, 'replace
')
682 content = webpage_bytes.decode('utf
-8', 'replace
')
684 self.__check_blocked(content)
688 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
689 """ Returns the data of the page as a string """
692 while success is False:
694 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
696 except compat_http_client.IncompleteRead as e:
698 if try_count >= tries:
700 self._sleep(timeout, video_id)
707 def _download_xml_handle(
708 self, url_or_request, video_id, note='Downloading XML
',
709 errnote='Unable to download XML
', transform_source=None,
710 fatal=True, encoding=None, data=None, headers={}, query={}):
711 """Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle)"""
712 res = self._download_webpage_handle(
713 url_or_request, video_id, note, errnote, fatal=fatal,
714 encoding=encoding, data=data, headers=headers, query=query)
717 xml_string, urlh = res
718 return self._parse_xml(
719 xml_string, video_id, transform_source=transform_source,
722 def _download_xml(self, url_or_request, video_id,
723 note='Downloading XML
', errnote='Unable to download XML
',
724 transform_source=None, fatal=True, encoding=None,
725 data=None, headers={}, query={}):
726 """Return the xml as an xml.etree.ElementTree.Element"""
727 res = self._download_xml_handle(
728 url_or_request, video_id, note=note, errnote=errnote,
729 transform_source=transform_source, fatal=fatal, encoding=encoding,
730 data=data, headers=headers, query=query)
731 return res if res is False else res[0]
733 def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
735 xml_string = transform_source(xml_string)
737 return compat_etree_fromstring(xml_string.encode('utf
-8'))
738 except compat_xml_parse_error as ve:
739 errmsg = '%s: Failed to parse XML
' % video_id
741 raise ExtractorError(errmsg, cause=ve)
743 self.report_warning(errmsg + str(ve))
745 def _download_json_handle(
746 self, url_or_request, video_id, note='Downloading JSON metadata
',
747 errnote='Unable to download JSON metadata
', transform_source=None,
748 fatal=True, encoding=None, data=None, headers={}, query={}):
749 """Return a tuple (JSON object, URL handle)"""
750 res = self._download_webpage_handle(
751 url_or_request, video_id, note, errnote, fatal=fatal,
752 encoding=encoding, data=data, headers=headers, query=query)
755 json_string, urlh = res
756 return self._parse_json(
757 json_string, video_id, transform_source=transform_source,
761 self, url_or_request, video_id, note='Downloading JSON metadata
',
762 errnote='Unable to download JSON metadata
', transform_source=None,
763 fatal=True, encoding=None, data=None, headers={}, query={}):
764 res = self._download_json_handle(
765 url_or_request, video_id, note=note, errnote=errnote,
766 transform_source=transform_source, fatal=fatal, encoding=encoding,
767 data=data, headers=headers, query=query)
768 return res if res is False else res[0]
770 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
772 json_string = transform_source(json_string)
774 return json.loads(json_string)
775 except ValueError as ve:
776 errmsg = '%s: Failed to parse JSON
' % video_id
778 raise ExtractorError(errmsg, cause=ve)
780 self.report_warning(errmsg + str(ve))
782 def report_warning(self, msg, video_id=None):
783 idstr = '' if video_id is None else '%s: ' % video_id
784 self._downloader.report_warning(
785 '[%s] %s%s' % (self.IE_NAME, idstr, msg))
787 def to_screen(self, msg):
788 """Print msg to screen, prefixing it with '[ie_name
]'"""
789 self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
791 def report_extraction(self, id_or_name):
792 """Report information extraction."""
793 self.to_screen('%s: Extracting information
' % id_or_name)
795 def report_download_webpage(self, video_id):
796 """Report webpage download."""
797 self.to_screen('%s: Downloading webpage
' % video_id)
799 def report_age_confirmation(self):
800 """Report attempt to confirm age."""
801 self.to_screen('Confirming age
')
803 def report_login(self):
804 """Report attempt to log in."""
805 self.to_screen('Logging
in')
808 def raise_login_required(msg='This video
is only available
for registered users
'):
809 raise ExtractorError(
810 '%s. Use
--username
and --password
or --netrc to provide account credentials
.' % msg,
    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
        # Abort extraction with a GeoRestrictedError carrying the countries
        # the video is presumably available in; extract() catches it and may
        # retry with a faked X-Forwarded-For IP from one of those countries.
        raise GeoRestrictedError(msg, countries=countries)
817 # Methods for following #608
819 def url_result(url, ie=None, video_id=None, video_title=None):
820 """Returns a URL that points to a page that should be processed"""
821 # TODO: ie should be the class used for getting the info
822 video_info = {'_type
': 'url
',
825 if video_id is not None:
826 video_info['id'] = video_id
827 if video_title is not None:
828 video_info['title
'] = video_title
831 def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
833 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
835 return self.playlist_result(
836 urls, playlist_id=playlist_id, playlist_title=playlist_title)
839 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
840 """Returns a playlist"""
841 video_info = {'_type
': 'playlist
',
844 video_info['id'] = playlist_id
846 video_info['title
'] = playlist_title
847 if playlist_description:
848 video_info['description
'] = playlist_description
851 def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
853 Perform a regex search on the given string, using a single or a list of
854 patterns returning the first matching group.
855 In case of failure return a default value or raise a WARNING or a
856 RegexNotFoundError, depending on fatal, specifying the field name.
858 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
859 mobj = re.search(pattern, string, flags)
862 mobj = re.search(p, string, flags)
866 if not self._downloader.params.get('no_color
') and compat_os_name != 'nt
' and sys.stderr.isatty():
867 _name = '\033[0;34m
%s\033[0m
' % name
873 # return the first matching group
874 return next(g for g in mobj.groups() if g is not None)
876 return mobj.group(group)
877 elif default is not NO_DEFAULT:
880 raise RegexNotFoundError('Unable to extract
%s' % _name)
882 self._downloader.report_warning('unable to extract
%s' % _name + bug_reports_message())
885 def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
887 Like _search_regex, but strips HTML tags and unescapes entities.
889 res = self._search_regex(pattern, string, name, default, fatal, flags, group)
891 return clean_html(res).strip()
895 def _get_netrc_login_info(self, netrc_machine=None):
898 netrc_machine = netrc_machine or self._NETRC_MACHINE
900 if self._downloader.params.get('usenetrc
', False):
902 info = netrc.netrc().authenticators(netrc_machine)
907 raise netrc.NetrcParseError(
908 'No authenticators
for %s' % netrc_machine)
909 except (IOError, netrc.NetrcParseError) as err:
910 self._downloader.report_warning(
911 'parsing
.netrc
: %s' % error_to_compat_str(err))
913 return username, password
915 def _get_login_info(self, username_option='username
', password_option='password
', netrc_machine=None):
917 Get the login info as (username, password)
918 First look for the manually specified credentials using username_option
919 and password_option as keys in params dictionary. If no such credentials
920 available look in the netrc file using the netrc_machine or _NETRC_MACHINE
922 If there's no info available
, return (None, None)
924 if self._downloader is None:
927 downloader_params = self._downloader.params
929 # Attempt to use provided username and password or .netrc data
930 if downloader_params.get(username_option) is not None:
931 username = downloader_params[username_option]
932 password = downloader_params[password_option]
934 username, password = self._get_netrc_login_info(netrc_machine)
936 return username, password
938 def _get_tfa_info(self, note='two-factor verification code'):
940 Get the two
-factor authentication info
941 TODO
- asking the user will be required
for sms
/phone verify
942 currently just uses the command line option
943 If there
's no info available, return None
945 if self._downloader is None:
947 downloader_params = self._downloader.params
949 if downloader_params.get('twofactor
') is not None:
950 return downloader_params['twofactor
']
952 return compat_getpass('Type
%s and press
[Return
]: ' % note)
954 # Helper functions for extracting OpenGraph info
956 def _og_regexes(prop):
957 content_re = r'content
=(?
:"([^"]+?
)"|\'([^\']+?)\'|\s*([^\s"\'=<>`
]+?
))'
958 property_re = (r'(?
:name|
property)=(?
:\'og
:%(prop)s\'|
"og:%(prop)s"|\s
*og
:%(prop)s\b)'
959 % {'prop
': re.escape(prop)})
960 template = r'<meta
[^
>]+?
%s[^
>]+?
%s'
962 template % (property_re, content_re),
963 template % (content_re, property_re),
967 def _meta_regex(prop):
968 return r'''(?isx)<meta
969 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
970 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
972 def _og_search_property(self, prop, html, name=None, **kargs):
973 if not isinstance(prop, (list, tuple)):
976 name = 'OpenGraph
%s' % prop[0]
979 og_regexes.extend(self._og_regexes(p))
980 escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
983 return unescapeHTML(escaped)
def _og_search_thumbnail(self, html, **kargs):
    """Extract the og:image property as the thumbnail URL (non-fatal)."""
    return self._og_search_property(
        'image', html, 'thumbnail URL', fatal=False, **kargs)
def _og_search_description(self, html, **kargs):
    """Extract the og:description property (non-fatal)."""
    return self._og_search_property(
        'description', html, fatal=False, **kargs)
def _og_search_title(self, html, **kargs):
    """Extract the og:title property."""
    return self._og_search_property('title', html, **kargs)
994 def _og_search_video_url(self, html, name='video url
', secure=True, **kargs):
995 regexes = self._og_regexes('video
') + self._og_regexes('video
:url
')
997 regexes = self._og_regexes('video
:secure_url
') + regexes
998 return self._html_search_regex(regexes, html, name, **kargs)
def _og_search_url(self, html, **kargs):
    """Extract the og:url property."""
    return self._og_search_property('url', html, **kargs)
1003 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1004 if not isinstance(name, (list, tuple)):
1006 if display_name is None:
1007 display_name = name[0]
1008 return self._html_search_regex(
1009 [self._meta_regex(n) for n in name],
1010 html, display_name, fatal=fatal, group='content
', **kwargs)
def _dc_search_uploader(self, html):
    """Extract the uploader from the Dublin Core dc.creator meta tag."""
    return self._html_search_meta('dc.creator', html, 'uploader')
1015 def _rta_search(self, html):
1016 # See http://www.rtalabel.org/index.php?content=howtofaq#single
1017 if re.search(r'(?ix
)<meta\s
+name
="rating"\s
+'
1018 r' content
="RTA-5042-1996-1400-1577-RTA"',
1023 def _media_rating_search(self, html):
1024 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1025 rating = self._html_search_meta('rating
', html)
1037 return RATING_TABLE.get(rating.lower())
1039 def _family_friendly_search(self, html):
1040 # See http://schema.org/VideoObject
1041 family_friendly = self._html_search_meta(
1042 'isFamilyFriendly
', html, default=None)
1044 if not family_friendly:
1053 return RATING_TABLE.get(family_friendly.lower())
def _twitter_search_player(self, html):
    """Extract the Twitter card player URL from the twitter:player meta tag."""
    return self._html_search_meta(
        'twitter:player', html, 'twitter card player')
1059 def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1060 json_ld = self._search_regex(
1061 r'(?s
)<script
[^
>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
1062 html, 'JSON-LD', group='json_ld', **kwargs)
1063 default = kwargs.get('default', NO_DEFAULT)
1065 return default if default is not NO_DEFAULT else {}
1066 # JSON-LD may be malformed and thus `fatal` should be respected.
1067 # At the same time `default` may be passed that assumes `fatal=False`
1068 # for _search_regex. Let's simulate the same behavior here as well.
1069 fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
1070 return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1072 def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1073 if isinstance(json_ld, compat_str):
1074 json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1078 if not isinstance(json_ld, (list, tuple, dict)):
1080 if isinstance(json_ld, dict):
1083 INTERACTION_TYPE_MAP = {
1084 'CommentAction': 'comment',
1085 'AgreeAction': 'like',
1086 'DisagreeAction': 'dislike',
1087 'LikeAction': 'like',
1088 'DislikeAction': 'dislike',
1089 'ListenAction': 'view',
1090 'WatchAction': 'view',
1091 'ViewAction': 'view',
1094 def extract_interaction_statistic(e):
1095 interaction_statistic = e.get('interactionStatistic')
1096 if not isinstance(interaction_statistic, list):
1098 for is_e in interaction_statistic:
1099 if not isinstance(is_e, dict):
1101 if is_e.get('@type') != 'InteractionCounter':
1103 interaction_type = is_e.get('interactionType')
1104 if not isinstance(interaction_type, compat_str):
1106 interaction_count = int_or_none(is_e.get('userInteractionCount'))
1107 if interaction_count is None:
1109 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1112 count_key = '%s_count' % count_kind
1113 if info.get(count_key) is not None:
1115 info[count_key] = interaction_count
1117 def extract_video_object(e):
1118 assert e['@type'] == 'VideoObject'
1120 'url': e.get('contentUrl'),
1121 'title': unescapeHTML(e.get('name')),
1122 'description': unescapeHTML(e.get('description')),
1123 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
1124 'duration': parse_duration(e.get('duration')),
1125 'timestamp': unified_timestamp(e.get('uploadDate')),
1126 'filesize': float_or_none(e.get('contentSize')),
1127 'tbr': int_or_none(e.get('bitrate')),
1128 'width': int_or_none(e.get('width')),
1129 'height': int_or_none(e.get('height')),
1130 'view_count': int_or_none(e.get('interactionCount')),
1132 extract_interaction_statistic(e)
1135 if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')):
1136 item_type = e.get('@type')
1137 if expected_type is not None and expected_type != item_type:
1139 if item_type in ('TVEpisode', 'Episode'):
1141 'episode': unescapeHTML(e.get('name')),
1142 'episode_number': int_or_none(e.get('episodeNumber')),
1143 'description': unescapeHTML(e.get('description')),
1145 part_of_season = e.get('partOfSeason')
1146 if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1147 info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
1148 part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1149 if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1150 info['series'] = unescapeHTML(part_of_series.get('name'))
1151 elif item_type in ('Article', 'NewsArticle'):
1153 'timestamp': parse_iso8601(e.get('datePublished')),
1154 'title': unescapeHTML(e.get('headline')),
1155 'description': unescapeHTML(e.get('articleBody')),
1157 elif item_type == 'VideoObject':
1158 extract_video_object(e)
1160 video = e.get('video')
1161 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1162 extract_video_object(video)
1164 return dict((k, v) for k, v in info.items() if v is not None)
1167 def _hidden_inputs(html):
1168 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1170 for input in re.findall(r'(?i)(<input[^>]+>)', html):
1171 attrs = extract_attributes(input)
1174 if attrs.get('type') not in ('hidden', 'submit'):
1176 name = attrs.get('name') or attrs.get('id')
1177 value = attrs.get('value')
1178 if name and value is not None:
1179 hidden_inputs[name] = value
1180 return hidden_inputs
def _form_hidden_inputs(self, form_id, html):
    """Return the hidden <input> name/value pairs of the <form> whose id is form_id."""
    form_html = self._search_regex(
        r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
        html, '%s form' % form_id, group='form')
    return self._hidden_inputs(form_html)
1188 def _sort_formats(self, formats, field_preference=None):
1190 raise ExtractorError('No video formats found
')
1193 # Automatically determine tbr when missing based on abr and vbr (improves
1194 # formats sorting in some cases)
1195 if 'tbr
' not in f and f.get('abr
') is not None and f.get('vbr
') is not None:
1196 f['tbr
'] = f['abr
'] + f['vbr
']
1198 def _formats_key(f):
1199 # TODO remove the following workaround
1200 from ..utils import determine_ext
1201 if not f.get('ext
') and 'url
' in f:
1202 f['ext
'] = determine_ext(f['url
'])
1204 if isinstance(field_preference, (list, tuple)):
1207 if f.get(field) is not None
1208 else ('' if field == 'format_id
' else -1)
1209 for field in field_preference)
1211 preference = f.get('preference
')
1212 if preference is None:
1214 if f.get('ext
') in ['f4f
', 'f4m
']: # Not yet supported
1217 protocol = f.get('protocol
') or determine_protocol(f)
1218 proto_preference = 0 if protocol in ['http
', 'https
'] else (-0.5 if protocol == 'rtsp
' else -0.1)
1220 if f.get('vcodec
') == 'none
': # audio only
1222 if self._downloader.params.get('prefer_free_formats
'):
1223 ORDER = ['aac
', 'mp3
', 'm4a
', 'webm
', 'ogg
', 'opus
']
1225 ORDER = ['webm
', 'opus
', 'ogg
', 'mp3
', 'aac
', 'm4a
']
1228 audio_ext_preference = ORDER.index(f['ext
'])
1230 audio_ext_preference = -1
1232 if f.get('acodec
') == 'none
': # video only
1234 if self._downloader.params.get('prefer_free_formats
'):
1235 ORDER = ['flv
', 'mp4
', 'webm
']
1237 ORDER = ['webm
', 'flv
', 'mp4
']
1239 ext_preference = ORDER.index(f['ext
'])
1242 audio_ext_preference = 0
1246 f.get('language_preference
') if f.get('language_preference
') is not None else -1,
1247 f.get('quality
') if f.get('quality
') is not None else -1,
1248 f.get('tbr
') if f.get('tbr
') is not None else -1,
1249 f.get('filesize
') if f.get('filesize
') is not None else -1,
1250 f.get('vbr
') if f.get('vbr
') is not None else -1,
1251 f.get('height
') if f.get('height
') is not None else -1,
1252 f.get('width
') if f.get('width
') is not None else -1,
1255 f.get('abr
') if f.get('abr
') is not None else -1,
1256 audio_ext_preference,
1257 f.get('fps
') if f.get('fps
') is not None else -1,
1258 f.get('filesize_approx
') if f.get('filesize_approx
') is not None else -1,
1259 f.get('source_preference
') if f.get('source_preference
') is not None else -1,
1260 f.get('format_id
') if f.get('format_id
') is not None else '',
1262 formats.sort(key=_formats_key)
1264 def _check_formats(self, formats, video_id):
1266 formats[:] = filter(
1267 lambda f: self._is_valid_url(
1269 item='%s video format
' % f.get('format_id
') if f.get('format_id
') else 'video
'),
1273 def _remove_duplicate_formats(formats):
1277 if f['url
'] not in format_urls:
1278 format_urls.add(f['url
'])
1279 unique_formats.append(f)
1280 formats[:] = unique_formats
1282 def _is_valid_url(self, url, video_id, item='video
', headers={}):
1283 url = self._proto_relative_url(url, scheme='http
:')
1284 # For now assume non HTTP(S) URLs always valid
1285 if not (url.startswith('http
://') or url.startswith('https
://')):
1288 self._request_webpage(url, video_id, 'Checking
%s URL
' % item, headers=headers)
1290 except ExtractorError as e:
1291 if isinstance(e.cause, compat_urllib_error.URLError):
1293 '%s: %s URL
is invalid
, skipping
' % (video_id, item))
1297 def http_scheme(self):
1298 """ Either "http:" or "https:", depending on the user's preferences
"""
1301 if self._downloader.params.get('prefer_insecure', False)
1304 def _proto_relative_url(self, url, scheme=None):
1307 if url.startswith('//'):
1309 scheme = self.http_scheme()
1314 def _sleep(self, timeout, video_id, msg_template=None):
1315 if msg_template is None:
1316 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1317 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1321 def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1322 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1323 fatal=True, m3u8_id=None):
1324 manifest = self._download_xml(
1325 manifest_url, video_id, 'Downloading f4m manifest',
1326 'Unable to download f4m manifest',
1327 # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1328 # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1329 transform_source=transform_source,
1332 if manifest is False:
1335 return self._parse_f4m_formats(
1336 manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1337 transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1339 def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1340 transform_source=lambda s: fix_xml_ampersands(s).strip(),
1341 fatal=True, m3u8_id=None):
1342 # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1343 akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1344 if akamai_pv is not None and ';' in akamai_pv.text:
1345 playerVerificationChallenge = akamai_pv.text.split(';')[0]
1346 if playerVerificationChallenge.strip() != '':
1350 manifest_version = '1.0'
1351 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1353 manifest_version = '2.0'
1354 media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1355 # Remove unsupported DRM protected media from final formats
1356 # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1357 media_nodes = remove_encrypted_media(media_nodes)
1361 manifest_base_url = get_base_url(manifest)
1363 bootstrap_info = xpath_element(
1364 manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1365 'bootstrap info', default=None)
1368 mime_type = xpath_text(
1369 manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1370 'base URL', default=None)
1371 if mime_type and mime_type.startswith('audio/'):
1374 for i, media_el in enumerate(media_nodes):
1375 tbr = int_or_none(media_el.attrib.get('bitrate'))
1376 width = int_or_none(media_el.attrib.get('width'))
1377 height = int_or_none(media_el.attrib.get('height'))
1378 format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1379 # If <bootstrapInfo> is present, the specified f4m is a
1380 # stream-level manifest, and only set-level manifests may refer to
1381 # external resources. See section 11.4 and section 4 of F4M spec
1382 if bootstrap_info is None:
1384 # @href is introduced in 2.0, see section 11.6 of F4M spec
1385 if manifest_version == '2.0':
1386 media_url = media_el.attrib.get('href')
1387 if media_url is None:
1388 media_url = media_el.attrib.get('url')
1392 media_url if media_url.startswith('http://') or media_url.startswith('https://')
1393 else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1394 # If media_url is itself a f4m manifest do the recursive extraction
1395 # since bitrates in parent manifest (this one) and media_url manifest
1396 # may differ leading to inability to resolve the format by requested
1397 # bitrate in f4m downloader
1398 ext = determine_ext(manifest_url)
1400 f4m_formats = self._extract_f4m_formats(
1401 manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1402 transform_source=transform_source, fatal=fatal)
1403 # Sometimes stream-level manifest contains single media entry that
1404 # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1405 # At the same time parent's media entry in set-level manifest may
1406 # contain it. We will copy it from parent in such cases.
1407 if len(f4m_formats) == 1:
1410 'tbr': f.get('tbr') or tbr,
1411 'width': f.get('width') or width,
1412 'height': f.get('height') or height,
1413 'format_id': f.get('format_id') if not tbr else format_id,
1416 formats.extend(f4m_formats)
1419 formats.extend(self._extract_m3u8_formats(
1420 manifest_url, video_id, 'mp4', preference=preference,
1421 m3u8_id=m3u8_id, fatal=fatal))
1424 'format_id': format_id,
1425 'url': manifest_url,
1426 'manifest_url': manifest_url,
1427 'ext': 'flv' if bootstrap_info is not None else None,
1433 'preference': preference,
1437 def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1439 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1443 'preference': preference - 100 if preference else -100,
1444 'resolution': 'multiple',
1445 'format_note': 'Quality selection URL',
1448 def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1449 entry_protocol='m3u8', preference=None,
1450 m3u8_id=None, note=None, errnote=None,
1451 fatal=True, live=False):
1452 res = self._download_webpage_handle(
1454 note=note or 'Downloading m3u8 information',
1455 errnote=errnote or 'Failed to download m3u8 information',
1461 m3u8_doc, urlh = res
1462 m3u8_url = urlh.geturl()
1464 return self._parse_m3u8_formats(
1465 m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1466 preference=preference, m3u8_id=m3u8_id, live=live)
1468 def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
1469 entry_protocol='m3u8', preference=None,
1470 m3u8_id=None, live=False):
1471 if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
1474 if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc): # Apple FairPlay
1479 format_url = lambda u: (
1481 if re.match(r'^https?://', u)
1482 else compat_urlparse.urljoin(m3u8_url, u))
1485 # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
1486 # 2. https://github.com/rg3/youtube-dl/issues/12211
1488 # We should try extracting formats only from master playlists [1, 4.3.4],
1489 # i.e. playlists that describe available qualities. On the other hand
1490 # media playlists [1, 4.3.3] should be returned as is since they contain
1491 # just the media without qualities renditions.
1492 # Fortunately, master playlist can be easily distinguished from media
1493 # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
1494 # master playlist tags MUST NOT appear in a media playist and vice versa.
1495 # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
1496 # media playlist and MUST NOT appear in master playlist thus we can
1497 # clearly detect media playlist with this criterion.
1499 if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
1502 'format_id': m3u8_id,
1504 'protocol': entry_protocol,
1505 'preference': preference,
1509 last_stream_inf = {}
1511 def extract_media(x_media_line):
1512 media = parse_m3u8_attributes(x_media_line)
1513 # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
1514 media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
1515 if not (media_type and group_id and name):
1517 groups.setdefault(group_id, []).append(media)
1518 if media_type not in ('VIDEO', 'AUDIO'):
1520 media_url = media.get('URI')
1523 for v in (m3u8_id, group_id, name):
1527 'format_id': '-'.join(format_id),
1528 'url': format_url(media_url),
1529 'manifest_url': m3u8_url,
1530 'language': media.get('LANGUAGE'),
1532 'protocol': entry_protocol,
1533 'preference': preference,
1535 if media_type == 'AUDIO':
1536 f['vcodec'] = 'none'
1539 def build_stream_name():
1540 # Despite specification does not mention NAME attribute for
1541 # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
1542 # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
1543 # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
1544 stream_name = last_stream_inf.get('NAME')
1547 # If there is no NAME in EXT-X-STREAM-INF it will be obtained
1548 # from corresponding rendition group
1549 stream_group_id = last_stream_inf.get('VIDEO')
1550 if not stream_group_id:
1552 stream_group = groups.get(stream_group_id)
1553 if not stream_group:
1554 return stream_group_id
1555 rendition = stream_group[0]
1556 return rendition.get('NAME') or stream_group_id
1558 for line in m3u8_doc.splitlines():
1559 if line.startswith('#EXT-X-STREAM-INF:'):
1560 last_stream_inf = parse_m3u8_attributes(line)
1561 elif line.startswith('#EXT-X-MEDIA:'):
1563 elif line.startswith('#') or not line.strip():
1566 tbr = float_or_none(
1567 last_stream_inf.get('AVERAGE-BANDWIDTH') or
1568 last_stream_inf.get('BANDWIDTH'), scale=1000)
1571 format_id.append(m3u8_id)
1572 stream_name = build_stream_name()
1573 # Bandwidth of live streams may differ over time thus making
1574 # format_id unpredictable. So it's better to keep provided
1577 format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
1578 manifest_url = format_url(line.strip())
1580 'format_id': '-'.join(format_id),
1581 'url': manifest_url,
1582 'manifest_url': m3u8_url,
1585 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
1586 'protocol': entry_protocol,
1587 'preference': preference,
1589 resolution = last_stream_inf.get('RESOLUTION')
1591 mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
1593 f['width'] = int(mobj.group('width'))
1594 f['height'] = int(mobj.group('height'))
1595 # Unified Streaming Platform
1597 r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
1599 abr, vbr = mobj.groups()
1600 abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
1605 codecs = parse_codecs(last_stream_inf.get('CODECS'))
1607 audio_group_id = last_stream_inf.get('AUDIO')
1608 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
1609 # references a rendition group MUST have a CODECS attribute.
1610 # However, this is not always respected, for example, [2]
1611 # contains EXT-X-STREAM-INF tag which references AUDIO
1612 # rendition group but does not have CODECS and despite
1613 # referencing audio group an audio group, it represents
1614 # a complete (with audio and video) format. So, for such cases
1615 # we will ignore references to rendition groups and treat them
1616 # as complete formats.
1617 if audio_group_id and codecs and f.get('vcodec') != 'none':
1618 audio_group = groups.get(audio_group_id)
1619 if audio_group and audio_group[0].get('URI'):
1620 # TODO: update acodec for audio only formats with
1622 f['acodec'] = 'none'
1624 last_stream_inf = {}
1628 def _xpath_ns(path, namespace=None):
1632 for c in path.split('/'):
1633 if not c or c == '.':
1636 out.append('{%s}%s' % (namespace, c))
1637 return '/'.join(out)
1639 def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1640 smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1646 namespace = self._parse_smil_namespace(smil)
1648 return self._parse_smil_formats(
1649 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1651 def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1652 smil = self._download_smil(smil_url, video_id, fatal=fatal)
1655 return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
    """Download a SMIL manifest and return it parsed as XML."""
    return self._download_xml(
        smil_url, video_id, 'Downloading SMIL file',
        'Unable to download SMIL file', fatal=fatal,
        transform_source=transform_source)
1662 def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1663 namespace = self._parse_smil_namespace(smil)
1665 formats = self._parse_smil_formats(
1666 smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1667 subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1669 video_id = os.path.splitext(url_basename(smil_url))[0]
1673 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1674 name = meta.attrib.get('name')
1675 content = meta.attrib.get('content')
1676 if not name or not content:
1678 if not title and name == 'title':
1680 elif not description and name in ('description', 'abstract'):
1681 description = content
1682 elif not upload_date and name == 'date':
1683 upload_date = unified_strdate(content)
1686 'id': image.get('type'),
1687 'url': image.get('src'),
1688 'width': int_or_none(image.get('width')),
1689 'height': int_or_none(image.get('height')),
1690 } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1694 'title': title or video_id,
1695 'description': description,
1696 'upload_date': upload_date,
1697 'thumbnails': thumbnails,
1699 'subtitles': subtitles,
def _parse_smil_namespace(self, smil):
    """Extract the XML namespace URI from a parsed SMIL document's root tag."""
    return self._search_regex(
        r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1706 def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1708 for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1709 b = meta.get('base') or meta.get('httpBase')
1720 media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1721 for medium in media:
1722 src = medium.get('src')
1723 if not src or src in srcs:
1727 bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1728 filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1729 width = int_or_none(medium.get('width'))
1730 height = int_or_none(medium.get('height'))
1731 proto = medium.get('proto')
1732 ext = medium.get('ext')
1733 src_ext = determine_ext(src)
1734 streamer = medium.get('streamer') or base
1736 if proto == 'rtmp' or streamer.startswith('rtmp'):
1742 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1744 'filesize': filesize,
1748 if transform_rtmp_url:
1749 streamer, src = transform_rtmp_url(streamer, src)
1750 formats[-1].update({
1756 src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1757 src_url = src_url.strip()
1759 if proto == 'm3u8' or src_ext == 'm3u8':
1760 m3u8_formats = self._extract_m3u8_formats(
1761 src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1762 if len(m3u8_formats) == 1:
1764 m3u8_formats[0].update({
1765 'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1770 formats.extend(m3u8_formats)
1773 if src_ext == 'f4m':
1778 'plugin': 'flowplayer-3.2.0.1',
1780 f4m_url += '&' if '?' in f4m_url else '?'
1781 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1782 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1785 if src_url.startswith('http') and self._is_valid_url(src, video_id):
1789 'ext': ext or src_ext or 'flv',
1790 'format_id': 'http-%d' % (bitrate or http_count),
1792 'filesize': filesize,
1800 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1803 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1804 src = textstream.get('src')
1805 if not src or src in urls:
1808 ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1809 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1810 subtitles.setdefault(lang, []).append({
1816 def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
1817 xspf = self._download_xml(
1818 xspf_url, playlist_id, 'Downloading xpsf playlist',
1819 'Unable to download xspf manifest', fatal=fatal)
1822 return self._parse_xspf(
1823 xspf, playlist_id, xspf_url=xspf_url,
1824 xspf_base_url=base_url(xspf_url))
1826 def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
1828 'xspf': 'http://xspf.org/ns/0/',
1829 's1': 'http://static.streamone.nl/player/ns/0',
1833 for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1835 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1836 description = xpath_text(
1837 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1838 thumbnail = xpath_text(
1839 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1840 duration = float_or_none(
1841 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1844 for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
1845 format_url = urljoin(xspf_base_url, location.text)
1850 'manifest_url': xspf_url,
1851 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1852 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1853 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1855 self._sort_formats(formats)
1860 'description': description,
1861 'thumbnail': thumbnail,
1862 'duration': duration,
1867 def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1868 res = self._download_xml_handle(
1870 note=note or 'Downloading MPD manifest',
1871 errnote=errnote or 'Failed to download MPD manifest',
1876 mpd_base_url = base_url(urlh.geturl())
1878 return self._parse_mpd_formats(
1879 mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url,
1880 formats_dict=formats_dict, mpd_url=mpd_url)
    def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
        """
        Parse formats from MPD manifest.
        References:
         1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
            http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
         2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP

        NOTE(review): formats_dict is a mutable default argument; only read
        via .get() here, but a None default would be safer.
        """
        # Live ('dynamic') manifests are not handled by this parser.
        if mpd_doc.get('type') == 'dynamic':

        # Extract the XML namespace from the root tag so element lookups
        # can be namespace-qualified via _add_ns/_xpath_ns.
        namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
            return self._xpath_ns(path, namespace)

        def is_drm_protected(element):
            # Presence of any ContentProtection child marks the element as DRM'd.
            return element.find(_add_ns('ContentProtection')) is not None

        def extract_multisegment_info(element, ms_parent_info):
            # Segment info is inherited: start from the parent's and overlay
            # whatever this element declares.
            ms_info = ms_parent_info.copy()

            # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
            # common attributes and elements. We will only extract relevant
            def extract_common(source):
                segment_timeline = source.find(_add_ns('SegmentTimeline'))
                if segment_timeline is not None:
                    s_e = segment_timeline.findall(_add_ns('S'))
                        ms_info['total_number'] = 0
                            # @r is the repeat count; each S entry stands for 1 + r segments.
                            r = int(s.get('r', 0))
                            ms_info['total_number'] += 1 + r
                            ms_info['s'].append({
                                't': int(s.get('t', 0)),
                                # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                                'd': int(s.attrib['d']),
                start_number = source.get('startNumber')
                    ms_info['start_number'] = int(start_number)
                timescale = source.get('timescale')
                    ms_info['timescale'] = int(timescale)
                segment_duration = source.get('duration')
                if segment_duration:
                    ms_info['segment_duration'] = float(segment_duration)

            def extract_Initialization(source):
                initialization = source.find(_add_ns('Initialization'))
                if initialization is not None:
                    ms_info['initialization_url'] = initialization.attrib['sourceURL']

            segment_list = element.find(_add_ns('SegmentList'))
            if segment_list is not None:
                extract_common(segment_list)
                extract_Initialization(segment_list)
                segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
                    ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
                segment_template = element.find(_add_ns('SegmentTemplate'))
                if segment_template is not None:
                    extract_common(segment_template)
                    media = segment_template.get('media')
                        ms_info['media'] = media
                    initialization = segment_template.get('initialization')
                        ms_info['initialization'] = initialization
                        extract_Initialization(segment_template)

        mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
        for period in mpd_doc.findall(_add_ns('Period')):
            period_duration = parse_duration(period.get('duration')) or mpd_duration
            period_ms_info = extract_multisegment_info(period, {
            for adaptation_set in period.findall(_add_ns('AdaptationSet')):
                # DRM'd AdaptationSets/Representations are skipped entirely.
                if is_drm_protected(adaptation_set):
                adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
                for representation in adaptation_set.findall(_add_ns('Representation')):
                    if is_drm_protected(representation):
                    # Representation attributes override AdaptationSet ones.
                    representation_attrib = adaptation_set.attrib.copy()
                    representation_attrib.update(representation.attrib)
                    # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
                    mime_type = representation_attrib['mimeType']
                    content_type = mime_type.split('/')[0]
                    if content_type == 'text':
                        # TODO implement WebVTT downloading
                    elif content_type in ('video', 'audio'):
                        # Walk outwards to accumulate nested <BaseURL> values,
                        # stopping once an absolute URL has been built.
                        for element in (representation, adaptation_set, period, mpd_doc):
                            base_url_e = element.find(_add_ns('BaseURL'))
                            if base_url_e is not None:
                                base_url = base_url_e.text + base_url
                                if re.match(r'^https?://', base_url):
                        if mpd_base_url and not re.match(r'^https?://', base_url):
                            if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
                            base_url = mpd_base_url + base_url
                        representation_id = representation_attrib.get('id')
                        lang = representation_attrib.get('lang')
                        url_el = representation.find(_add_ns('BaseURL'))
                        # YouTube embeds the content length in a proprietary attribute.
                        filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                        bandwidth = int_or_none(representation_attrib.get('bandwidth'))
                            'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
                            'manifest_url': mpd_url,
                            'ext': mimetype2ext(mime_type),
                            'width': int_or_none(representation_attrib.get('width')),
                            'height': int_or_none(representation_attrib.get('height')),
                            'tbr': float_or_none(bandwidth, 1000),
                            'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                            'fps': int_or_none(representation_attrib.get('frameRate')),
                            # 'mul'/'und'/'zxx'/'mis' are ISO 639 "no useful language" codes.
                            'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                            'format_note': 'DASH %s' % content_type,
                            'filesize': filesize,
                            'container': mimetype2ext(mime_type) + '_dash',
                        f.update(parse_codecs(representation_attrib.get('codecs')))
                        representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)

                        def prepare_template(template_name, identifiers):
                            # Convert DASH $...$ template identifiers into
                            # printf-style %(...)d/%(...)<fmt> placeholders.
                            t = representation_ms_info[template_name]
                            t = t.replace('$RepresentationID$', representation_id)
                            t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
                            t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
                            # NOTE(review): str.replace returns a new string; this
                            # result is discarded, so '$$' escapes are never
                            # unescaped — presumably should be t = t.replace(...).
                            t.replace('$$', '$')

                        # @initialization is a regular template like @media one
                        # so it should be handled just the same way (see
                        # https://github.com/rg3/youtube-dl/issues/11605)
                        if 'initialization' in representation_ms_info:
                            initialization_template = prepare_template(
                                # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
                                # $Time$ shall not be included for @initialization thus
                                # only $Bandwidth$ remains
                            representation_ms_info['initialization_url'] = initialization_template % {
                                'Bandwidth': bandwidth,

                        def location_key(location):
                            # Fragment entries use 'url' for absolute locations
                            # and 'path' for relative ones.
                            return 'url' if re.match(r'^https?://', location) else 'path'

                        if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
                            media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
                            media_location_key = location_key(media_template)

                            # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                            # can't be used at the same time
                            if '%(Number' in media_template and 's' not in representation_ms_info:
                                segment_duration = None
                                if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
                                    segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
                                    representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                                representation_ms_info['fragments'] = [{
                                    media_location_key: media_template % {
                                        'Number': segment_number,
                                        'Bandwidth': bandwidth,
                                    'duration': segment_duration,
                                } for segment_number in range(
                                    representation_ms_info['start_number'],
                                    representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                                # $Number*$ or $Time$ in media template with S list available
                                # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
                                # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
                                representation_ms_info['fragments'] = []
                                segment_number = representation_ms_info['start_number']

                                def add_segment_url():
                                    segment_url = media_template % {
                                        'Time': segment_time,
                                        'Bandwidth': bandwidth,
                                        'Number': segment_number,
                                    representation_ms_info['fragments'].append({
                                        media_location_key: segment_url,
                                        'duration': float_or_none(segment_d, representation_ms_info['timescale']),

                                for num, s in enumerate(representation_ms_info['s']):
                                    segment_time = s.get('t') or segment_time
                                    # @r repeats the same duration r extra times.
                                    for r in range(s.get('r', 0)):
                                        segment_time += segment_d
                                    segment_time += segment_d
                        elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
                            # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
                            # or any YouTube dashsegments video
                            timescale = representation_ms_info['timescale']
                            for s in representation_ms_info['s']:
                                duration = float_or_none(s['d'], timescale)
                                for r in range(s.get('r', 0) + 1):
                                    segment_uri = representation_ms_info['segment_urls'][segment_index]
                                        location_key(segment_uri): segment_uri,
                                        'duration': duration,
                            representation_ms_info['fragments'] = fragments
                        elif 'segment_urls' in representation_ms_info:
                            # Segment URLs with no SegmentTimeline
                            # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
                            # https://github.com/rg3/youtube-dl/pull/14844
                            segment_duration = float_or_none(
                                representation_ms_info['segment_duration'],
                                representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
                            for segment_url in representation_ms_info['segment_urls']:
                                    location_key(segment_url): segment_url,
                                if segment_duration:
                                    fragment['duration'] = segment_duration
                                fragments.append(fragment)
                            representation_ms_info['fragments'] = fragments
                        # NB: MPD manifest may contain direct URLs to unfragmented media.
                        # No fragments key is present in this case.
                        if 'fragments' in representation_ms_info:
                                'fragment_base_url': base_url,
                                'protocol': 'http_dash_segments',
                            if 'initialization_url' in representation_ms_info:
                                initialization_url = representation_ms_info['initialization_url']
                                if not f.get('url'):
                                    f['url'] = initialization_url
                                f['fragments'].append({location_key(initialization_url): initialization_url})
                            f['fragments'].extend(representation_ms_info['fragments'])
                        # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
                        # is not necessarily unique within a Period thus formats with
                        # the same `format_id` are quite possible. There are numerous examples
                        # of such manifests (see https://github.com/rg3/youtube-dl/issues/15111,
                        # https://github.com/rg3/youtube-dl/issues/13919)
                        full_info = formats_dict.get(representation_id, {}).copy()
                        formats.append(full_info)
                        self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
    def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
        """Download a Smooth Streaming (ISM) manifest and parse it into
        a formats list via _parse_ism_formats."""
        res = self._download_xml_handle(
            note=note or 'Downloading ISM manifest',
            errnote=errnote or 'Failed to download ISM manifest',

        # Parse against the URL reported by the response handle.
        return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
    def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
        """
        Parse formats from ISM manifest.
        References:
         1. [MS-SSTR]: Smooth Streaming Protocol,
            https://msdn.microsoft.com/en-us/library/ff469518.aspx
        """
        # Live and DRM-protected streams are not supported.
        if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:

        duration = int(ism_doc.attrib['Duration'])
        # 10000000 ticks/second is the Smooth Streaming default timescale.
        timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000

        for stream in ism_doc.findall('StreamIndex'):
            stream_type = stream.get('Type')
            if stream_type not in ('video', 'audio'):
            url_pattern = stream.attrib['Url']
            stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
            stream_name = stream.get('Name')
            for track in stream.findall('QualityLevel'):
                # AudioTag 255 denotes AAC when FourCC is absent.
                fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
                # TODO: add support for WVC1 and WMAP
                if fourcc not in ('H264', 'AVC1', 'AACL'):
                    self.report_warning('%s is not a supported codec' % fourcc)
                tbr = int(track.attrib['Bitrate']) // 1000
                # [1] does not mention Width and Height attributes. However,
                # they're often present while MaxWidth and MaxHeight are
                # missing, so should be used as fallbacks
                width = int_or_none(track.get('MaxWidth') or track.get('Width'))
                height = int_or_none(track.get('MaxHeight') or track.get('Height'))
                sampling_rate = int_or_none(track.get('SamplingRate'))

                track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
                track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)

                stream_fragments = stream.findall('c')
                for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
                    fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
                    fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
                    fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
                    if not fragment_ctx['duration']:
                            # NOTE(review): this indexes the 'c' Element itself
                            # (its children), not the stream_fragments list —
                            # presumably stream_fragments[...] was intended; as
                            # written it raises IndexError and falls back below.
                            next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
                            next_fragment_time = duration
                        fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
                    for _ in range(fragment_repeat):
                            'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
                            'duration': fragment_ctx['duration'] / stream_timescale,
                        fragment_ctx['time'] += fragment_ctx['duration']

                    format_id.append(ism_id)
                    format_id.append(stream_name)
                format_id.append(compat_str(tbr))

                    'format_id': '-'.join(format_id),
                    'manifest_url': ism_url,
                    'ext': 'ismv' if stream_type == 'video' else 'isma',
                    'asr': sampling_rate,
                    'vcodec': 'none' if stream_type == 'audio' else fourcc,
                    'acodec': 'none' if stream_type == 'video' else fourcc,
                    'fragments': fragments,
                    # Extra parameters consumed by the ISM downloader.
                    '_download_params': {
                        'duration': duration,
                        'timescale': stream_timescale,
                        'width': width or 0,
                        'height': height or 0,
                        'codec_private_data': track.get('CodecPrivateData'),
                        'sampling_rate': sampling_rate,
                        'channels': int_or_none(track.get('Channels', 2)),
                        'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
                        'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
        """Extract <video>/<audio> (and AMP equivalents) from an HTML page
        into a list of media-info dicts with formats and subtitles."""
        def absolute_url(item_url):
            # Resolve an in-page URL against the page URL.
            return urljoin(base_url, item_url)

        def parse_content_type(content_type):
            # Derive {'ext', codec fields} from a MIME type attribute value.
            if not content_type:
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)

        def _media_formats(src, cur_media_type, type_info={}):
            # NOTE(review): mutable default type_info={} — it appears to be
            # only read here, but a None default would be safer.
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, fatal=False)
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
            return is_plain_url, formats

        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we wll include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        media_tags = [(media_tag, media_type, '')
                      for media_tag, media_type
                      in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/rg3/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
        for media_tag, media_type, media_content in media_tags:
            media_attributes = extract_attributes(media_tag)
            src = media_attributes.get('src')
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
                # <source> children provide alternative format URLs.
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    source_attributes = extract_attributes(source_tag)
                    src = source_attributes.get('src')
                    f = parse_content_type(source_attributes.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                        # res attribute is not standard but seen several times
                            'height': int_or_none(source_attributes.get('res')),
                            'format_id': source_attributes.get('label'),
                        f.update(formats[0])
                        media_info['formats'].append(f)
                        media_info['formats'].extend(formats)
                # <track> children provide subtitle/caption files.
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = track_attributes.get('src')
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
    def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
        """Extract HDS and HLS formats from an Akamai manifest URL by
        rewriting between the /z/ (HDS) and /i/ (HLS) URL layouts.

        hosts may override the host used for 'hds' and/or 'hls'.
        NOTE(review): hosts={} is a mutable default; only read here.
        """
        hdcore_sign = 'hdcore=3.7.0'
        # Derive the f4m (HDS) URL from the m3u8 layout.
        f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
        hds_host = hosts.get('hds')
            f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
        if 'hdcore=' not in f4m_url:
            f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
        f4m_formats = self._extract_f4m_formats(
            f4m_url, video_id, f4m_id='hds', fatal=False)
        for entry in f4m_formats:
            # The hdcore signature must also be appended to segment URLs.
            entry.update({'extra_param_to_segment_url': hdcore_sign})
        formats.extend(f4m_formats)
        # Derive the m3u8 (HLS) URL from the HDS layout.
        m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
        hls_host = hosts.get('hls')
            m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
        formats.extend(self._extract_m3u8_formats(
            m3u8_url, video_id, 'mp4', 'm3u8_native',
            m3u8_id='hls', fatal=False))
    def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
        """Probe a Wowza server URL for HLS/HDS/DASH/SMIL/RTMP/RTSP variants.

        skip_protocols lists protocol names to omit.
        NOTE(review): skip_protocols=[] is a mutable default; only read here.
        """
        query = compat_urlparse.urlparse(url).query
        # Strip a trailing manifest filename so protocol-specific ones can be appended.
        url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
            r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
        url_base = mobj.group('url')
        # Rebuild an http(s) base, preserving the 's' of the original scheme.
        http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)

        def manifest_url(manifest):
            # Append a manifest filename, carrying over the original query string.
            m_url = '%s/%s' % (http_base_url, manifest)
                m_url += '?%s' % query

        if 'm3u8' not in skip_protocols:
            formats.extend(self._extract_m3u8_formats(
                manifest_url('playlist.m3u8'), video_id, 'mp4',
                m3u8_entry_protocol, m3u8_id='hls', fatal=False))
        if 'f4m' not in skip_protocols:
            formats.extend(self._extract_f4m_formats(
                manifest_url('manifest.f4m'),
                video_id, f4m_id='hds', fatal=False))
        if 'dash' not in skip_protocols:
            formats.extend(self._extract_mpd_formats(
                manifest_url('manifest.mpd'),
                video_id, mpd_id='dash', fatal=False))
        if re.search(r'(?:/smil:|\.smil)', url_base):
            if 'smil' not in skip_protocols:
                rtmp_formats = self._extract_smil_formats(
                    manifest_url('jwplayer.smil'),
                    video_id, fatal=False)
                for rtmp_format in rtmp_formats:
                    # Derive an RTSP twin for every RTMP format.
                    rtsp_format = rtmp_format.copy()
                    rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
                    del rtsp_format['play_path']
                    del rtsp_format['ext']
                    rtsp_format.update({
                        'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
                        'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
                    formats.extend([rtmp_format, rtsp_format])
            for protocol in ('rtmp', 'rtsp'):
                if protocol not in skip_protocols:
                        'url': '%s:%s' % (protocol, url_base),
                        'format_id': protocol,
                        'protocol': protocol,
    def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
        """Locate a jwplayer(...).setup({...}) call in *webpage* and return
        its options parsed as a dict, or None when absent/unparseable."""
            r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
                # The options blob is JavaScript, hence the transform_source
                # (js_to_json by default) before JSON parsing.
                jwplayer_data = self._parse_json(mobj.group('options'),
                                                 transform_source=transform_source)
            except ExtractorError:
                # Only a dict-shaped setup() argument is usable.
                if isinstance(jwplayer_data, dict):
                    return jwplayer_data
2444 def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2445 jwplayer_data = self._find_jwplayer_data(
2446 webpage, video_id, transform_source=js_to_json)
2447 return self._parse_jwplayer_data(
2448 jwplayer_data, video_id, *args, **kwargs)
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Turn a JWPlayer configuration dict into an info dict (single
        entry) or a playlist result (multiple entries)."""
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}

        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]

        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]

            this_video_id = video_id or video_data['mediaid']

            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)

            # Collect caption/subtitle tracks, keyed by label ('en' fallback).
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                    track_kind = track.get('kind')
                    if not track_kind or not isinstance(track_kind, compat_str):
                    if track_kind.lower() not in ('captions', 'subtitles'):
                    track_url = urljoin(base_url, track.get('file'))
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)

                'id': this_video_id,
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': video_data.get('description'),
                'thumbnail': self._proto_relative_url(video_data.get('image')),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
            # A single YouTube URL is delegated to the YouTube extractor.
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                    '_type': 'url_transparent',
                    'url': formats[0]['url'],
                self._sort_formats(formats)
                entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
            return self.playlist_result(entries)
    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        """Convert a JWPlayer 'sources' list into a formats list,
        deduplicating URLs and expanding HLS/DASH/SMIL manifests."""
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
            source_url = self._proto_relative_url(source.get('file'))
                source_url = compat_urlparse.urljoin(base_url, source_url)
            # Skip URLs already seen in this sources list.
            if source_url in urls:
            urls.append(source_url)
            source_type = source.get('type') or ''
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif source_type == 'dash' or ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                height = int_or_none(source.get('height'))
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
                        'height', default=None))
                    'width': int_or_none(source.get('width')),
                    'tbr': int_or_none(source.get('bitrate')),
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        rtmp_url, prefix, play_path = rtmp_url_parts
                            'play_path': prefix + play_path,
                        a_format.update(rtmp_params)
                formats.append(a_format)
2585 def _live_title(self, name):
2586 """ Generate the title
for a live video
"""
2587 now = datetime.datetime.now()
2588 now_str = now.strftime('%Y-%m-%d %H:%M')
2589 return name + ' ' + now_str
    def _int(self, v, name, fatal=False, **kwargs):
        """Convert *v* to int via int_or_none; on failure either raise
        (fatal=True) or warn and continue."""
        res = int_or_none(v, **kwargs)
        if 'get_attr' in kwargs:
            # NOTE(review): leftover debug output? Prints an attribute of the
            # raw value to stdout — looks unintended for production; confirm.
            print(getattr(v, kwargs['get_attr']))
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
                raise ExtractorError(msg)
                self._downloader.report_warning(msg)
    def _float(self, v, name, fatal=False, **kwargs):
        """Convert *v* to float via float_or_none; on failure either raise
        (fatal=True) or warn and continue."""
        res = float_or_none(v, **kwargs)
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
                raise ExtractorError(msg)
                self._downloader.report_warning(msg)
2613 def _set_cookie(self, domain, name, value, expire_time=None, port=None,
2614 path='/', secure=False, discard=False, rest={}, **kwargs):
2615 cookie = compat_cookiejar.Cookie(
2616 0, name, value, port, port is not None, domain, True,
2617 domain.startswith('.'), path, True, secure, expire_time,
2618 discard, None, None, rest)
2619 self._downloader.cookiejar.set_cookie(cookie)
2621 def _get_cookies(self, url):
2622 """ Return a compat_cookies
.SimpleCookie
with the cookies
for the url
"""
2623 req = sanitized_Request(url)
2624 self._downloader.cookiejar.add_cookie_header(req)
2625 return compat_cookies.SimpleCookie(req.get_header('Cookie'))
    def get_testcases(self, include_onlymatching=False):
        """Yield this extractor's test cases (_TEST or _TESTS), tagging each
        with the extractor name (class name minus the 'IE' suffix)."""
        t = getattr(self, '_TEST', None)
            # _TEST and _TESTS are mutually exclusive ways to declare tests.
            assert not hasattr(self, '_TESTS'), \
                '%s has _TEST and _TESTS' % type(self).__name__
            tests = getattr(self, '_TESTS', [])
            # only_matching test cases are skipped unless explicitly requested.
            if not include_onlymatching and t.get('only_matching', False):
            t['name'] = type(self).__name__[:-len('IE')]
    def is_suitable(self, age_limit):
        """ Test whether the extractor is generally suitable for the given
        age limit (i.e. pornographic sites are not, all others usually are) """
        any_restricted = False
        for tc in self.get_testcases(include_onlymatching=False):
            # A playlist test case is judged by its first entry.
            if tc.get('playlist', []):
                tc = tc['playlist'][0]
            is_restricted = age_restricted(
                tc.get('info_dict', {}).get('age_limit'), age_limit)
            if not is_restricted:
            any_restricted = any_restricted or is_restricted
        # Suitable only when no test case was age-restricted.
        return not any_restricted
2656 def extract_subtitles(self, *args, **kwargs):
2657 if (self._downloader.params.get('writesubtitles', False) or
2658 self._downloader.params.get('listsubtitles')):
2659 return self._get_subtitles(*args, **kwargs)
2662 def _get_subtitles(self, *args, **kwargs):
2663 raise NotImplementedError('This method must be implemented by subclasses')
    def _merge_subtitle_items(subtitle_list1, subtitle_list2):
        """ Merge subtitle items for one language. Items with duplicated URLs
        will be dropped. """
        # NOTE(review): no self/cls parameter — presumably a @staticmethod
        # upstream (decorator not visible here).
        list1_urls = set([item['url'] for item in subtitle_list1])
        # Items from list1 win; list2 items with an already-seen URL are dropped.
        ret = list(subtitle_list1)
        ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
    def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
        """ Merge two subtitle dictionaries, language by language. """
        # NOTE(review): takes cls — presumably a @classmethod upstream
        # (decorator not visible here).
        ret = dict(subtitle_dict1)
        for lang in subtitle_dict2:
            ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2682 def extract_automatic_captions(self, *args, **kwargs):
2683 if (self._downloader.params.get('writeautomaticsub', False) or
2684 self._downloader.params.get('listsubtitles')):
2685 return self._get_automatic_captions(*args, **kwargs)
2688 def _get_automatic_captions(self, *args, **kwargs):
2689 raise NotImplementedError('This method must be implemented by subclasses')
2691 def mark_watched(self, *args, **kwargs):
2692 if (self._downloader.params.get('mark_watched', False) and
2693 (self._get_login_info()[0] is not None or
2694 self._downloader.params.get('cookiefile') is not None)):
2695 self._mark_watched(*args, **kwargs)
2697 def _mark_watched(self, *args, **kwargs):
2698 raise NotImplementedError('This method must be implemented by subclasses')
    def geo_verification_headers(self):
        """Return HTTP headers for geo-restriction checks: sets
        Ytdl-request-proxy when the geo_verification_proxy option is set."""
        geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
        if geo_verification_proxy:
            headers['Ytdl-request-proxy'] = geo_verification_proxy
2707 def _generic_id(self, url):
2708 return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2710 def _generic_title(self, url):
2711 return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    def _make_valid_url(cls):
        # NOTE(review): takes cls — presumably a @classmethod upstream
        # (decorator not visible here). Prefix is empty, a positive integer,
        # or the literal 'all'.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    def suitable(cls, url):
        # NOTE(review): takes cls — presumably a @classmethod upstream.
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Dispatch a search query: empty prefix → 1 result, 'all' →
        _MAX_RESULTS, otherwise the requested (capped) count."""
        mobj = re.match(self._make_valid_url(), query)
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Requests above the cap are clamped with a warning.
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    def SEARCH_KEY(self):
        # NOTE(review): simple accessor — presumably decorated with
        # @property upstream (decorator not visible here).
        return self._SEARCH_KEY