]> Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/common.py
debian/watch: Skip pgpsigurlmangle for the moment.
[youtubedl] / youtube_dl / extractor / common.py
1 from __future__ import unicode_literals
2
3 import base64
4 import datetime
5 import hashlib
6 import json
7 import netrc
8 import os
9 import re
10 import socket
11 import sys
12 import time
13 import math
14
15 from ..compat import (
16 compat_cookiejar,
17 compat_cookies,
18 compat_getpass,
19 compat_http_client,
20 compat_urllib_error,
21 compat_urllib_parse,
22 compat_urlparse,
23 compat_str,
24 compat_etree_fromstring,
25 )
26 from ..utils import (
27 NO_DEFAULT,
28 age_restricted,
29 bug_reports_message,
30 clean_html,
31 compiled_regex_type,
32 determine_ext,
33 error_to_compat_str,
34 ExtractorError,
35 fix_xml_ampersands,
36 float_or_none,
37 int_or_none,
38 parse_iso8601,
39 RegexNotFoundError,
40 sanitize_filename,
41 sanitized_Request,
42 unescapeHTML,
43 unified_strdate,
44 url_basename,
45 xpath_text,
46 xpath_with_ns,
47 determine_protocol,
48 parse_duration,
49 mimetype2ext,
50 )
51
52
53 class InfoExtractor(object):
54 """Information Extractor class.
55
56 Information extractors are the classes that, given a URL, extract
57 information about the video (or videos) the URL refers to. This
58 information includes the real video URL, the video title, author and
59 others. The information is stored in a dictionary which is then
60 passed to the YoutubeDL. The YoutubeDL processes this
61 information possibly downloading the video to the file system, among
62 other possible outcomes.
63
64 The type field determines the type of the result.
65 By far the most common value (and the default if _type is missing) is
66 "video", which indicates a single video.
67
68 For a video, the dictionaries must include the following fields:
69
70 id: Video identifier.
71 title: Video title, unescaped.
72
73 Additionally, it must contain either a formats entry or a url one:
74
75 formats: A list of dictionaries for each format available, ordered
76 from worst to best quality.
77
78 Potential fields:
79 * url Mandatory. The URL of the video file
80 * ext Will be calculated from URL if missing
81 * format A human-readable description of the format
82 ("mp4 container with h264/opus").
83 Calculated from the format_id, width, height,
84 and format_note fields if missing.
85 * format_id A short description of the format
86 ("mp4_h264_opus" or "19").
87 Technically optional, but strongly recommended.
88 * format_note Additional info about the format
89 ("3D" or "DASH video")
90 * width Width of the video, if known
91 * height Height of the video, if known
92 * resolution Textual description of width and height
93 * tbr Average bitrate of audio and video in KBit/s
94 * abr Average audio bitrate in KBit/s
95 * acodec Name of the audio codec in use
96 * asr Audio sampling rate in Hertz
97 * vbr Average video bitrate in KBit/s
98 * fps Frame rate
99 * vcodec Name of the video codec in use
100 * container Name of the container format
101 * filesize The number of bytes, if known in advance
102 * filesize_approx An estimate for the number of bytes
103 * player_url SWF Player URL (used for rtmpdump).
104 * protocol The protocol that will be used for the actual
105 download, lower-case.
106 "http", "https", "rtsp", "rtmp", "rtmpe",
107 "m3u8", or "m3u8_native".
108 * preference Order number of this format. If this field is
109 present and not None, the formats get sorted
110 by this field, regardless of all other values.
111 -1 for default (order by other properties),
112 -2 or smaller for less than default.
113 < -1000 to hide the format (if there is
114 another one which is strictly better)
115 * language Language code, e.g. "de" or "en-US".
116 * language_preference Is this in the language mentioned in
117 the URL?
118 10 if it's what the URL is about,
119 -1 for default (don't know),
120 -10 otherwise, other values reserved for now.
121 * quality Order number of the video quality of this
122 format, irrespective of the file format.
123 -1 for default (order by other properties),
124 -2 or smaller for less than default.
125 * source_preference Order number for this video source
126 (quality takes higher priority)
127 -1 for default (order by other properties),
128 -2 or smaller for less than default.
129 * http_headers A dictionary of additional HTTP headers
130 to add to the request.
131 * stretched_ratio If given and not 1, indicates that the
132 video's pixels are not square.
133 width : height ratio as float.
134 * no_resume The server does not support resuming the
135 (HTTP or RTMP) download. Boolean.
136
137 url: Final video URL.
138 ext: Video filename extension.
139 format: The video format, defaults to ext (used for --get-format)
140 player_url: SWF Player URL (used for rtmpdump).
141
142 The following fields are optional:
143
144 alt_title: A secondary title of the video.
145 display_id An alternative identifier for the video, not necessarily
146 unique, but available before title. Typically, id is
147 something like "4234987", title "Dancing naked mole rats",
148 and display_id "dancing-naked-mole-rats"
149 thumbnails: A list of dictionaries, with the following entries:
150 * "id" (optional, string) - Thumbnail format ID
151 * "url"
152 * "preference" (optional, int) - quality of the image
153 * "width" (optional, int)
154 * "height" (optional, int)
155 * "resolution" (optional, string "{width}x{height}",
156 deprecated)
157 thumbnail: Full URL to a video thumbnail image.
158 description: Full video description.
159 uploader: Full name of the video uploader.
160 creator: The main artist who created the video.
161 release_date: The date (YYYYMMDD) when the video was released.
162 timestamp: UNIX timestamp of the moment the video became available.
163 upload_date: Video upload date (YYYYMMDD).
164 If not explicitly set, calculated from timestamp.
165 uploader_id: Nickname or id of the video uploader.
166 location: Physical location where the video was filmed.
167 subtitles: The available subtitles as a dictionary in the format
168 {language: subformats}. "subformats" is a list sorted from
169 lower to higher preference, each element is a dictionary
170 with the "ext" entry and one of:
171 * "data": The subtitles file contents
172 * "url": A URL pointing to the subtitles file
173 "ext" will be calculated from URL if missing
174 automatic_captions: Like 'subtitles', used by the YoutubeIE for
175 automatically generated captions
176 duration: Length of the video in seconds, as an integer or float.
177 view_count: How many users have watched the video on the platform.
178 like_count: Number of positive ratings of the video
179 dislike_count: Number of negative ratings of the video
180 repost_count: Number of reposts of the video
181 average_rating: Average rating given by users, the scale used depends on the webpage
182 comment_count: Number of comments on the video
183 comments: A list of comments, each with one or more of the following
184 properties (all but one of text or html optional):
185 * "author" - human-readable name of the comment author
186 * "author_id" - user ID of the comment author
187 * "id" - Comment ID
188 * "html" - Comment as HTML
189 * "text" - Plain text of the comment
190 * "timestamp" - UNIX timestamp of comment
191 * "parent" - ID of the comment this one is replying to.
192 Set to "root" to indicate that this is a
193 comment to the original video.
194 age_limit: Age restriction for the video, as an integer (years)
195 webpage_url: The URL to the video webpage, if given to youtube-dl it
196 should allow to get the same result again. (It will be set
197 by YoutubeDL if it's missing)
198 categories: A list of categories that the video falls in, for example
199 ["Sports", "Berlin"]
200 tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"]
201 is_live: True, False, or None (=unknown). Whether this video is a
202 live stream that goes on instead of a fixed-length video.
203 start_time: Time in seconds where the reproduction should start, as
204 specified in the URL.
205 end_time: Time in seconds where the reproduction should end, as
206 specified in the URL.
207
208 The following fields should only be used when the video belongs to some logical
209 chapter or section:
210
211 chapter: Name or title of the chapter the video belongs to.
212 chapter_number: Number of the chapter the video belongs to, as an integer.
213 chapter_id: Id of the chapter the video belongs to, as a unicode string.
214
215 The following fields should only be used when the video is an episode of some
216 series or programme:
217
218 series: Title of the series or programme the video episode belongs to.
219 season: Title of the season the video episode belongs to.
220 season_number: Number of the season the video episode belongs to, as an integer.
221 season_id: Id of the season the video episode belongs to, as a unicode string.
222 episode: Title of the video episode. Unlike mandatory video title field,
223 this field should denote the exact title of the video episode
224 without any kind of decoration.
225 episode_number: Number of the video episode within a season, as an integer.
226 episode_id: Id of the video episode, as a unicode string.
227
228 Unless mentioned otherwise, the fields should be Unicode strings.
229
230 Unless mentioned otherwise, None is equivalent to absence of information.
231
232
233 _type "playlist" indicates multiple videos.
234 There must be a key "entries", which is a list, an iterable, or a PagedList
235 object, each element of which is a valid dictionary by this specification.
236
237 Additionally, playlists can have "title", "description" and "id" attributes
238 with the same semantics as videos (see above).
239
240
241 _type "multi_video" indicates that there are multiple videos that
242 form a single show, for example multiple acts of an opera or TV episode.
243 It must have an entries key like a playlist and contain all the keys
244 required for a video at the same time.
245
246
247 _type "url" indicates that the video must be extracted from another
248 location, possibly by a different extractor. Its only required key is:
249 "url" - the next URL to extract.
250 The key "ie_key" can be set to the class name (minus the trailing "IE",
251 e.g. "Youtube") if the extractor class is known in advance.
252 Additionally, the dictionary may have any properties of the resolved entity
253 known in advance, for example "title" if the title of the referred video is
254 known ahead of time.
255
256
257 _type "url_transparent" entities have the same specification as "url", but
258 indicate that the given additional information is more precise than the one
259 associated with the resolved URL.
260 This is useful when a site employs a video service that hosts the video and
261 its technical metadata, but that video service does not embed a useful
262 title, description etc.
263
264
265 Subclasses of this one should re-define the _real_initialize() and
266 _real_extract() methods and define a _VALID_URL regexp.
267 Probably, they should also be added to the list of extractors.
268
269 Finally, the _WORKING attribute should be set to False for broken IEs
270 in order to warn the users and skip the tests.
271 """
272
# True once initialize() has run _real_initialize() for this instance.
_ready = False
# Downloader object stored by set_downloader(); None until one is provided.
_downloader = None
# Value returned by working(); the class docstring asks broken IEs to set it
# to False.
_WORKING = True
276
277 def __init__(self, downloader=None):
278 """Constructor. Receives an optional downloader."""
279 self._ready = False
280 self.set_downloader(downloader)
281
282 @classmethod
283 def suitable(cls, url):
284 """Receives a URL and returns True if suitable for this IE."""
285
286 # This does not use has/getattr intentionally - we want to know whether
287 # we have cached the regexp for *this* class, whereas getattr would also
288 # match the superclass
289 if '_VALID_URL_RE' not in cls.__dict__:
290 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
291 return cls._VALID_URL_RE.match(url) is not None
292
293 @classmethod
294 def _match_id(cls, url):
295 if '_VALID_URL_RE' not in cls.__dict__:
296 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
297 m = cls._VALID_URL_RE.match(url)
298 assert m
299 return m.group('id')
300
301 @classmethod
302 def working(cls):
303 """Getter method for _WORKING."""
304 return cls._WORKING
305
def initialize(self):
    """Run the one-time set-up (authentication, etc) unless already done."""
    if self._ready:
        return
    self._real_initialize()
    self._ready = True
311
def extract(self, url):
    """Initialize the extractor if needed and return _real_extract(url).

    ExtractorError passes through untouched. An IncompleteRead is reported
    as an expected network error, while KeyError and StopIteration are
    wrapped as extractor errors.
    """
    try:
        self.initialize()
        return self._real_extract(url)
    except ExtractorError:
        raise
    except compat_http_client.IncompleteRead as err:
        raise ExtractorError('A network error has occurred.', cause=err, expected=True)
    except (KeyError, StopIteration) as err:
        raise ExtractorError('An extractor error has occurred.', cause=err)
323
def set_downloader(self, downloader):
    """Attach *downloader* to this IE by storing it in self._downloader."""
    setattr(self, '_downloader', downloader)
327
328 def _real_initialize(self):
329 """Real initialization process. Redefine in subclasses."""
330 pass
331
332 def _real_extract(self, url):
333 """Real extraction process. Redefine in subclasses."""
334 pass
335
@classmethod
def ie_key(cls):
    """Return the key used to look this extractor up with get_info_extractor.

    It is the class name with its last two characters dropped.
    """
    class_name = cls.__name__
    return compat_str(class_name[:-2])
340
@property
def IE_NAME(self):
    """Name of this extractor: the instance's class name minus its last two characters."""
    class_name = type(self).__name__
    return compat_str(class_name[:-2])
344
345 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
346 """ Returns the response handle """
347 if note is None:
348 self.report_download_webpage(video_id)
349 elif note is not False:
350 if video_id is None:
351 self.to_screen('%s' % (note,))
352 else:
353 self.to_screen('%s: %s' % (video_id, note))
354 try:
355 return self._downloader.urlopen(url_or_request)
356 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
357 if errnote is False:
358 return False
359 if errnote is None:
360 errnote = 'Unable to download webpage'
361
362 errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
363 if fatal:
364 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
365 else:
366 self._downloader.report_warning(errmsg)
367 return False
368
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
    """Download a page and return the tuple (page content as string, URL handle).

    Returns False when the request failed and *fatal* is false.
    """
    # A plain-string URL gets its fragment ('#...') stripped first (#1038)
    if isinstance(url_or_request, (compat_str, str)):
        url_or_request = url_or_request.split('#', 1)[0]

    urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
    if urlh is False:
        assert not fatal
        return False
    return (
        self._webpage_read_content(
            urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding),
        urlh)
381
382 @staticmethod
383 def _guess_encoding_from_content(content_type, webpage_bytes):
384 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
385 if m:
386 encoding = m.group(1)
387 else:
388 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
389 webpage_bytes[:1024])
390 if m:
391 encoding = m.group(1).decode('ascii')
392 elif webpage_bytes.startswith(b'\xff\xfe'):
393 encoding = 'utf-16'
394 else:
395 encoding = 'utf-8'
396
397 return encoding
398
def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
    """Read the body of *urlh* and return it decoded to a string.

    prefix: bytes prepended to the data read from the handle, if not None.
    encoding: encoding to decode with; when empty it is guessed from the
        Content-Type header and the page bytes.

    Depending on the downloader params, the raw bytes are also dumped
    base64-encoded to the screen ('dump_intermediate_pages') and/or written
    to a '.dump' file ('write_pages').

    Raises ExtractorError (expected=True) when the decoded page looks like
    one of the two block pages checked for at the end.
    """
    content_type = urlh.headers.get('Content-Type', '')
    webpage_bytes = urlh.read()
    if prefix is not None:
        webpage_bytes = prefix + webpage_bytes
    if not encoding:
        encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
    if self._downloader.params.get('dump_intermediate_pages', False):
        # url_or_request is either an object with get_full_url() or the URL itself
        try:
            url = url_or_request.get_full_url()
        except AttributeError:
            url = url_or_request
        self.to_screen('Dumping request to ' + url)
        dump = base64.b64encode(webpage_bytes).decode('ascii')
        self._downloader.to_screen(dump)
    if self._downloader.params.get('write_pages', False):
        try:
            url = url_or_request.get_full_url()
        except AttributeError:
            url = url_or_request
        basen = '%s_%s' % (video_id, url)
        # Cap the base name at 240 characters: cut it and end it with an md5
        # digest of the full name
        if len(basen) > 240:
            h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
            basen = basen[:240 - len(h)] + h
        raw_filename = basen + '.dump'
        filename = sanitize_filename(raw_filename, restricted=True)
        self.to_screen('Saving request to ' + filename)
        # Working around MAX_PATH limitation on Windows (see
        # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
        if os.name == 'nt':
            absfilepath = os.path.abspath(filename)
            if len(absfilepath) > 259:
                filename = '\\\\?\\' + absfilepath
        with open(filename, 'wb') as outf:
            outf.write(webpage_bytes)

    # An encoding name Python does not know raises LookupError; fall back to utf-8
    try:
        content = webpage_bytes.decode(encoding, 'replace')
    except LookupError:
        content = webpage_bytes.decode('utf-8', 'replace')

    # Recognize the Websense block page by its title plus the product name
    # near the top of the document
    if ('<title>Access to this site is blocked</title>' in content and
            'Websense' in content[:512]):
        msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
        blocked_iframe = self._html_search_regex(
            r'<iframe src="([^"]+)"', content,
            'Websense information URL', default=None)
        if blocked_iframe:
            msg += ' Visit %s for more details' % blocked_iframe
        raise ExtractorError(msg, expected=True)
    # Second block page, recognized by its title within the first 512 characters
    if '<title>The URL you requested has been blocked</title>' in content[:512]:
        msg = (
            'Access to this webpage has been blocked by Indian censorship. '
            'Use a VPN or proxy server (with --proxy) to route around it.')
        block_msg = self._html_search_regex(
            r'</h1><p>(.*?)</p>',
            content, 'block message', default=None)
        if block_msg:
            msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
        raise ExtractorError(msg, expected=True)

    return content
461
462 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
463 """ Returns the data of the page as a string """
464 success = False
465 try_count = 0
466 while success is False:
467 try:
468 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
469 success = True
470 except compat_http_client.IncompleteRead as e:
471 try_count += 1
472 if try_count >= tries:
473 raise e
474 self._sleep(timeout, video_id)
475 if res is False:
476 return res
477 else:
478 content, _ = res
479 return content
480
481 def _download_xml(self, url_or_request, video_id,
482 note='Downloading XML', errnote='Unable to download XML',
483 transform_source=None, fatal=True, encoding=None):
484 """Return the xml as an xml.etree.ElementTree.Element"""
485 xml_string = self._download_webpage(
486 url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
487 if xml_string is False:
488 return xml_string
489 if transform_source:
490 xml_string = transform_source(xml_string)
491 return compat_etree_fromstring(xml_string.encode('utf-8'))
492
493 def _download_json(self, url_or_request, video_id,
494 note='Downloading JSON metadata',
495 errnote='Unable to download JSON metadata',
496 transform_source=None,
497 fatal=True, encoding=None):
498 json_string = self._download_webpage(
499 url_or_request, video_id, note, errnote, fatal=fatal,
500 encoding=encoding)
501 if (not fatal) and json_string is False:
502 return None
503 return self._parse_json(
504 json_string, video_id, transform_source=transform_source, fatal=fatal)
505
506 def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
507 if transform_source:
508 json_string = transform_source(json_string)
509 try:
510 return json.loads(json_string)
511 except ValueError as ve:
512 errmsg = '%s: Failed to parse JSON ' % video_id
513 if fatal:
514 raise ExtractorError(errmsg, cause=ve)
515 else:
516 self.report_warning(errmsg + str(ve))
517
def report_warning(self, msg, video_id=None):
    """Forward a warning to the downloader as '[IE_NAME] <video_id>: <msg>'.

    The '<video_id>: ' part is left out when video_id is None.
    """
    if video_id is None:
        idstr = ''
    else:
        idstr = '%s: ' % video_id
    warning = '[%s] %s%s' % (self.IE_NAME, idstr, msg)
    self._downloader.report_warning(warning)
522
def to_screen(self, msg):
    """Print msg to screen through the downloader, prefixed with '[ie_name]'."""
    line = '[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(line)
526
def report_extraction(self, id_or_name):
    """Tell the user that information is being extracted for *id_or_name*."""
    message = '%s: Extracting information' % id_or_name
    self.to_screen(message)
530
def report_download_webpage(self, video_id):
    """Tell the user that the webpage for *video_id* is being downloaded."""
    message = '%s: Downloading webpage' % video_id
    self.to_screen(message)
534
def report_age_confirmation(self):
    """Tell the user that age confirmation is being attempted."""
    message = 'Confirming age'
    self.to_screen(message)
538
def report_login(self):
    """Tell the user that a login is being attempted."""
    message = 'Logging in'
    self.to_screen(message)
542
@staticmethod
def raise_login_required(msg='This video is only available for registered users'):
    """Raise an expected ExtractorError telling the user to provide credentials.

    *msg* is followed by a hint about --username/--password and --netrc.
    """
    hint = '%s. Use --username and --password or --netrc to provide account credentials.' % msg
    raise ExtractorError(hint, expected=True)
548
@staticmethod
def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
    """Raise an expected ExtractorError about a geo restriction.

    *msg* is followed by a hint about using --proxy.
    """
    hint = '%s. You might want to use --proxy to workaround.' % msg
    raise ExtractorError(hint, expected=True)
554
555 # Methods for following #608
556 @staticmethod
557 def url_result(url, ie=None, video_id=None, video_title=None):
558 """Returns a URL that points to a page that should be processed"""
559 # TODO: ie should be the class used for getting the info
560 video_info = {'_type': 'url',
561 'url': url,
562 'ie_key': ie}
563 if video_id is not None:
564 video_info['id'] = video_id
565 if video_title is not None:
566 video_info['title'] = video_title
567 return video_info
568
569 @staticmethod
570 def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
571 """Returns a playlist"""
572 video_info = {'_type': 'playlist',
573 'entries': entries}
574 if playlist_id:
575 video_info['id'] = playlist_id
576 if playlist_title:
577 video_info['title'] = playlist_title
578 if playlist_description:
579 video_info['description'] = playlist_description
580 return video_info
581
def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.

    pattern: one pattern (string or compiled regex) or an iterable of
        patterns that are tried in order until one matches.
    group: when not None, the group to return instead of the first group
        that is not None.
    """
    # Start from "no match": with an empty iterable of patterns the loop
    # below never assigns mobj, and the lookup further down would otherwise
    # fail with UnboundLocalError instead of applying default/fatal.
    mobj = None
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        mobj = re.search(pattern, string, flags)
    else:
        for p in pattern:
            mobj = re.search(p, string, flags)
            if mobj:
                break

    # The field name is colored in messages only when colors are not
    # disabled, the OS is not Windows and stderr is a terminal
    if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
        _name = '\033[0;34m%s\033[0m' % name
    else:
        _name = name

    if mobj:
        if group is None:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        else:
            return mobj.group(group)
    elif default is not NO_DEFAULT:
        return default
    elif fatal:
        raise RegexNotFoundError('Unable to extract %s' % _name)
    else:
        self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
        return None
615
def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Same as _search_regex, except that a truthy result is passed through
    clean_html() and stripped of surrounding whitespace before being returned.
    """
    found = self._search_regex(pattern, string, name, default, fatal, flags, group)
    if not found:
        return found
    return clean_html(found).strip()
625
626 def _get_login_info(self):
627 """
628 Get the login info as (username, password)
629 It will look in the netrc file using the _NETRC_MACHINE value
630 If there's no info available, return (None, None)
631 """
632 if self._downloader is None:
633 return (None, None)
634
635 username = None
636 password = None
637 downloader_params = self._downloader.params
638
639 # Attempt to use provided username and password or .netrc data
640 if downloader_params.get('username') is not None:
641 username = downloader_params['username']
642 password = downloader_params['password']
643 elif downloader_params.get('usenetrc', False):
644 try:
645 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
646 if info is not None:
647 username = info[0]
648 password = info[2]
649 else:
650 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
651 except (IOError, netrc.NetrcParseError) as err:
652 self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
653
654 return (username, password)
655
656 def _get_tfa_info(self, note='two-factor verification code'):
657 """
658 Get the two-factor authentication info
659 TODO - asking the user will be required for sms/phone verify
660 currently just uses the command line option
661 If there's no info available, return None
662 """
663 if self._downloader is None:
664 return None
665 downloader_params = self._downloader.params
666
667 if downloader_params.get('twofactor') is not None:
668 return downloader_params['twofactor']
669
670 return compat_getpass('Type %s and press [Return]: ' % note)
671
672 # Helper functions for extracting OpenGraph info
673 @staticmethod
674 def _og_regexes(prop):
675 content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
676 property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
677 % {'prop': re.escape(prop)})
678 template = r'<meta[^>]+?%s[^>]+?%s'
679 return [
680 template % (property_re, content_re),
681 template % (content_re, property_re),
682 ]
683
684 @staticmethod
685 def _meta_regex(prop):
686 return r'''(?isx)<meta
687 (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
688 [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
689
690 def _og_search_property(self, prop, html, name=None, **kargs):
691 if name is None:
692 name = 'OpenGraph %s' % prop
693 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
694 if escaped is None:
695 return None
696 return unescapeHTML(escaped)
697
698 def _og_search_thumbnail(self, html, **kargs):
699 return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
700
701 def _og_search_description(self, html, **kargs):
702 return self._og_search_property('description', html, fatal=False, **kargs)
703
704 def _og_search_title(self, html, **kargs):
705 return self._og_search_property('title', html, **kargs)
706
707 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
708 regexes = self._og_regexes('video') + self._og_regexes('video:url')
709 if secure:
710 regexes = self._og_regexes('video:secure_url') + regexes
711 return self._html_search_regex(regexes, html, name, **kargs)
712
713 def _og_search_url(self, html, **kargs):
714 return self._og_search_property('url', html, **kargs)
715
716 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
717 if display_name is None:
718 display_name = name
719 return self._html_search_regex(
720 self._meta_regex(name),
721 html, display_name, fatal=fatal, group='content', **kwargs)
722
723 def _dc_search_uploader(self, html):
724 return self._html_search_meta('dc.creator', html, 'uploader')
725
726 def _rta_search(self, html):
727 # See http://www.rtalabel.org/index.php?content=howtofaq#single
728 if re.search(r'(?ix)<meta\s+name="rating"\s+'
729 r' content="RTA-5042-1996-1400-1577-RTA"',
730 html):
731 return 18
732 return 0
733
734 def _media_rating_search(self, html):
735 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
736 rating = self._html_search_meta('rating', html)
737
738 if not rating:
739 return None
740
741 RATING_TABLE = {
742 'safe for kids': 0,
743 'general': 8,
744 '14 years': 14,
745 'mature': 17,
746 'restricted': 19,
747 }
748 return RATING_TABLE.get(rating.lower())
749
750 def _family_friendly_search(self, html):
751 # See http://schema.org/VideoObject
752 family_friendly = self._html_search_meta('isFamilyFriendly', html)
753
754 if not family_friendly:
755 return None
756
757 RATING_TABLE = {
758 '1': 0,
759 'true': 0,
760 '0': 18,
761 'false': 18,
762 }
763 return RATING_TABLE.get(family_friendly.lower())
764
765 def _twitter_search_player(self, html):
766 return self._html_search_meta('twitter:player', html,
767 'twitter card player')
768
769 def _search_json_ld(self, html, video_id, **kwargs):
770 json_ld = self._search_regex(
771 r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
772 html, 'JSON-LD', group='json_ld', **kwargs)
773 if not json_ld:
774 return {}
775 return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
776
def _json_ld(self, json_ld, video_id, fatal=True):
    """Turn a JSON-LD document into a dictionary of info fields.

    json_ld: the document, either already decoded or as a string, which is
        then parsed with _parse_json (honoring *fatal*).

    Only documents whose '@context' is 'http://schema.org' and whose '@type'
    is 'TVEpisode' or 'Article' yield fields. Fields whose value is None are
    dropped. Returns {} for empty documents and for documents that are not
    JSON objects.
    """
    if isinstance(json_ld, compat_str):
        json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
    # Valid JSON-LD may also decode to a list (or a scalar); only a dict
    # offers the .get() lookups used below, so anything else yields no info
    # rather than an AttributeError.
    if not json_ld or not isinstance(json_ld, dict):
        return {}
    info = {}
    if json_ld.get('@context') == 'http://schema.org':
        item_type = json_ld.get('@type')
        if item_type == 'TVEpisode':
            info.update({
                'episode': unescapeHTML(json_ld.get('name')),
                'episode_number': int_or_none(json_ld.get('episodeNumber')),
                'description': unescapeHTML(json_ld.get('description')),
            })
            part_of_season = json_ld.get('partOfSeason')
            if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
                info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
            part_of_series = json_ld.get('partOfSeries')
            if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
                info['series'] = unescapeHTML(part_of_series.get('name'))
        elif item_type == 'Article':
            info.update({
                'timestamp': parse_iso8601(json_ld.get('datePublished')),
                'title': unescapeHTML(json_ld.get('headline')),
                'description': unescapeHTML(json_ld.get('articleBody')),
            })
    return dict((k, v) for k, v in info.items() if v is not None)
804
805 @staticmethod
806 def _hidden_inputs(html):
807 html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
808 hidden_inputs = {}
809 for input in re.findall(r'(?i)<input([^>]+)>', html):
810 if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
811 continue
812 name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
813 if not name:
814 continue
815 value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
816 if not value:
817 continue
818 hidden_inputs[name.group('value')] = value.group('value')
819 return hidden_inputs
820
821 def _form_hidden_inputs(self, form_id, html):
822 form = self._search_regex(
823 r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
824 html, '%s form' % form_id, group='form')
825 return self._hidden_inputs(form)
826
def _sort_formats(self, formats, field_preference=None):
    """Sort *formats* in place in ascending order of the key computed below.

    field_preference: optional list/tuple of field names; when given, the
        sort key is just the tuple of those fields (None replaced by -1).

    Formats lacking 'tbr' get it filled in from abr + vbr when both are
    known, and formats with an empty/missing 'ext' but a 'url' get 'ext'
    filled in from the URL.

    Raises ExtractorError when *formats* is empty.
    """
    if not formats:
        raise ExtractorError('No video formats found')

    for f in formats:
        # Automatically determine tbr when missing based on abr and vbr (improves
        # formats sorting in some cases)
        if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
            f['tbr'] = f['abr'] + f['vbr']

    def _formats_key(f):
        # TODO remove the following workaround
        from ..utils import determine_ext
        if not f.get('ext') and 'url' in f:
            f['ext'] = determine_ext(f['url'])

        # An explicit field list replaces the default key altogether
        if isinstance(field_preference, (list, tuple)):
            return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)

        preference = f.get('preference')
        if preference is None:
            preference = 0
            if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                preference -= 0.5

        # http/https rank slightly above any other protocol
        proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1

        # The position in ORDER is the extension preference (later = better,
        # -1 when the extension is not listed)
        if f.get('vcodec') == 'none':  # audio only
            if self._downloader.params.get('prefer_free_formats'):
                ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
            else:
                ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
            ext_preference = 0
            try:
                audio_ext_preference = ORDER.index(f['ext'])
            except ValueError:
                audio_ext_preference = -1
        else:
            if self._downloader.params.get('prefer_free_formats'):
                ORDER = ['flv', 'mp4', 'webm']
            else:
                ORDER = ['webm', 'flv', 'mp4']
            try:
                ext_preference = ORDER.index(f['ext'])
            except ValueError:
                ext_preference = -1
            audio_ext_preference = 0

        # Tuples compare element by element, so earlier entries take
        # precedence; unknown (None) numeric fields count as -1
        return (
            preference,
            f.get('language_preference') if f.get('language_preference') is not None else -1,
            f.get('quality') if f.get('quality') is not None else -1,
            f.get('tbr') if f.get('tbr') is not None else -1,
            f.get('filesize') if f.get('filesize') is not None else -1,
            f.get('vbr') if f.get('vbr') is not None else -1,
            f.get('height') if f.get('height') is not None else -1,
            f.get('width') if f.get('width') is not None else -1,
            proto_preference,
            ext_preference,
            f.get('abr') if f.get('abr') is not None else -1,
            audio_ext_preference,
            f.get('fps') if f.get('fps') is not None else -1,
            f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
            f.get('source_preference') if f.get('source_preference') is not None else -1,
            f.get('format_id') if f.get('format_id') is not None else '',
        )
    formats.sort(key=_formats_key)
894
895 def _check_formats(self, formats, video_id):
896 if formats:
897 formats[:] = filter(
898 lambda f: self._is_valid_url(
899 f['url'], video_id,
900 item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
901 formats)
902
903 @staticmethod
904 def _remove_duplicate_formats(formats):
905 format_urls = set()
906 unique_formats = []
907 for f in formats:
908 if f['url'] not in format_urls:
909 format_urls.add(f['url'])
910 unique_formats.append(f)
911 formats[:] = unique_formats
912
913 def _is_valid_url(self, url, video_id, item='video'):
914 url = self._proto_relative_url(url, scheme='http:')
915 # For now assume non HTTP(S) URLs always valid
916 if not (url.startswith('http://') or url.startswith('https://')):
917 return True
918 try:
919 self._request_webpage(url, video_id, 'Checking %s URL' % item)
920 return True
921 except ExtractorError as e:
922 if isinstance(e.cause, compat_urllib_error.URLError):
923 self.to_screen(
924 '%s: %s URL is invalid, skipping' % (video_id, item))
925 return False
926 raise
927
def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    if self._downloader.params.get('prefer_insecure', False):
        return 'http:'
    return 'https:'
934
935 def _proto_relative_url(self, url, scheme=None):
936 if url is None:
937 return url
938 if url.startswith('//'):
939 if scheme is None:
940 scheme = self.http_scheme()
941 return scheme + url
942 else:
943 return url
944
945 def _sleep(self, timeout, video_id, msg_template=None):
946 if msg_template is None:
947 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
948 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
949 self.to_screen(msg)
950 time.sleep(timeout)
951
def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
                         transform_source=lambda s: fix_xml_ampersands(s).strip(),
                         fatal=True):
    """Download an f4m manifest and return the sorted formats it lists.

    manifest_url:     URL of the manifest.
    video_id:         id passed to the download helper.
    preference:       value stored as 'preference' in every format.
    f4m_id:           optional prefix for the generated format ids.
    transform_source: callable applied to the downloaded manifest text
                      before it is parsed; also used for nested manifests.
    fatal:            passed to the download helper; when the download
                      returns False an empty list is returned.

    Media entries of a 2.0 manifest that point to another f4m manifest are
    extracted recursively.
    """
    manifest = self._download_xml(
        manifest_url, video_id, 'Downloading f4m manifest',
        'Unable to download f4m manifest',
        # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
        # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
        transform_source=transform_source,
        fatal=fatal)

    if manifest is False:
        return []

    formats = []
    manifest_version = '1.0'
    media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
    if not media_nodes:
        manifest_version = '2.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
    base_url = xpath_text(
        manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
        'base URL', default=None)
    if base_url:
        base_url = base_url.strip()
    # Relative media URLs are always resolved against the manifest itself
    # (or its baseURL), never against a previously processed media URL
    manifest_dir = '/'.join(manifest_url.split('/')[:-1])
    for i, media_el in enumerate(media_nodes):
        format_url = manifest_url
        if manifest_version == '2.0':
            media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
            if not media_url:
                continue
            if media_url.startswith(('http://', 'https://')):
                format_url = media_url
            else:
                format_url = (base_url or manifest_dir) + '/' + media_url
            # If media_url is itself a f4m manifest do the recursive extraction
            # since bitrates in parent manifest (this one) and media_url manifest
            # may differ leading to inability to resolve the format by requested
            # bitrate in f4m downloader
            if determine_ext(format_url) == 'f4m':
                formats.extend(self._extract_f4m_formats(
                    format_url, video_id, preference, f4m_id,
                    transform_source=transform_source, fatal=fatal))
                continue
        tbr = int_or_none(media_el.attrib.get('bitrate'))
        formats.append({
            'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
            'url': format_url,
            'ext': 'flv',
            'tbr': tbr,
            'width': int_or_none(media_el.attrib.get('width')),
            'height': int_or_none(media_el.attrib.get('height')),
            'preference': preference,
        })
    self._sort_formats(formats)

    return formats
1006
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                          entry_protocol='m3u8', preference=None,
                          m3u8_id=None, note=None, errnote=None,
                          fatal=True):
    """Download an m3u8 playlist and return the formats it describes.

    Returns [] when the download helper returns False. A media playlist
    (one containing #EXT-X-TARGETDURATION) yields a single format that
    points at the playlist itself. Otherwise the result starts with a
    'meta' format for the playlist URL followed by one format per URI
    line, and the list is sorted with self._sort_formats.
    """
    meta_format = {
        'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
        'url': m3u8_url,
        'ext': ext,
        'protocol': 'm3u8',
        'preference': preference - 1 if preference else -1,
        'resolution': 'multiple',
        'format_note': 'Quality selection URL',
    }
    formats = [meta_format]

    def format_url(u):
        # m3u8_url is read at call time, i.e. after it has been replaced
        # by the URL reported by the response handle below
        if re.match(r'^https?://', u):
            return u
        return compat_urlparse.urljoin(m3u8_url, u)

    res = self._download_webpage_handle(
        m3u8_url, video_id,
        note=note or 'Downloading m3u8 information',
        errnote=errnote or 'Failed to download m3u8 information',
        fatal=fatal)
    if res is False:
        return []
    m3u8_doc, urlh = res
    m3u8_url = urlh.geturl()
    # A Media Playlist Tag MUST NOT appear in a Master Playlist
    # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
    # The EXT-X-TARGETDURATION tag is REQUIRED for every M3U8 Media Playlists
    # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
    if '#EXT-X-TARGETDURATION' in m3u8_doc:
        return [{
            'url': m3u8_url,
            'format_id': m3u8_id,
            'ext': ext,
            'protocol': entry_protocol,
            'preference': preference,
        }]

    kv_rex = re.compile(
        r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')

    def parse_attributes(tag_line):
        # KEY=VALUE pairs of a tag line; surrounding double quotes are dropped
        attributes = {}
        for m in kv_rex.finditer(tag_line):
            val = m.group('val')
            if val.startswith('"'):
                val = val[1:-1]
            attributes[m.group('key')] = val
        return attributes

    last_info = None
    last_media = None
    for line in m3u8_doc.splitlines():
        if line.startswith('#EXT-X-STREAM-INF:'):
            last_info = parse_attributes(line)
            continue
        if line.startswith('#EXT-X-MEDIA:'):
            last_media = parse_attributes(line)
            continue
        if line.startswith('#') or not line.strip():
            continue

        # Anything else is a URI line
        if last_info is None:
            formats.append({'url': format_url(line)})
            continue
        tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
        id_parts = [m3u8_id] if m3u8_id else []
        media_name = None
        if last_media and last_media.get('TYPE') != 'SUBTITLES':
            media_name = last_media.get('NAME')
        id_parts.append(media_name if media_name else '%d' % (tbr if tbr else len(formats)))
        f = {
            'format_id': '-'.join(id_parts),
            'url': format_url(line.strip()),
            'tbr': tbr,
            'ext': ext,
            'protocol': entry_protocol,
            'preference': preference,
        }
        codecs = last_info.get('CODECS')
        if codecs:
            # TODO: looks like video codec is not always necessarily goes first
            va_codecs = codecs.split(',')
            if va_codecs[0]:
                f['vcodec'] = va_codecs[0]
            if len(va_codecs) > 1 and va_codecs[1]:
                f['acodec'] = va_codecs[1]
        resolution = last_info.get('RESOLUTION')
        if resolution:
            width_str, height_str = resolution.split('x')
            f['width'] = int(width_str)
            f['height'] = int(height_str)
        if last_media is not None:
            f['m3u8_media'] = last_media
            last_media = None
        formats.append(f)
        last_info = {}
    self._sort_formats(formats)
    return formats
1107
1108 @staticmethod
1109 def _xpath_ns(path, namespace=None):
1110 if not namespace:
1111 return path
1112 out = []
1113 for c in path.split('/'):
1114 if not c or c == '.':
1115 out.append(c)
1116 else:
1117 out.append('{%s}%s' % (namespace, c))
1118 return '/'.join(out)
1119
def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
    """Download the SMIL document at smil_url and return its formats.

    Returns [] when the download helper returns False (which is only
    expected with fatal=False).
    """
    smil = self._download_smil(smil_url, video_id, fatal=fatal)
    if smil is False:
        assert not fatal
        return []

    return self._parse_smil_formats(
        smil, smil_url, video_id,
        namespace=self._parse_smil_namespace(smil),
        f4m_params=f4m_params)
1131
def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
    """Download the SMIL document at smil_url and return the info dict
    built by self._parse_smil, or {} when the download returns False."""
    smil = self._download_smil(smil_url, video_id, fatal=fatal)
    if smil is not False:
        return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
    return {}
1137
def _download_smil(self, smil_url, video_id, fatal=True):
    """Fetch smil_url through self._download_xml and return its result."""
    note = 'Downloading SMIL file'
    errnote = 'Unable to download SMIL file'
    return self._download_xml(smil_url, video_id, note, errnote, fatal=fatal)
1142
def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
    """Build an info dict from a parsed SMIL document.

    The video_id argument is only used while extracting the formats; the
    'id' of the result is the base name of smil_url without extension.
    Title, description and upload date come from the first matching
    ./head/meta elements; the title falls back to the id.
    """
    namespace = self._parse_smil_namespace(smil)

    formats = self._parse_smil_formats(
        smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
    subtitles = self._parse_smil_subtitles(smil, namespace=namespace)

    video_id = os.path.splitext(url_basename(smil_url))[0]

    title = description = upload_date = None
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        name = meta.attrib.get('name')
        content = meta.attrib.get('content')
        if not name or not content:
            continue
        # The first non-empty value of each kind wins
        if name == 'title':
            if not title:
                title = content
        elif name in ('description', 'abstract'):
            if not description:
                description = content
        elif name == 'date':
            if not upload_date:
                upload_date = unified_strdate(content)

    thumbnails = []
    for image in smil.findall(self._xpath_ns('.//image', namespace)):
        if not image.get('src'):
            continue
        thumbnails.append({
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        })

    return {
        'id': video_id,
        'title': title or video_id,
        'description': description,
        'upload_date': upload_date,
        'thumbnails': thumbnails,
        'formats': formats,
        'subtitles': subtitles,
    }
1182
def _parse_smil_namespace(self, smil):
    """Return the namespace part of the root element's '{namespace}smil'
    tag, or None when the tag does not have that form."""
    root_tag = smil.tag
    return self._search_regex(
        r'(?i)^{([^}]+)?}smil$', root_tag, 'namespace', default=None)
1186
def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
    """Return the sorted formats described by the <video> elements of a
    parsed SMIL document.

    smil_url:           URL of the document; used as base for relative
                        sources unless a ./head/meta element provides a
                        'base' or 'httpBase' attribute.
    f4m_params:         query parameters appended to f4m manifest URLs;
                        a flowplayer default is used when empty.
    transform_rtmp_url: optional callable (streamer, src) -> (url,
                        play_path) applied to RTMP formats.

    Each distinct 'src' is handled once. RTMP sources become a single
    format, m3u8 and f4m sources are expanded through the respective
    extraction helpers, and plain HTTP sources are kept only if the
    resolved URL passes self._is_valid_url.
    """
    base = smil_url
    for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
        b = meta.get('base') or meta.get('httpBase')
        if b:
            base = b
            break

    formats = []
    rtmp_count = 0
    http_count = 0
    m3u8_count = 0

    seen_srcs = set()
    for video in smil.findall(self._xpath_ns('.//video', namespace)):
        src = video.get('src')
        if not src or src in seen_srcs:
            continue
        seen_srcs.add(src)

        bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
        filesize = int_or_none(video.get('size') or video.get('fileSize'))
        width = int_or_none(video.get('width'))
        height = int_or_none(video.get('height'))
        proto = video.get('proto')
        ext = video.get('ext')
        src_ext = determine_ext(src)
        streamer = video.get('streamer') or base

        if proto == 'rtmp' or streamer.startswith('rtmp'):
            rtmp_count += 1
            formats.append({
                'url': streamer,
                'play_path': src,
                'ext': 'flv',
                'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })
            if transform_rtmp_url:
                streamer, src = transform_rtmp_url(streamer, src)
                formats[-1].update({
                    'url': streamer,
                    'play_path': src,
                })
            continue

        src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
        src_url = src_url.strip()

        if proto == 'm3u8' or src_ext == 'm3u8':
            m3u8_formats = self._extract_m3u8_formats(
                src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
            if len(m3u8_formats) == 1:
                # A lone format gets the metadata of the <video> element
                m3u8_count += 1
                m3u8_formats[0].update({
                    'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'width': width,
                    'height': height,
                })
            formats.extend(m3u8_formats)
            continue

        if src_ext == 'f4m':
            f4m_url = src_url
            if not f4m_params:
                f4m_params = {
                    'hdcore': '3.2.0',
                    'plugin': 'flowplayer-3.2.0.1',
                }
            f4m_url += '&' if '?' in f4m_url else '?'
            f4m_url += compat_urllib_parse.urlencode(f4m_params)
            formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
            continue

        # Validate the resolved URL: a relative src is not an HTTP(S) URL,
        # so checking src itself would never actually test it
        if src_url.startswith('http') and self._is_valid_url(src_url, video_id):
            http_count += 1
            formats.append({
                'url': src_url,
                'ext': ext or src_ext or 'flv',
                'format_id': 'http-%d' % (bitrate or http_count),
                'tbr': bitrate,
                'filesize': filesize,
                'width': width,
                'height': height,
            })

    self._sort_formats(formats)

    return formats
1282
1283 def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1284 urls = []
1285 subtitles = {}
1286 for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1287 src = textstream.get('src')
1288 if not src or src in urls:
1289 continue
1290 urls.append(src)
1291 ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type'))
1292 lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1293 subtitles.setdefault(lang, []).append({
1294 'url': src,
1295 'ext': ext,
1296 })
1297 return subtitles
1298
def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
    """Download the XSPF playlist at playlist_url and return the entries
    produced by self._parse_xspf, or [] when the download returns False."""
    xspf = self._download_xml(
        playlist_url, playlist_id, 'Downloading xspf playlist',
        'Unable to download xspf manifest', fatal=fatal)
    if xspf is False:
        return []
    return self._parse_xspf(xspf, playlist_id)
1306
def _parse_xspf(self, playlist, playlist_id):
    """Return one entry dict per track of a parsed XSPF playlist.

    Every entry uses playlist_id as its 'id'; the title falls back to
    playlist_id as well. The formats of a track come from its <location>
    elements and are sorted with self._sort_formats.
    """
    NS_MAP = {
        'xspf': 'http://xspf.org/ns/0/',
        's1': 'http://static.streamone.nl/player/ns/0',
    }

    def ns(path):
        return xpath_with_ns(path, NS_MAP)

    entries = []
    for track in playlist.findall(ns('./xspf:trackList/xspf:track')):
        title = xpath_text(track, ns('./xspf:title'), 'title', default=playlist_id)
        description = xpath_text(track, ns('./xspf:annotation'), 'description')
        thumbnail = xpath_text(track, ns('./xspf:image'), 'thumbnail')
        duration_text = xpath_text(track, ns('./xspf:duration'), 'duration')
        duration = float_or_none(duration_text, 1000)

        formats = []
        for location in track.findall(ns('./xspf:location')):
            formats.append({
                'url': location.text,
                'format_id': location.get(ns('s1:label')),
                'width': int_or_none(location.get(ns('s1:width'))),
                'height': int_or_none(location.get(ns('s1:height'))),
            })
        self._sort_formats(formats)

        entries.append({
            'id': playlist_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
        })
    return entries
1341
def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
    """Download an MPD manifest and return the formats found by
    self._parse_mpd_formats, or [] when the download returns False.

    The base URL handed to the parser is the final URL of the response up
    to and including its last '/'.
    """
    res = self._download_webpage_handle(
        mpd_url, video_id,
        note=note or 'Downloading MPD manifest',
        errnote=errnote or 'Failed to download MPD manifest',
        fatal=fatal)
    if res is False:
        return []
    mpd, urlh = res
    mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
    mpd_doc = compat_etree_fromstring(mpd.encode('utf-8'))

    return self._parse_mpd_formats(
        mpd_doc, mpd_id, mpd_base_url, formats_dict=formats_dict)
1355
def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict=None):
    """Return the sorted formats described by a parsed MPD document.

    mpd_doc:      root element of the manifest.
    mpd_id:       optional prefix for the generated format ids.
    mpd_base_url: prepended to base URLs that are not absolute HTTP(S)
                  URLs.
    formats_dict: optional dict mapping representation ids to format
                  dicts used as a starting point for new formats.

    Manifests of type 'dynamic' yield []. DRM protected adaptation sets
    and representations are skipped; text representations are ignored.
    """
    if formats_dict is None:
        formats_dict = {}

    if mpd_doc.get('type') == 'dynamic':
        return []

    namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)

    def _add_ns(path):
        return self._xpath_ns(path, namespace)

    def is_drm_protected(element):
        return element.find(_add_ns('ContentProtection')) is not None

    def extract_multisegment_info(element, ms_parent_info):
        # Copy of the parent's segment info, overridden by whatever the
        # element's own SegmentList or SegmentTemplate child specifies
        ms_info = ms_parent_info.copy()
        segment_list = element.find(_add_ns('SegmentList'))
        if segment_list is not None:
            segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
            if segment_urls_e:
                ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
            initialization = segment_list.find(_add_ns('Initialization'))
            if initialization is not None:
                ms_info['initialization_url'] = initialization.attrib['sourceURL']
            return ms_info

        segment_template = element.find(_add_ns('SegmentTemplate'))
        if segment_template is None:
            return ms_info

        start_number = segment_template.get('startNumber')
        if start_number:
            ms_info['start_number'] = int(start_number)
        segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
        if segment_timeline is not None:
            s_e = segment_timeline.findall(_add_ns('S'))
            if s_e:
                # Every S element stands for itself plus 'r' repetitions
                ms_info['total_number'] = sum(1 + int(s.get('r', '0')) for s in s_e)
        else:
            timescale = segment_template.get('timescale')
            if timescale:
                ms_info['timescale'] = int(timescale)
            segment_duration = segment_template.get('duration')
            if segment_duration:
                ms_info['segment_duration'] = int(segment_duration)
        media_template = segment_template.get('media')
        if media_template:
            ms_info['media_template'] = media_template
        initialization = segment_template.get('initialization')
        if initialization:
            ms_info['initialization_url'] = initialization
        else:
            initialization = segment_template.find(_add_ns('Initialization'))
            if initialization is not None:
                ms_info['initialization_url'] = initialization.attrib['sourceURL']
        return ms_info

    def _identifier_to_printf(mobj):
        # $Number$ -> %(Number)d, $Number%05d$ -> %(Number)05d. A callable
        # is used because the width group is optional and referencing an
        # unmatched group from a replacement template is an error on
        # Python versions before 3.5
        return '%%(%s)%sd' % (mobj.group(1), mobj.group(2) or '')

    def expand_media_template(ms_info, representation_id, bandwidth):
        # Segment URLs for start_number .. start_number + total_number - 1
        media_template = ms_info['media_template'].replace('$RepresentationID$', representation_id)
        media_template = re.sub(
            r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', _identifier_to_printf, media_template)
        media_template = media_template.replace('$$', '$')
        first_number = ms_info['start_number']
        return [
            media_template % {'Number': segment_number, 'Bandwidth': bandwidth}
            for segment_number in range(first_number, first_number + ms_info['total_number'])]

    mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
    formats = []
    for period in mpd_doc.findall(_add_ns('Period')):
        period_duration = parse_duration(period.get('duration')) or mpd_duration
        period_ms_info = extract_multisegment_info(period, {
            'start_number': 1,
            'timescale': 1,
        })
        for adaptation_set in period.findall(_add_ns('AdaptationSet')):
            if is_drm_protected(adaptation_set):
                continue
            adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
            for representation in adaptation_set.findall(_add_ns('Representation')):
                if is_drm_protected(representation):
                    continue
                # Representation attributes override those of the adaptation set
                representation_attrib = adaptation_set.attrib.copy()
                representation_attrib.update(representation.attrib)
                mime_type = representation_attrib.get('mimeType')
                content_type = mime_type.split('/')[0] if mime_type else representation_attrib.get('contentType')
                if content_type == 'text':
                    # TODO implement WebVTT downloading
                    continue
                if content_type not in ('video', 'audio'):
                    self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
                    continue

                # Concatenate BaseURL elements from the innermost element
                # outwards until an absolute URL is obtained
                base_url = ''
                for element in (representation, adaptation_set, period, mpd_doc):
                    base_url_e = element.find(_add_ns('BaseURL'))
                    if base_url_e is not None:
                        base_url = base_url_e.text + base_url
                        if re.match(r'^https?://', base_url):
                            break
                if mpd_base_url and not re.match(r'^https?://', base_url):
                    if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
                        mpd_base_url += '/'
                    base_url = mpd_base_url + base_url
                representation_id = representation_attrib.get('id')
                lang = representation_attrib.get('lang')
                url_el = representation.find(_add_ns('BaseURL'))
                filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                f = {
                    'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
                    'url': base_url,
                    'width': int_or_none(representation_attrib.get('width')),
                    'height': int_or_none(representation_attrib.get('height')),
                    'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
                    'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                    'fps': int_or_none(representation_attrib.get('frameRate')),
                    'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
                    'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
                    'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                    'format_note': 'DASH %s' % content_type,
                    'filesize': filesize,
                }
                representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
                if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
                    if ('total_number' not in representation_ms_info and
                            'segment_duration' in representation_ms_info and
                            period_duration):
                        segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
                        representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                    # Without a known segment count the template cannot be expanded
                    if 'total_number' in representation_ms_info:
                        representation_ms_info['segment_urls'] = expand_media_template(
                            representation_ms_info, representation_id,
                            int_or_none(representation_attrib.get('bandwidth')))
                if 'segment_urls' in representation_ms_info:
                    f.update({
                        'segment_urls': representation_ms_info['segment_urls'],
                        'protocol': 'http_dash_segments',
                    })
                    if 'initialization_url' in representation_ms_info:
                        initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
                        f.update({
                            'initialization_url': initialization_url,
                        })
                        if not f.get('url'):
                            f['url'] = initialization_url
                # NOTE(review): existing formats are matched on the bare
                # representation id even though format ids are prefixed
                # when mpd_id is given - confirm this is intended
                try:
                    existing_format = next(
                        fo for fo in formats
                        if fo['format_id'] == representation_id)
                except StopIteration:
                    full_info = formats_dict.get(representation_id, {}).copy()
                    full_info.update(f)
                    formats.append(full_info)
                else:
                    existing_format.update(f)
    self._sort_formats(formats)
    return formats
1498
1499 def _live_title(self, name):
1500 """ Generate the title for a live video """
1501 now = datetime.datetime.now()
1502 now_str = now.strftime('%Y-%m-%d %H:%M')
1503 return name + ' ' + now_str
1504
def _int(self, v, name, fatal=False, **kwargs):
    """Convert v with int_or_none(v, **kwargs) and report failures.

    When the conversion yields None, an ExtractorError is raised if fatal
    is true; otherwise a warning is reported through the downloader and
    None is returned. name is only used in the message.
    """
    res = int_or_none(v, **kwargs)
    if res is None:
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
        if fatal:
            raise ExtractorError(msg)
        self._downloader.report_warning(msg)
    return res
1516
def _float(self, v, name, fatal=False, **kwargs):
    """Convert v with float_or_none(v, **kwargs) and report failures.

    When the conversion yields None, an ExtractorError is raised if fatal
    is true; otherwise a warning is reported through the downloader and
    None is returned. name is only used in the message.
    """
    res = float_or_none(v, **kwargs)
    if res is not None:
        return res
    msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
    if fatal:
        raise ExtractorError(msg)
    self._downloader.report_warning(msg)
    return res
1526
1527 def _set_cookie(self, domain, name, value, expire_time=None):
1528 cookie = compat_cookiejar.Cookie(
1529 0, name, value, None, None, domain, None,
1530 None, '/', True, False, expire_time, '', None, None, None)
1531 self._downloader.cookiejar.set_cookie(cookie)
1532
def _get_cookies(self, url):
    """ Return a compat_cookies.SimpleCookie with the cookies for the url """
    request = sanitized_Request(url)
    # Let the cookie jar fill in the Cookie header for this URL
    self._downloader.cookiejar.add_cookie_header(request)
    cookie_header = request.get_header('Cookie')
    return compat_cookies.SimpleCookie(cookie_header)
1538
def get_testcases(self, include_onlymatching=False):
    """Yield the test case dicts declared in _TEST or _TESTS.

    Test cases flagged 'only_matching' are skipped unless
    include_onlymatching is true. Every yielded dict gets a 'name' key set
    to the class name without its trailing two characters ('IE').
    """
    single_test = getattr(self, '_TEST', None)
    if single_test:
        assert not hasattr(self, '_TESTS'), \
            '%s has _TEST and _TESTS' % type(self).__name__
        tests = [single_test]
    else:
        tests = getattr(self, '_TESTS', [])
    for test in tests:
        if not include_onlymatching and test.get('only_matching', False):
            continue
        test['name'] = type(self).__name__[:-len('IE')]
        yield test
1552
def is_suitable(self, age_limit):
    """ Test whether the extractor is generally suitable for the given
    age limit (i.e. pornographic sites are not, all others usually are) """

    saw_restricted = False
    for tc in self.get_testcases(include_onlymatching=False):
        # For playlist tests the first playlist entry is inspected
        case = tc['playlist'][0] if 'playlist' in tc else tc
        case_age_limit = case.get('info_dict', {}).get('age_limit')
        if not age_restricted(case_age_limit, age_limit):
            return True
        saw_restricted = True
    return not saw_restricted
1567
def extract_subtitles(self, *args, **kwargs):
    """Return self._get_subtitles(*args, **kwargs) if the 'writesubtitles'
    or 'listsubtitles' option is set, otherwise an empty dict."""
    params = self._downloader.params
    if params.get('writesubtitles', False) or params.get('listsubtitles'):
        return self._get_subtitles(*args, **kwargs)
    return {}
1573
1574 def _get_subtitles(self, *args, **kwargs):
1575 raise NotImplementedError('This method must be implemented by subclasses')
1576
1577 @staticmethod
1578 def _merge_subtitle_items(subtitle_list1, subtitle_list2):
1579 """ Merge subtitle items for one language. Items with duplicated URLs
1580 will be dropped. """
1581 list1_urls = set([item['url'] for item in subtitle_list1])
1582 ret = list(subtitle_list1)
1583 ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
1584 return ret
1585
@classmethod
def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
    """ Merge two subtitle dictionaries, language by language. """
    merged = dict(subtitle_dict1)
    for lang, items in subtitle_dict2.items():
        merged[lang] = cls._merge_subtitle_items(
            subtitle_dict1.get(lang, []), items)
    return merged
1593
def extract_automatic_captions(self, *args, **kwargs):
    """Return self._get_automatic_captions(*args, **kwargs) if the
    'writeautomaticsub' or 'listsubtitles' option is set, otherwise an
    empty dict."""
    params = self._downloader.params
    if params.get('writeautomaticsub', False) or params.get('listsubtitles'):
        return self._get_automatic_captions(*args, **kwargs)
    return {}
1599
1600 def _get_automatic_captions(self, *args, **kwargs):
1601 raise NotImplementedError('This method must be implemented by subclasses')
1602
1603
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regular expression matched against search queries."""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query for this extractor."""
        pattern = cls._make_valid_url()
        return re.match(pattern, url) is not None

    def _real_extract(self, query):
        """Work out how many results are requested and fetch them.

        An empty prefix requests one result, 'all' requests _MAX_RESULTS
        and a number requests that many, capped at _MAX_RESULTS with a
        warning. Raises ExtractorError for queries that do not match.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            n = 1
        elif prefix == 'all':
            n = self._MAX_RESULTS
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            if n > self._MAX_RESULTS:
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        """The search key (_SEARCH_KEY) of this extractor."""
        return self._SEARCH_KEY