10 import xml
.etree
.ElementTree
15 compat_urllib_parse_urlparse
,
26 _NO_DEFAULT
= object()
29 class InfoExtractor(object):
30 """Information Extractor class.
32 Information extractors are the classes that, given a URL, extract
33 information about the video (or videos) the URL refers to. This
34 information includes the real video URL, the video title, author and
35 others. The information is stored in a dictionary which is then
36 passed to the FileDownloader. The FileDownloader processes this
37 information possibly downloading the video to the file system, among
38 other possible outcomes.
40 The dictionaries must include the following fields:
43 title: Video title, unescaped.
45 Additionally, it must contain either a formats entry or a url one:
47 formats: A list of dictionaries for each format available, ordered
48 from worst to best quality.
51 * url Mandatory. The URL of the video file
52 * ext Will be calculated from url if missing
53 * format A human-readable description of the format
54 ("mp4 container with h264/opus").
Calculated from the format_id, width, height,
and format_note fields if missing.
57 * format_id A short description of the format
58 ("mp4_h264_opus" or "19").
59 Technically optional, but strongly recommended.
60 * format_note Additional info about the format
61 ("3D" or "DASH video")
62 * width Width of the video, if known
63 * height Height of the video, if known
64 * resolution Textual description of width and height
65 * tbr Average bitrate of audio and video in KBit/s
66 * abr Average audio bitrate in KBit/s
67 * acodec Name of the audio codec in use
68 * asr Audio sampling rate in Hertz
69 * vbr Average video bitrate in KBit/s
70 * vcodec Name of the video codec in use
71 * container Name of the container format
72 * filesize The number of bytes, if known in advance
73 * filesize_approx An estimate for the number of bytes
74 * player_url SWF Player URL (used for rtmpdump).
* protocol The protocol that will be used for the actual
download, such as "http", "https", "rtsp", "rtmp", "m3u8" or so.
78 * preference Order number of this format. If this field is
79 present and not None, the formats get sorted
80 by this field, regardless of all other values.
81 -1 for default (order by other properties),
82 -2 or smaller for less than default.
83 * quality Order number of the video quality of this
84 format, irrespective of the file format.
85 -1 for default (order by other properties),
86 -2 or smaller for less than default.
88 ext: Video filename extension.
89 format: The video format, defaults to ext (used for --get-format)
90 player_url: SWF Player URL (used for rtmpdump).
92 The following fields are optional:
94 display_id An alternative identifier for the video, not necessarily
95 unique, but available before title. Typically, id is
96 something like "4234987", title "Dancing naked mole rats",
97 and display_id "dancing-naked-mole-rats"
98 thumbnails: A list of dictionaries, with the following entries:
100 * "width" (optional, int)
101 * "height" (optional, int)
* "resolution" (optional, string "{width}x{height}")
104 thumbnail: Full URL to a video thumbnail image.
105 description: One-line video description.
106 uploader: Full name of the video uploader.
107 timestamp: UNIX timestamp of the moment the video became available.
108 upload_date: Video upload date (YYYYMMDD).
109 If not explicitly set, calculated from timestamp.
110 uploader_id: Nickname or id of the video uploader.
111 location: Physical location of the video.
112 subtitles: The subtitle file contents as a dictionary in the format
113 {language: subtitles}.
114 duration: Length of the video in seconds, as an integer.
115 view_count: How many users have watched the video on the platform.
116 like_count: Number of positive ratings of the video
117 dislike_count: Number of negative ratings of the video
118 comment_count: Number of comments on the video
119 age_limit: Age restriction for the video, as an integer (years)
120 webpage_url: The url to the video webpage, if given to youtube-dl it
121 should allow to get the same result again. (It will be set
122 by YoutubeDL if it's missing)
categories: A list of categories that the video falls in, for example
["Sports", "Berlin"]
126 Unless mentioned otherwise, the fields should be Unicode strings.
128 Subclasses of this one should re-define the _real_initialize() and
129 _real_extract() methods and define a _VALID_URL regexp.
130 Probably, they should also be added to the list of extractors.
132 Finally, the _WORKING attribute should be set to False for broken IEs
133 in order to warn the users and skip the tests.
140 def __init__(self
, downloader
=None):
141 """Constructor. Receives an optional downloader."""
143 self
.set_downloader(downloader
)
@classmethod
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    # This does not use has/getattr intentionally - we want to know whether
    # we have cached the regexp for *this* class, whereas getattr would also
    # match the superclass
    if '_VALID_URL_RE' not in cls.__dict__:
        cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    return cls._VALID_URL_RE.match(url) is not None
158 """Getter method for _WORKING."""
def initialize(self):
    """Initializes an instance (authentication, etc).

    Delegates to _real_initialize(), which subclasses redefine.
    """
    self._real_initialize()
def extract(self, url):
    """Extracts URL information and returns it in list of dicts.

    Delegates to _real_extract(), which subclasses redefine.
    """
    return self._real_extract(url)
def set_downloader(self, downloader):
    """Sets the downloader for this IE."""
    self._downloader = downloader
176 def _real_initialize(self
):
177 """Real initialization process. Redefine in subclasses."""
180 def _real_extract(self
, url
):
181 """Real extraction process. Redefine in subclasses."""
186 """A string for getting the InfoExtractor with get_info_extractor"""
187 return cls
.__name
__[:-2]
@property
def IE_NAME(self):
    """The name of the instance's class without its last two characters."""
    return type(self).__name__[:-2]
193 def _request_webpage(self
, url_or_request
, video_id
, note
=None, errnote
=None, fatal
=True):
194 """ Returns the response handle """
196 self
.report_download_webpage(video_id
)
197 elif note
is not False:
199 self
.to_screen(u
'%s' % (note
,))
201 self
.to_screen(u
'%s: %s' % (video_id
, note
))
203 return self
._downloader
.urlopen(url_or_request
)
204 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
208 errnote
= u
'Unable to download webpage'
209 errmsg
= u
'%s: %s' % (errnote
, compat_str(err
))
211 raise ExtractorError(errmsg
, sys
.exc_info()[2], cause
=err
)
213 self
._downloader
.report_warning(errmsg
)
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """Returns a tuple (page content as string, URL handle).

    Returns False instead when _request_webpage() returned False.
    note, errnote and fatal are passed through to _request_webpage().
    Raises ExtractorError (expected=True) when the page is a Websense
    block page.
    """
    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, (compat_str, str)):
        url_or_request = url_or_request.partition('#')[0]

    urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
    if urlh is False:
        return False

    content_type = urlh.headers.get('Content-Type', '')
    webpage_bytes = urlh.read()

    # Encoding: Content-Type header first, then a <meta charset> in the
    # first KiB, then a UTF-16 LE byte order mark, finally UTF-8.
    m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
    if m:
        encoding = m.group(1)
    else:
        m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                      webpage_bytes[:1024])
        if m:
            encoding = m.group(1).decode('ascii')
        elif webpage_bytes.startswith(b'\xff\xfe'):
            encoding = 'utf-16'
        else:
            encoding = 'utf-8'

    if self._downloader.params.get('dump_intermediate_pages', False):
        try:
            url = url_or_request.get_full_url()
        except AttributeError:
            # Not a request object, so it is already the URL string.
            url = url_or_request
        self.to_screen(u'Dumping request to ' + url)
        dump = base64.b64encode(webpage_bytes).decode('ascii')
        self._downloader.to_screen(dump)
    if self._downloader.params.get('write_pages', False):
        try:
            url = url_or_request.get_full_url()
        except AttributeError:
            url = url_or_request
        basen = '%s_%s' % (video_id, url)
        if len(basen) > 240:
            # Cap the name at 240 characters; the MD5 suffix keeps
            # truncated names distinct.
            h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
            basen = basen[:240 - len(h)] + h
        raw_filename = basen + '.dump'
        filename = sanitize_filename(raw_filename, restricted=True)
        self.to_screen(u'Saving request to ' + filename)
        with open(filename, 'wb') as outf:
            outf.write(webpage_bytes)

    try:
        content = webpage_bytes.decode(encoding, 'replace')
    except LookupError:
        # The declared encoding is unknown to Python.
        content = webpage_bytes.decode('utf-8', 'replace')

    if (u'<title>Access to this site is blocked</title>' in content and
            u'Websense' in content[:512]):
        msg = u'Access to this webpage has been blocked by Websense filtering software in your network.'
        blocked_iframe = self._html_search_regex(
            r'<iframe src="([^"]+)"', content,
            u'Websense information URL', default=None)
        if blocked_iframe:
            msg += u' Visit %s for more details' % blocked_iframe
        raise ExtractorError(msg, expected=True)

    return (content, urlh)
281 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
282 """ Returns the data of the page as a string """
283 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
290 def _download_xml(self, url_or_request, video_id,
291 note=u'Downloading XML
', errnote=u'Unable to download XML
',
292 transform_source=None, fatal=True):
293 """Return the xml as an xml.etree.ElementTree.Element"""
294 xml_string = self._download_webpage(
295 url_or_request, video_id, note, errnote, fatal=fatal)
296 if xml_string is False:
299 xml_string = transform_source(xml_string)
300 return xml.etree.ElementTree.fromstring(xml_string.encode('utf
-8'))
302 def _download_json(self, url_or_request, video_id,
303 note=u'Downloading JSON metadata
',
304 errnote=u'Unable to download JSON metadata
',
305 transform_source=None,
307 json_string = self._download_webpage(
308 url_or_request, video_id, note, errnote, fatal=fatal)
309 if (not fatal) and json_string is False:
312 json_string = transform_source(json_string)
314 return json.loads(json_string)
315 except ValueError as ve:
316 raise ExtractorError('Failed to download JSON
', cause=ve)
def report_warning(self, msg, video_id=None):
    """Report msg as a warning, prefixed with '[ie_name]' and, if given, the video id."""
    idstr = u'' if video_id is None else u'%s: ' % video_id
    self._downloader.report_warning(
        u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
def to_screen(self, msg):
    """Print msg to screen, prefixing it with '[ie_name]'"""
    self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
def report_extraction(self, id_or_name):
    """Report information extraction."""
    self.to_screen(u'%s: Extracting information' % id_or_name)
def report_download_webpage(self, video_id):
    """Report webpage download."""
    self.to_screen(u'%s: Downloading webpage' % video_id)
def report_age_confirmation(self):
    """Report attempt to confirm age."""
    self.to_screen(u'Confirming age')
def report_login(self):
    """Report attempt to log in."""
    self.to_screen(u'Logging in')
343 #Methods for following #608
@staticmethod
def url_result(url, ie=None, video_id=None):
    """Returns a url that points to a page that should be processed"""
    #TODO: ie should be the class used for getting the info
    # NOTE(review): the dict literal was truncated after '_type'; the 'url'
    # and 'ie_key' entries below are reconstructed - confirm the key names.
    video_info = {'_type': 'url',
                  'url': url,
                  'ie_key': ie}
    if video_id is not None:
        video_info['id'] = video_id
    return video_info
@staticmethod
def playlist_result(entries, playlist_id=None, playlist_title=None):
    """Returns a playlist"""
    # NOTE(review): the dict literal was truncated after '_type'; the
    # 'entries' key is reconstructed - confirm the key name.
    video_info = {'_type': 'playlist',
                  'entries': entries}
    if playlist_id:
        video_info['id'] = playlist_id
    if playlist_title:
        video_info['title'] = playlist_title
    return video_info
def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.
    """
    # Bound up front so that an empty list of patterns counts as "no match".
    mobj = None
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        mobj = re.search(pattern, string, flags)
    else:
        for p in pattern:
            mobj = re.search(p, string, flags)
            if mobj:
                break

    # Highlight the field name with ANSI blue when stderr is a terminal
    # and the OS is not Windows.
    if os.name != 'nt' and sys.stderr.isatty():
        _name = u'\033[0;34m%s\033[0m' % name
    else:
        _name = name

    if mobj:
        # return the first matching group
        return next(g for g in mobj.groups() if g is not None)
    elif default is not _NO_DEFAULT:
        return default
    elif fatal:
        raise RegexNotFoundError(u'Unable to extract %s' % _name)
    else:
        self._downloader.report_warning(u'unable to extract %s; '
            u'please report this issue on http://yt-dl.org/bug' % _name)
        return None
def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
    """
    Like _search_regex, but strips HTML tags and unescapes entities.
    """
    res = self._search_regex(pattern, string, name, default, fatal, flags)
    if res:
        return clean_html(res).strip()
    # No match: hand the default (or None) back without cleaning it.
    return res
407 def _get_login_info(self):
409 Get the the login info as (username, password)
410 It will look in the netrc file using the _NETRC_MACHINE value
411 If there's no info available
, return (None, None)
413 if self._downloader is None:
418 downloader_params = self._downloader.params
420 # Attempt to use provided username and password or .netrc data
421 if downloader_params.get('username', None) is not None:
422 username = downloader_params['username']
423 password = downloader_params['password']
424 elif downloader_params.get('usenetrc', False):
426 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
431 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
432 except (IOError, netrc.NetrcParseError) as err:
433 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
435 return (username, password)
437 # Helper functions for extracting OpenGraph info
439 def _og_regexes(prop):
440 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
441 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
442 template = r'<meta[^>]+?%s[^>]+?%s'
444 template % (property_re, content_re),
445 template % (content_re, property_re),
448 def _og_search_property(self, prop, html, name=None, **kargs):
450 name = 'OpenGraph %s' % prop
451 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
454 return unescapeHTML(escaped)
456 def _og_search_thumbnail(self, html, **kargs):
457 return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
459 def _og_search_description(self, html, **kargs):
460 return self._og_search_property('description', html, fatal=False, **kargs)
462 def _og_search_title(self, html, **kargs):
463 return self._og_search_property('title', html, **kargs)
465 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
466 regexes = self._og_regexes('video')
467 if secure: regexes = self._og_regexes('video:secure_url') + regexes
468 return self._html_search_regex(regexes, html, name, **kargs)
470 def _og_search_url(self, html, **kargs):
471 return self._og_search_property('url', html, **kargs)
473 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
474 if display_name is None:
476 return self._html_search_regex(
478 (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
479 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
480 html, display_name, fatal=fatal, **kwargs)
482 def _dc_search_uploader(self, html):
483 return self._html_search_meta('dc.creator', html, 'uploader')
485 def _rta_search(self, html):
486 # See http://www.rtalabel.org/index.php?content=howtofaq#single
487 if re.search(r'(?ix)<meta\s+name="rating"\s+'
488 r' content="RTA-5042-1996-1400-1577-RTA"',
493 def _media_rating_search(self, html):
494 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
495 rating = self._html_search_meta('rating', html)
507 return RATING_TABLE.get(rating.lower(), None)
509 def _twitter_search_player(self, html):
510 return self._html_search_meta('twitter:player', html,
511 'twitter card player')
513 def _sort_formats(self, formats):
515 raise ExtractorError(u'No video formats found')
518 # TODO remove the following workaround
519 from ..utils import determine_ext
520 if not f.get('ext') and 'url' in f:
521 f['ext'] = determine_ext(f['url'])
523 preference = f.get('preference')
524 if preference is None:
525 proto = f.get('protocol')
527 proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
529 preference = 0 if proto in ['http', 'https'] else -0.1
530 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
533 if f.get('vcodec') == 'none': # audio only
534 if self._downloader.params.get('prefer_free_formats'):
535 ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
537 ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
540 audio_ext_preference = ORDER.index(f['ext'])
542 audio_ext_preference = -1
544 if self._downloader.params.get('prefer_free_formats'):
545 ORDER = [u'flv', u'mp4', u'webm']
547 ORDER = [u'webm', u'flv', u'mp4']
549 ext_preference = ORDER.index(f['ext'])
552 audio_ext_preference = 0
556 f.get('quality') if f.get('quality') is not None else -1,
557 f.get('height') if f.get('height') is not None else -1,
558 f.get('width') if f.get('width') is not None else -1,
560 f.get('tbr') if f.get('tbr') is not None else -1,
561 f.get('vbr') if f.get('vbr') is not None else -1,
562 f.get('abr') if f.get('abr') is not None else -1,
563 audio_ext_preference,
564 f.get('filesize') if f.get('filesize') is not None else -1,
565 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
568 formats.sort(key=_formats_key)
def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    return (
        'http:'
        if self._downloader.params.get('prefer_insecure', False)
        else 'https:')
577 def _proto_relative_url(self, url, scheme=None):
580 if url.startswith('//'):
582 scheme = self.http_scheme()
587 def _sleep(self, timeout, video_id, msg_template=None):
588 if msg_template is None:
589 msg_template = u'%(video_id)s: Waiting
for %(timeout)s seconds
'
590 msg = msg_template % {'video_id
': video_id, 'timeout
': timeout}
def _extract_f4m_formats(self, manifest_url, video_id):
    """Download the f4m manifest at manifest_url and return its formats.

    One format is built per <media> element of the manifest, taking tbr,
    width and height from the element's bitrate, width and height
    attributes. The list is sorted with _sort_formats() before returning.
    """
    manifest = self._download_xml(
        manifest_url, video_id, 'Downloading f4m manifest',
        'Unable to download f4m manifest')

    formats = []
    for media_el in manifest.findall('{http://ns.adobe.com/f4m/1.0}media'):
        formats.append({
            # NOTE(review): the first entries of this dict were lost in
            # extraction. Only the mandatory 'url' is restored; the original
            # may have carried more keys (e.g. 'ext') - confirm.
            'url': manifest_url,
            'tbr': int_or_none(media_el.attrib.get('bitrate')),
            'width': int_or_none(media_el.attrib.get('width')),
            'height': int_or_none(media_el.attrib.get('height')),
        })
    self._sort_formats(formats)

    return formats
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex matching this extractor's search queries."""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Return True if url is a search query for this extractor."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the query prefix and fetch that many results.

        An empty prefix requests one result, 'all' requests _MAX_RESULTS,
        and a number requests that many, capped at _MAX_RESULTS with a
        warning. Raises ExtractorError for a query that does not match or
        a non-positive number.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        """The _SEARCH_KEY of this extractor."""
        return self._SEARCH_KEY