Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import xml.etree.ElementTree
  14
  15 from ..compat import (
  16     compat_http_client,
  17     compat_urllib_error,
  18     compat_urllib_parse_urlparse,
  19     compat_urlparse,
  20     compat_str,
  21 )
  22 from ..utils import (
  23     clean_html,
  24     compiled_regex_type,
  25     ExtractorError,
  26     float_or_none,
  27     int_or_none,
  28     RegexNotFoundError,
  29     sanitize_filename,
  30     unescapeHTML,
  31 )
# Sentinel meaning "no default was supplied". It is compared by identity in
# _search_regex so that None stays usable as a real default value.
_NO_DEFAULT = object()
  33
  34
  35 class InfoExtractor(object):
  36     """Information Extractor class.
  37
  38     Information extractors are the classes that, given a URL, extract
  39     information about the video (or videos) the URL refers to. This
  40     information includes the real video URL, the video title, author and
  41     others. The information is stored in a dictionary which is then
  42     passed to the FileDownloader. The FileDownloader processes this
  43     information possibly downloading the video to the file system, among
  44     other possible outcomes.
  45
    The type field determines the type of the result.
  47     By far the most common value (and the default if _type is missing) is
  48     "video", which indicates a single video.
  49
  50     For a video, the dictionaries must include the following fields:
  51
  52     id:             Video identifier.
  53     title:          Video title, unescaped.
  54
  55     Additionally, it must contain either a formats entry or a url one:
  56
  57     formats:        A list of dictionaries for each format available, ordered
  58                     from worst to best quality.
  59
  60                     Potential fields:
  61                     * url        Mandatory. The URL of the video file
  62                     * ext        Will be calculated from url if missing
  63                     * format     A human-readable description of the format
  64                                  ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
  66                                  and format_note fields if missing.
  67                     * format_id  A short description of the format
  68                                  ("mp4_h264_opus" or "19").
  69                                 Technically optional, but strongly recommended.
  70                     * format_note Additional info about the format
  71                                  ("3D" or "DASH video")
  72                     * width      Width of the video, if known
  73                     * height     Height of the video, if known
  74                     * resolution Textual description of width and height
  75                     * tbr        Average bitrate of audio and video in KBit/s
  76                     * abr        Average audio bitrate in KBit/s
  77                     * acodec     Name of the audio codec in use
  78                     * asr        Audio sampling rate in Hertz
  79                     * vbr        Average video bitrate in KBit/s
  80                     * fps        Frame rate
  81                     * vcodec     Name of the video codec in use
  82                     * container  Name of the container format
  83                     * filesize   The number of bytes, if known in advance
  84                     * filesize_approx  An estimate for the number of bytes
  85                     * player_url SWF Player URL (used for rtmpdump).
  86                     * protocol   The protocol that will be used for the actual
  87                                  download, lower-case.
  88                                  "http", "https", "rtsp", "rtmp", "m3u8" or so.
  89                     * preference Order number of this format. If this field is
  90                                  present and not None, the formats get sorted
  91                                  by this field, regardless of all other values.
  92                                  -1 for default (order by other properties),
  93                                  -2 or smaller for less than default.
  94                     * language_preference  Is this in the correct requested
  95                                  language?
  96                                  10 if it's what the URL is about,
  97                                  -1 for default (don't know),
  98                                  -10 otherwise, other values reserved for now.
  99                     * quality    Order number of the video quality of this
 100                                  format, irrespective of the file format.
 101                                  -1 for default (order by other properties),
 102                                  -2 or smaller for less than default.
 103                     * source_preference  Order number for this video source
 104                                   (quality takes higher priority)
 105                                  -1 for default (order by other properties),
 106                                  -2 or smaller for less than default.
 107                     * http_referer  HTTP Referer header value to set.
 108                     * http_method  HTTP method to use for the download.
 109                     * http_headers  A dictionary of additional HTTP headers
 110                                  to add to the request.
 111                     * http_post_data  Additional data to send with a POST
 112                                  request.
 113     url:            Final video URL.
 114     ext:            Video filename extension.
 115     format:         The video format, defaults to ext (used for --get-format)
 116     player_url:     SWF Player URL (used for rtmpdump).
 117
 118     The following fields are optional:
 119
 120     display_id      An alternative identifier for the video, not necessarily
 121                     unique, but available before title. Typically, id is
 122                     something like "4234987", title "Dancing naked mole rats",
 123                     and display_id "dancing-naked-mole-rats"
 124     thumbnails:     A list of dictionaries, with the following entries:
 125                         * "url"
 126                         * "width" (optional, int)
 127                         * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
 129                                         deprecated)
 130     thumbnail:      Full URL to a video thumbnail image.
 131     description:    One-line video description.
 132     uploader:       Full name of the video uploader.
 133     timestamp:      UNIX timestamp of the moment the video became available.
 134     upload_date:    Video upload date (YYYYMMDD).
 135                     If not explicitly set, calculated from timestamp.
 136     uploader_id:    Nickname or id of the video uploader.
 137     location:       Physical location where the video was filmed.
 138     subtitles:      The subtitle file contents as a dictionary in the format
 139                     {language: subtitles}.
 140     duration:       Length of the video in seconds, as an integer.
 141     view_count:     How many users have watched the video on the platform.
 142     like_count:     Number of positive ratings of the video
 143     dislike_count:  Number of negative ratings of the video
 144     comment_count:  Number of comments on the video
 145     age_limit:      Age restriction for the video, as an integer (years)
 146     webpage_url:    The url to the video webpage, if given to youtube-dl it
 147                     should allow to get the same result again. (It will be set
 148                     by YoutubeDL if it's missing)
 149     categories:     A list of categories that the video falls in, for example
 150                     ["Sports", "Berlin"]
 151     is_live:        True, False, or None (=unknown). Whether this video is a
 152                     live stream that goes on instead of a fixed-length video.
 153
 154     Unless mentioned otherwise, the fields should be Unicode strings.
 155
 156     Unless mentioned otherwise, None is equivalent to absence of information.
 157
 158
 159     _type "playlist" indicates multiple videos.
 160     There must be a key "entries", which is a list or a PagedList object, each
    element of which is a valid dictionary under this specification.
 162
 163     Additionally, playlists can have "title" and "id" attributes with the same
 164     semantics as videos (see above).
 165
 166
 167     _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
 169     It must have an entries key like a playlist and contain all the keys
 170     required for a video at the same time.
 171
 172
 173     _type "url" indicates that the video must be extracted from another
 174     location, possibly by a different extractor. Its only required key is:
 175     "url" - the next URL to extract.
 176
 177     Additionally, it may have properties believed to be identical to the
 178     resolved entity, for example "title" if the title of the referred video is
 179     known ahead of time.
 180
 181
 182     _type "url_transparent" entities have the same specification as "url", but
 183     indicate that the given additional information is more precise than the one
 184     associated with the resolved URL.
 185     This is useful when a site employs a video service that hosts the video and
 186     its technical metadata, but that video service does not embed a useful
 187     title, description etc.
 188
 189
 190     Subclasses of this one should re-define the _real_initialize() and
 191     _real_extract() methods and define a _VALID_URL regexp.
 192     Probably, they should also be added to the list of extractors.
 193
 194     Finally, the _WORKING attribute should be set to False for broken IEs
 195     in order to warn the users and skip the tests.
 196     """
 197
 198     _ready = False
 199     _downloader = None
 200     _WORKING = True
 201
 202     def __init__(self, downloader=None):
 203         """Constructor. Receives an optional downloader."""
 204         self._ready = False
 205         self.set_downloader(downloader)
 206
 207     @classmethod
 208     def suitable(cls, url):
 209         """Receives a URL and returns True if suitable for this IE."""
 210
 211         # This does not use has/getattr intentionally - we want to know whether
 212         # we have cached the regexp for *this* class, whereas getattr would also
 213         # match the superclass
 214         if '_VALID_URL_RE' not in cls.__dict__:
 215             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 216         return cls._VALID_URL_RE.match(url) is not None
 217
 218     @classmethod
 219     def _match_id(cls, url):
 220         if '_VALID_URL_RE' not in cls.__dict__:
 221             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 222         m = cls._VALID_URL_RE.match(url)
 223         assert m
 224         return m.group('id')
 225
 226     @classmethod
 227     def working(cls):
 228         """Getter method for _WORKING."""
 229         return cls._WORKING
 230
 231     def initialize(self):
 232         """Initializes an instance (authentication, etc)."""
 233         if not self._ready:
 234             self._real_initialize()
 235             self._ready = True
 236
 237     def extract(self, url):
 238         """Extracts URL information and returns it in list of dicts."""
 239         self.initialize()
 240         return self._real_extract(url)
 241
 242     def set_downloader(self, downloader):
 243         """Sets the downloader for this IE."""
 244         self._downloader = downloader
 245
 246     def _real_initialize(self):
 247         """Real initialization process. Redefine in subclasses."""
 248         pass
 249
 250     def _real_extract(self, url):
 251         """Real extraction process. Redefine in subclasses."""
 252         pass
 253
 254     @classmethod
 255     def ie_key(cls):
 256         """A string for getting the InfoExtractor with get_info_extractor"""
 257         return cls.__name__[:-2]
 258
 259     @property
 260     def IE_NAME(self):
 261         return type(self).__name__[:-2]
 262
 263     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 264         """ Returns the response handle """
 265         if note is None:
 266             self.report_download_webpage(video_id)
 267         elif note is not False:
 268             if video_id is None:
 269                 self.to_screen('%s' % (note,))
 270             else:
 271                 self.to_screen('%s: %s' % (video_id, note))
 272         try:
 273             return self._downloader.urlopen(url_or_request)
 274         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 275             if errnote is False:
 276                 return False
 277             if errnote is None:
 278                 errnote = 'Unable to download webpage'
 279             errmsg = '%s: %s' % (errnote, compat_str(err))
 280             if fatal:
 281                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 282             else:
 283                 self._downloader.report_warning(errmsg)
 284                 return False
 285
 286     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 287         """ Returns a tuple (page content as string, URL handle) """
 288         # Strip hashes from the URL (#1038)
 289         if isinstance(url_or_request, (compat_str, str)):
 290             url_or_request = url_or_request.partition('#')[0]
 291
 292         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 293         if urlh is False:
 294             assert not fatal
 295             return False
 296         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
 297         return (content, urlh)
 298
 299     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True):
 300         content_type = urlh.headers.get('Content-Type', '')
 301         webpage_bytes = urlh.read()
 302         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 303         if m:
 304             encoding = m.group(1)
 305         else:
 306             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 307                           webpage_bytes[:1024])
 308             if m:
 309                 encoding = m.group(1).decode('ascii')
 310             elif webpage_bytes.startswith(b'\xff\xfe'):
 311                 encoding = 'utf-16'
 312             else:
 313                 encoding = 'utf-8'
 314         if self._downloader.params.get('dump_intermediate_pages', False):
 315             try:
 316                 url = url_or_request.get_full_url()
 317             except AttributeError:
 318                 url = url_or_request
 319             self.to_screen('Dumping request to ' + url)
 320             dump = base64.b64encode(webpage_bytes).decode('ascii')
 321             self._downloader.to_screen(dump)
 322         if self._downloader.params.get('write_pages', False):
 323             try:
 324                 url = url_or_request.get_full_url()
 325             except AttributeError:
 326                 url = url_or_request
 327             basen = '%s_%s' % (video_id, url)
 328             if len(basen) > 240:
 329                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 330                 basen = basen[:240 - len(h)] + h
 331             raw_filename = basen + '.dump'
 332             filename = sanitize_filename(raw_filename, restricted=True)
 333             self.to_screen('Saving request to ' + filename)
 334             # Working around MAX_PATH limitation on Windows (see
 335             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 336             if os.name == 'nt':
 337                 absfilepath = os.path.abspath(filename)
 338                 if len(absfilepath) > 259:
 339                     filename = '\\\\?\\' + absfilepath
 340             with open(filename, 'wb') as outf:
 341                 outf.write(webpage_bytes)
 342
 343         try:
 344             content = webpage_bytes.decode(encoding, 'replace')
 345         except LookupError:
 346             content = webpage_bytes.decode('utf-8', 'replace')
 347
 348         if ('<title>Access to this site is blocked</title>' in content and
 349                 'Websense' in content[:512]):
 350             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 351             blocked_iframe = self._html_search_regex(
 352                 r'<iframe src="([^"]+)"', content,
 353                 'Websense information URL', default=None)
 354             if blocked_iframe:
 355                 msg += ' Visit %s for more details' % blocked_iframe
 356             raise ExtractorError(msg, expected=True)
 357
 358         return content
 359
 360     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 361         """ Returns the data of the page as a string """
 362         res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
 363         if res is False:
 364             return res
 365         else:
 366             content, _ = res
 367             return content
 368
 369     def _download_xml(self, url_or_request, video_id,
 370                       note='Downloading XML', errnote='Unable to download XML',
 371                       transform_source=None, fatal=True):
 372         """Return the xml as an xml.etree.ElementTree.Element"""
 373         xml_string = self._download_webpage(
 374             url_or_request, video_id, note, errnote, fatal=fatal)
 375         if xml_string is False:
 376             return xml_string
 377         if transform_source:
 378             xml_string = transform_source(xml_string)
 379         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 380
 381     def _download_json(self, url_or_request, video_id,
 382                        note='Downloading JSON metadata',
 383                        errnote='Unable to download JSON metadata',
 384                        transform_source=None,
 385                        fatal=True):
 386         json_string = self._download_webpage(
 387             url_or_request, video_id, note, errnote, fatal=fatal)
 388         if (not fatal) and json_string is False:
 389             return None
 390         if transform_source:
 391             json_string = transform_source(json_string)
 392         try:
 393             return json.loads(json_string)
 394         except ValueError as ve:
 395             errmsg = '%s: Failed to parse JSON ' % video_id
 396             if fatal:
 397                 raise ExtractorError(errmsg, cause=ve)
 398             else:
 399                 self.report_warning(errmsg + str(ve))
 400
 401     def report_warning(self, msg, video_id=None):
 402         idstr = '' if video_id is None else '%s: ' % video_id
 403         self._downloader.report_warning(
 404             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 405
 406     def to_screen(self, msg):
 407         """Print msg to screen, prefixing it with '[ie_name]'"""
 408         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 409
 410     def report_extraction(self, id_or_name):
 411         """Report information extraction."""
 412         self.to_screen('%s: Extracting information' % id_or_name)
 413
 414     def report_download_webpage(self, video_id):
 415         """Report webpage download."""
 416         self.to_screen('%s: Downloading webpage' % video_id)
 417
 418     def report_age_confirmation(self):
 419         """Report attempt to confirm age."""
 420         self.to_screen('Confirming age')
 421
 422     def report_login(self):
 423         """Report attempt to log in."""
 424         self.to_screen('Logging in')
 425
 426     #Methods for following #608
 427     @staticmethod
 428     def url_result(url, ie=None, video_id=None):
 429         """Returns a url that points to a page that should be processed"""
 430         #TODO: ie should be the class used for getting the info
 431         video_info = {'_type': 'url',
 432                       'url': url,
 433                       'ie_key': ie}
 434         if video_id is not None:
 435             video_info['id'] = video_id
 436         return video_info
 437     @staticmethod
 438     def playlist_result(entries, playlist_id=None, playlist_title=None):
 439         """Returns a playlist"""
 440         video_info = {'_type': 'playlist',
 441                       'entries': entries}
 442         if playlist_id:
 443             video_info['id'] = playlist_id
 444         if playlist_title:
 445             video_info['title'] = playlist_title
 446         return video_info
 447
 448     def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
 449         """
 450         Perform a regex search on the given string, using a single or a list of
 451         patterns returning the first matching group.
 452         In case of failure return a default value or raise a WARNING or a
 453         RegexNotFoundError, depending on fatal, specifying the field name.
 454         """
 455         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 456             mobj = re.search(pattern, string, flags)
 457         else:
 458             for p in pattern:
 459                 mobj = re.search(p, string, flags)
 460                 if mobj:
 461                     break
 462
 463         if os.name != 'nt' and sys.stderr.isatty():
 464             _name = '\033[0;34m%s\033[0m' % name
 465         else:
 466             _name = name
 467
 468         if mobj:
 469             if group is None:
 470                 # return the first matching group
 471                 return next(g for g in mobj.groups() if g is not None)
 472             else:
 473                 return mobj.group(group)
 474         elif default is not _NO_DEFAULT:
 475             return default
 476         elif fatal:
 477             raise RegexNotFoundError('Unable to extract %s' % _name)
 478         else:
 479             self._downloader.report_warning('unable to extract %s; '
 480                 'please report this issue on http://yt-dl.org/bug' % _name)
 481             return None
 482
 483     def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
 484         """
 485         Like _search_regex, but strips HTML tags and unescapes entities.
 486         """
 487         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 488         if res:
 489             return clean_html(res).strip()
 490         else:
 491             return res
 492
 493     def _get_login_info(self):
 494         """
 495         Get the the login info as (username, password)
 496         It will look in the netrc file using the _NETRC_MACHINE value
 497         If there's no info available, return (None, None)
 498         """
 499         if self._downloader is None:
 500             return (None, None)
 501
 502         username = None
 503         password = None
 504         downloader_params = self._downloader.params
 505
 506         # Attempt to use provided username and password or .netrc data
 507         if downloader_params.get('username', None) is not None:
 508             username = downloader_params['username']
 509             password = downloader_params['password']
 510         elif downloader_params.get('usenetrc', False):
 511             try:
 512                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 513                 if info is not None:
 514                     username = info[0]
 515                     password = info[2]
 516                 else:
 517                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 518             except (IOError, netrc.NetrcParseError) as err:
 519                 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
 520
 521         return (username, password)
 522
 523     def _get_tfa_info(self):
 524         """
 525         Get the two-factor authentication info
 526         TODO - asking the user will be required for sms/phone verify
 527         currently just uses the command line option
 528         If there's no info available, return None
 529         """
 530         if self._downloader is None:
 531             return None
 532         downloader_params = self._downloader.params
 533
 534         if downloader_params.get('twofactor', None) is not None:
 535             return downloader_params['twofactor']
 536
 537         return None
 538
 539     # Helper functions for extracting OpenGraph info
 540     @staticmethod
 541     def _og_regexes(prop):
 542         content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
 543         property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
 544         template = r'<meta[^>]+?%s[^>]+?%s'
 545         return [
 546             template % (property_re, content_re),
 547             template % (content_re, property_re),
 548         ]
 549
 550     def _og_search_property(self, prop, html, name=None, **kargs):
 551         if name is None:
 552             name = 'OpenGraph %s' % prop
 553         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 554         if escaped is None:
 555             return None
 556         return unescapeHTML(escaped)
 557
 558     def _og_search_thumbnail(self, html, **kargs):
 559         return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
 560
 561     def _og_search_description(self, html, **kargs):
 562         return self._og_search_property('description', html, fatal=False, **kargs)
 563
 564     def _og_search_title(self, html, **kargs):
 565         return self._og_search_property('title', html, **kargs)
 566
 567     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 568         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 569         if secure:
 570             regexes = self._og_regexes('video:secure_url') + regexes
 571         return self._html_search_regex(regexes, html, name, **kargs)
 572
 573     def _og_search_url(self, html, **kargs):
 574         return self._og_search_property('url', html, **kargs)
 575
 576     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 577         if display_name is None:
 578             display_name = name
 579         return self._html_search_regex(
 580             r'''(?ix)<meta
 581                     (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
 582                     [^>]+content=(["\'])(?P<content>.*?)\1''' % re.escape(name),
 583             html, display_name, fatal=fatal, group='content', **kwargs)
 584
 585     def _dc_search_uploader(self, html):
 586         return self._html_search_meta('dc.creator', html, 'uploader')
 587
 588     def _rta_search(self, html):
 589         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 590         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 591                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 592                      html):
 593             return 18
 594         return 0
 595
 596     def _media_rating_search(self, html):
 597         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 598         rating = self._html_search_meta('rating', html)
 599
 600         if not rating:
 601             return None
 602
 603         RATING_TABLE = {
 604             'safe for kids': 0,
 605             'general': 8,
 606             '14 years': 14,
 607             'mature': 17,
 608             'restricted': 19,
 609         }
 610         return RATING_TABLE.get(rating.lower(), None)
 611
 612     def _twitter_search_player(self, html):
 613         return self._html_search_meta('twitter:player', html,
 614             'twitter card player')
 615
 616     def _sort_formats(self, formats):
 617         if not formats:
 618             raise ExtractorError('No video formats found')
 619
 620         def _formats_key(f):
 621             # TODO remove the following workaround
 622             from ..utils import determine_ext
 623             if not f.get('ext') and 'url' in f:
 624                 f['ext'] = determine_ext(f['url'])
 625
 626             preference = f.get('preference')
 627             if preference is None:
 628                 proto = f.get('protocol')
 629                 if proto is None:
 630                     proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
 631
 632                 preference = 0 if proto in ['http', 'https'] else -0.1
 633                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 634                     preference -= 0.5
 635
 636             if f.get('vcodec') == 'none':  # audio only
 637                 if self._downloader.params.get('prefer_free_formats'):
 638                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
 639                 else:
 640                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
 641                 ext_preference = 0
 642                 try:
 643                     audio_ext_preference = ORDER.index(f['ext'])
 644                 except ValueError:
 645                     audio_ext_preference = -1
 646             else:
 647                 if self._downloader.params.get('prefer_free_formats'):
 648                     ORDER = ['flv', 'mp4', 'webm']
 649                 else:
 650                     ORDER = ['webm', 'flv', 'mp4']
 651                 try:
 652                     ext_preference = ORDER.index(f['ext'])
 653                 except ValueError:
 654                     ext_preference = -1
 655                 audio_ext_preference = 0
 656
 657             return (
 658                 preference,
 659                 f.get('language_preference') if f.get('language_preference') is not None else -1,
 660                 f.get('quality') if f.get('quality') is not None else -1,
 661                 f.get('height') if f.get('height') is not None else -1,
 662                 f.get('width') if f.get('width') is not None else -1,
 663                 ext_preference,
 664                 f.get('tbr') if f.get('tbr') is not None else -1,
 665                 f.get('vbr') if f.get('vbr') is not None else -1,
 666                 f.get('abr') if f.get('abr') is not None else -1,
 667                 audio_ext_preference,
 668                 f.get('fps') if f.get('fps') is not None else -1,
 669                 f.get('filesize') if f.get('filesize') is not None else -1,
 670                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
 671                 f.get('source_preference') if f.get('source_preference') is not None else -1,
 672                 f.get('format_id'),
 673             )
 674         formats.sort(key=_formats_key)
 675
 676     def http_scheme(self):
 677         """ Either "http:" or "https:", depending on the user's preferences """
 678         return (
 679             'http:'
 680             if self._downloader.params.get('prefer_insecure', False)
 681             else 'https:')
 682
 683     def _proto_relative_url(self, url, scheme=None):
 684         if url is None:
 685             return url
 686         if url.startswith('//'):
 687             if scheme is None:
 688                 scheme = self.http_scheme()
 689             return scheme + url
 690         else:
 691             return url
 692
 693     def _sleep(self, timeout, video_id, msg_template=None):
 694         if msg_template is None:
 695             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 696         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 697         self.to_screen(msg)
 698         time.sleep(timeout)
 699
 700     def _extract_f4m_formats(self, manifest_url, video_id):
 701         manifest = self._download_xml(
 702             manifest_url, video_id, 'Downloading f4m manifest',
 703             'Unable to download f4m manifest')
 704
 705         formats = []
 706         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 707         for i, media_el in enumerate(media_nodes):
 708             tbr = int_or_none(media_el.attrib.get('bitrate'))
 709             format_id = 'f4m-%d' % (i if tbr is None else tbr)
 710             formats.append({
 711                 'format_id': format_id,
 712                 'url': manifest_url,
 713                 'ext': 'flv',
 714                 'tbr': tbr,
 715                 'width': int_or_none(media_el.attrib.get('width')),
 716                 'height': int_or_none(media_el.attrib.get('height')),
 717             })
 718         self._sort_formats(formats)
 719
 720         return formats
 721
 722     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
 723                               entry_protocol='m3u8', preference=None):
 724
 725         formats = [{
 726             'format_id': 'm3u8-meta',
 727             'url': m3u8_url,
 728             'ext': ext,
 729             'protocol': 'm3u8',
 730             'preference': -1,
 731             'resolution': 'multiple',
 732             'format_note': 'Quality selection URL',
 733         }]
 734
 735         format_url = lambda u: (
 736             u
 737             if re.match(r'^https?://', u)
 738             else compat_urlparse.urljoin(m3u8_url, u))
 739
 740         m3u8_doc = self._download_webpage(
 741             m3u8_url, video_id,
 742             note='Downloading m3u8 information',
 743             errnote='Failed to download m3u8 information')
 744         last_info = None
 745         kv_rex = re.compile(
 746             r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
 747         for line in m3u8_doc.splitlines():
 748             if line.startswith('#EXT-X-STREAM-INF:'):
 749                 last_info = {}
 750                 for m in kv_rex.finditer(line):
 751                     v = m.group('val')
 752                     if v.startswith('"'):
 753                         v = v[1:-1]
 754                     last_info[m.group('key')] = v
 755             elif line.startswith('#') or not line.strip():
 756                 continue
 757             else:
 758                 if last_info is None:
 759                     formats.append({'url': format_url(line)})
 760                     continue
 761                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
 762
 763                 f = {
 764                     'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
 765                     'url': format_url(line.strip()),
 766                     'tbr': tbr,
 767                     'ext': ext,
 768                     'protocol': entry_protocol,
 769                     'preference': preference,
 770                 }
 771                 codecs = last_info.get('CODECS')
 772                 if codecs:
 773                     # TODO: looks like video codec is not always necessarily goes first
 774                     va_codecs = codecs.split(',')
 775                     if va_codecs[0]:
 776                         f['vcodec'] = va_codecs[0].partition('.')[0]
 777                     if len(va_codecs) > 1 and va_codecs[1]:
 778                         f['acodec'] = va_codecs[1].partition('.')[0]
 779                 resolution = last_info.get('RESOLUTION')
 780                 if resolution:
 781                     width_str, height_str = resolution.split('x')
 782                     f['width'] = int(width_str)
 783                     f['height'] = int(height_str)
 784                 formats.append(f)
 785                 last_info = {}
 786         self._sort_formats(formats)
 787         return formats
 788
 789     def _live_title(self, name):
 790         """ Generate the title for a live video """
 791         now = datetime.datetime.now()
 792         now_str = now.strftime("%Y-%m-%d %H:%M")
 793         return name + ' ' + now_str
 794
 795     def _int(self, v, name, fatal=False, **kwargs):
 796         res = int_or_none(v, **kwargs)
 797         if 'get_attr' in kwargs:
 798             print(getattr(v, kwargs['get_attr']))
 799         if res is None:
 800             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 801             if fatal:
 802                 raise ExtractorError(msg)
 803             else:
 804                 self._downloader.report_warning(msg)
 805         return res
 806
 807     def _float(self, v, name, fatal=False, **kwargs):
 808         res = float_or_none(v, **kwargs)
 809         if res is None:
 810             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 811             if fatal:
 812                 raise ExtractorError(msg)
 813             else:
 814                 self._downloader.report_warning(msg)
 815         return res
 816
 817
 818 class SearchInfoExtractor(InfoExtractor):
 819     """
 820     Base class for paged search queries extractors.
 821     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 822     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 823     """
 824
 825     @classmethod
 826     def _make_valid_url(cls):
 827         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 828
 829     @classmethod
 830     def suitable(cls, url):
 831         return re.match(cls._make_valid_url(), url) is not None
 832
 833     def _real_extract(self, query):
 834         mobj = re.match(self._make_valid_url(), query)
 835         if mobj is None:
 836             raise ExtractorError('Invalid search query "%s"' % query)
 837
 838         prefix = mobj.group('prefix')
 839         query = mobj.group('query')
 840         if prefix == '':
 841             return self._get_n_results(query, 1)
 842         elif prefix == 'all':
 843             return self._get_n_results(query, self._MAX_RESULTS)
 844         else:
 845             n = int(prefix)
 846             if n <= 0:
 847                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
 848             elif n > self._MAX_RESULTS:
 849                 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 850                 n = self._MAX_RESULTS
 851             return self._get_n_results(query, n)
 852
 853     def _get_n_results(self, query, n):
 854         """Get a specified number of results for a query"""
 855         raise NotImplementedError("This method must be implemented by subclasses")
 856
 857     @property
 858     def SEARCH_KEY(self):
 859         return self._SEARCH_KEY