Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import re
  10 import socket
  11 import sys
  12 import time
  13 import xml.etree.ElementTree
  14
  15 from ..utils import (
  16     compat_http_client,
  17     compat_urllib_error,
  18     compat_urllib_parse_urlparse,
  19     compat_urlparse,
  20     compat_str,
  21
  22     clean_html,
  23     compiled_regex_type,
  24     ExtractorError,
  25     float_or_none,
  26     int_or_none,
  27     RegexNotFoundError,
  28     sanitize_filename,
  29     unescapeHTML,
  30 )
# Unique sentinel meaning "no default value was supplied"; it lets callers
# pass default=None explicitly and still have it honoured.
_NO_DEFAULT = object()
  32
  33
  34 class InfoExtractor(object):
  35     """Information Extractor class.
  36
  37     Information extractors are the classes that, given a URL, extract
  38     information about the video (or videos) the URL refers to. This
  39     information includes the real video URL, the video title, author and
  40     others. The information is stored in a dictionary which is then
  41     passed to the FileDownloader. The FileDownloader processes this
  42     information possibly downloading the video to the file system, among
  43     other possible outcomes.
  44
  45     The dictionaries must include the following fields:
  46
  47     id:             Video identifier.
  48     title:          Video title, unescaped.
  49
  50     Additionally, it must contain either a formats entry or a url one:
  51
  52     formats:        A list of dictionaries for each format available, ordered
  53                     from worst to best quality.
  54
  55                     Potential fields:
  56                     * url        Mandatory. The URL of the video file
  57                     * ext        Will be calculated from url if missing
  58                     * format     A human-readable description of the format
  59                                  ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
  61                                  and format_note fields if missing.
  62                     * format_id  A short description of the format
  63                                  ("mp4_h264_opus" or "19").
  64                                 Technically optional, but strongly recommended.
  65                     * format_note Additional info about the format
  66                                  ("3D" or "DASH video")
  67                     * width      Width of the video, if known
  68                     * height     Height of the video, if known
  69                     * resolution Textual description of width and height
  70                     * tbr        Average bitrate of audio and video in KBit/s
  71                     * abr        Average audio bitrate in KBit/s
  72                     * acodec     Name of the audio codec in use
  73                     * asr        Audio sampling rate in Hertz
  74                     * vbr        Average video bitrate in KBit/s
  75                     * fps        Frame rate
  76                     * vcodec     Name of the video codec in use
  77                     * container  Name of the container format
  78                     * filesize   The number of bytes, if known in advance
  79                     * filesize_approx  An estimate for the number of bytes
  80                     * player_url SWF Player URL (used for rtmpdump).
  81                     * protocol   The protocol that will be used for the actual
  82                                  download, lower-case.
  83                                  "http", "https", "rtsp", "rtmp", "m3u8" or so.
  84                     * preference Order number of this format. If this field is
  85                                  present and not None, the formats get sorted
  86                                  by this field, regardless of all other values.
  87                                  -1 for default (order by other properties),
  88                                  -2 or smaller for less than default.
  89                     * quality    Order number of the video quality of this
  90                                  format, irrespective of the file format.
  91                                  -1 for default (order by other properties),
  92                                  -2 or smaller for less than default.
  93                     * source_preference  Order number for this video source
  94                                   (quality takes higher priority)
  95                                  -1 for default (order by other properties),
  96                                  -2 or smaller for less than default.
  97                     * http_referer  HTTP Referer header value to set.
  98                     * http_method  HTTP method to use for the download.
  99                     * http_headers  A dictionary of additional HTTP headers
 100                                  to add to the request.
 101                     * http_post_data  Additional data to send with a POST
 102                                  request.
 103     url:            Final video URL.
 104     ext:            Video filename extension.
 105     format:         The video format, defaults to ext (used for --get-format)
 106     player_url:     SWF Player URL (used for rtmpdump).
 107
 108     The following fields are optional:
 109
 110     display_id      An alternative identifier for the video, not necessarily
 111                     unique, but available before title. Typically, id is
 112                     something like "4234987", title "Dancing naked mole rats",
 113                     and display_id "dancing-naked-mole-rats"
 114     thumbnails:     A list of dictionaries, with the following entries:
 115                         * "url"
 116                         * "width" (optional, int)
 117                         * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
 119                                         deprecated)
 120     thumbnail:      Full URL to a video thumbnail image.
 121     description:    One-line video description.
 122     uploader:       Full name of the video uploader.
 123     timestamp:      UNIX timestamp of the moment the video became available.
 124     upload_date:    Video upload date (YYYYMMDD).
 125                     If not explicitly set, calculated from timestamp.
 126     uploader_id:    Nickname or id of the video uploader.
 127     location:       Physical location where the video was filmed.
 128     subtitles:      The subtitle file contents as a dictionary in the format
 129                     {language: subtitles}.
 130     duration:       Length of the video in seconds, as an integer.
 131     view_count:     How many users have watched the video on the platform.
 132     like_count:     Number of positive ratings of the video
 133     dislike_count:  Number of negative ratings of the video
 134     comment_count:  Number of comments on the video
 135     age_limit:      Age restriction for the video, as an integer (years)
 136     webpage_url:    The url to the video webpage, if given to youtube-dl it
 137                     should allow to get the same result again. (It will be set
 138                     by YoutubeDL if it's missing)
 139     categories:     A list of categories that the video falls in, for example
 140                     ["Sports", "Berlin"]
 141     is_live:        True, False, or None (=unknown). Whether this video is a
 142                     live stream that goes on instead of a fixed-length video.
 143
 144     Unless mentioned otherwise, the fields should be Unicode strings.
 145
 146     Unless mentioned otherwise, None is equivalent to absence of information.
 147
 148     Subclasses of this one should re-define the _real_initialize() and
 149     _real_extract() methods and define a _VALID_URL regexp.
 150     Probably, they should also be added to the list of extractors.
 151
 152     Finally, the _WORKING attribute should be set to False for broken IEs
 153     in order to warn the users and skip the tests.
 154     """
 155
    # Set to True by initialize() once _real_initialize() has run.
    _ready = False
    # Downloader object assigned through set_downloader(); None until then.
    _downloader = None
    # Value returned by working(); set to False on broken extractors.
    _WORKING = True
 159
 160     def __init__(self, downloader=None):
 161         """Constructor. Receives an optional downloader."""
 162         self._ready = False
 163         self.set_downloader(downloader)
 164
 165     @classmethod
 166     def suitable(cls, url):
 167         """Receives a URL and returns True if suitable for this IE."""
 168
 169         # This does not use has/getattr intentionally - we want to know whether
 170         # we have cached the regexp for *this* class, whereas getattr would also
 171         # match the superclass
 172         if '_VALID_URL_RE' not in cls.__dict__:
 173             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 174         return cls._VALID_URL_RE.match(url) is not None
 175
 176     @classmethod
 177     def _match_id(cls, url):
 178         if '_VALID_URL_RE' not in cls.__dict__:
 179             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 180         m = cls._VALID_URL_RE.match(url)
 181         assert m
 182         return m.group('id')
 183
 184     @classmethod
 185     def working(cls):
 186         """Getter method for _WORKING."""
 187         return cls._WORKING
 188
 189     def initialize(self):
 190         """Initializes an instance (authentication, etc)."""
 191         if not self._ready:
 192             self._real_initialize()
 193             self._ready = True
 194
 195     def extract(self, url):
 196         """Extracts URL information and returns it in list of dicts."""
 197         self.initialize()
 198         return self._real_extract(url)
 199
 200     def set_downloader(self, downloader):
 201         """Sets the downloader for this IE."""
 202         self._downloader = downloader
 203
 204     def _real_initialize(self):
 205         """Real initialization process. Redefine in subclasses."""
 206         pass
 207
 208     def _real_extract(self, url):
 209         """Real extraction process. Redefine in subclasses."""
 210         pass
 211
 212     @classmethod
 213     def ie_key(cls):
 214         """A string for getting the InfoExtractor with get_info_extractor"""
 215         return cls.__name__[:-2]
 216
 217     @property
 218     def IE_NAME(self):
 219         return type(self).__name__[:-2]
 220
 221     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 222         """ Returns the response handle """
 223         if note is None:
 224             self.report_download_webpage(video_id)
 225         elif note is not False:
 226             if video_id is None:
 227                 self.to_screen('%s' % (note,))
 228             else:
 229                 self.to_screen('%s: %s' % (video_id, note))
 230         try:
 231             return self._downloader.urlopen(url_or_request)
 232         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 233             if errnote is False:
 234                 return False
 235             if errnote is None:
 236                 errnote = 'Unable to download webpage'
 237             errmsg = '%s: %s' % (errnote, compat_str(err))
 238             if fatal:
 239                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 240             else:
 241                 self._downloader.report_warning(errmsg)
 242                 return False
 243
 244     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 245         """ Returns a tuple (page content as string, URL handle) """
 246         # Strip hashes from the URL (#1038)
 247         if isinstance(url_or_request, (compat_str, str)):
 248             url_or_request = url_or_request.partition('#')[0]
 249
 250         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
 251         if urlh is False:
 252             assert not fatal
 253             return False
 254         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
 255         return (content, urlh)
 256
 257     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True):
 258         content_type = urlh.headers.get('Content-Type', '')
 259         webpage_bytes = urlh.read()
 260         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 261         if m:
 262             encoding = m.group(1)
 263         else:
 264             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 265                           webpage_bytes[:1024])
 266             if m:
 267                 encoding = m.group(1).decode('ascii')
 268             elif webpage_bytes.startswith(b'\xff\xfe'):
 269                 encoding = 'utf-16'
 270             else:
 271                 encoding = 'utf-8'
 272         if self._downloader.params.get('dump_intermediate_pages', False):
 273             try:
 274                 url = url_or_request.get_full_url()
 275             except AttributeError:
 276                 url = url_or_request
 277             self.to_screen('Dumping request to ' + url)
 278             dump = base64.b64encode(webpage_bytes).decode('ascii')
 279             self._downloader.to_screen(dump)
 280         if self._downloader.params.get('write_pages', False):
 281             try:
 282                 url = url_or_request.get_full_url()
 283             except AttributeError:
 284                 url = url_or_request
 285             basen = '%s_%s' % (video_id, url)
 286             if len(basen) > 240:
 287                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 288                 basen = basen[:240 - len(h)] + h
 289             raw_filename = basen + '.dump'
 290             filename = sanitize_filename(raw_filename, restricted=True)
 291             self.to_screen('Saving request to ' + filename)
 292             # Working around MAX_PATH limitation on Windows (see
 293             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 294             if os.name == 'nt':
 295                 absfilepath = os.path.abspath(filename)
 296                 if len(absfilepath) > 259:
 297                     filename = '\\\\?\\' + absfilepath
 298             with open(filename, 'wb') as outf:
 299                 outf.write(webpage_bytes)
 300
 301         try:
 302             content = webpage_bytes.decode(encoding, 'replace')
 303         except LookupError:
 304             content = webpage_bytes.decode('utf-8', 'replace')
 305
 306         if ('<title>Access to this site is blocked</title>' in content and
 307                 'Websense' in content[:512]):
 308             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 309             blocked_iframe = self._html_search_regex(
 310                 r'<iframe src="([^"]+)"', content,
 311                 'Websense information URL', default=None)
 312             if blocked_iframe:
 313                 msg += ' Visit %s for more details' % blocked_iframe
 314             raise ExtractorError(msg, expected=True)
 315
 316         return content
 317
 318     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
 319         """ Returns the data of the page as a string """
 320         res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
 321         if res is False:
 322             return res
 323         else:
 324             content, _ = res
 325             return content
 326
 327     def _download_xml(self, url_or_request, video_id,
 328                       note='Downloading XML', errnote='Unable to download XML',
 329                       transform_source=None, fatal=True):
 330         """Return the xml as an xml.etree.ElementTree.Element"""
 331         xml_string = self._download_webpage(
 332             url_or_request, video_id, note, errnote, fatal=fatal)
 333         if xml_string is False:
 334             return xml_string
 335         if transform_source:
 336             xml_string = transform_source(xml_string)
 337         return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
 338
 339     def _download_json(self, url_or_request, video_id,
 340                        note='Downloading JSON metadata',
 341                        errnote='Unable to download JSON metadata',
 342                        transform_source=None,
 343                        fatal=True):
 344         json_string = self._download_webpage(
 345             url_or_request, video_id, note, errnote, fatal=fatal)
 346         if (not fatal) and json_string is False:
 347             return None
 348         if transform_source:
 349             json_string = transform_source(json_string)
 350         try:
 351             return json.loads(json_string)
 352         except ValueError as ve:
 353             errmsg = '%s: Failed to parse JSON ' % video_id
 354             if fatal:
 355                 raise ExtractorError(errmsg, cause=ve)
 356             else:
 357                 self.report_warning(errmsg + str(ve))
 358
 359     def report_warning(self, msg, video_id=None):
 360         idstr = '' if video_id is None else '%s: ' % video_id
 361         self._downloader.report_warning(
 362             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 363
 364     def to_screen(self, msg):
 365         """Print msg to screen, prefixing it with '[ie_name]'"""
 366         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 367
 368     def report_extraction(self, id_or_name):
 369         """Report information extraction."""
 370         self.to_screen('%s: Extracting information' % id_or_name)
 371
 372     def report_download_webpage(self, video_id):
 373         """Report webpage download."""
 374         self.to_screen('%s: Downloading webpage' % video_id)
 375
 376     def report_age_confirmation(self):
 377         """Report attempt to confirm age."""
 378         self.to_screen('Confirming age')
 379
 380     def report_login(self):
 381         """Report attempt to log in."""
 382         self.to_screen('Logging in')
 383
 384     #Methods for following #608
 385     @staticmethod
 386     def url_result(url, ie=None, video_id=None):
 387         """Returns a url that points to a page that should be processed"""
 388         #TODO: ie should be the class used for getting the info
 389         video_info = {'_type': 'url',
 390                       'url': url,
 391                       'ie_key': ie}
 392         if video_id is not None:
 393             video_info['id'] = video_id
 394         return video_info
 395     @staticmethod
 396     def playlist_result(entries, playlist_id=None, playlist_title=None):
 397         """Returns a playlist"""
 398         video_info = {'_type': 'playlist',
 399                       'entries': entries}
 400         if playlist_id:
 401             video_info['id'] = playlist_id
 402         if playlist_title:
 403             video_info['title'] = playlist_title
 404         return video_info
 405
 406     def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 407         """
 408         Perform a regex search on the given string, using a single or a list of
 409         patterns returning the first matching group.
 410         In case of failure return a default value or raise a WARNING or a
 411         RegexNotFoundError, depending on fatal, specifying the field name.
 412         """
 413         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 414             mobj = re.search(pattern, string, flags)
 415         else:
 416             for p in pattern:
 417                 mobj = re.search(p, string, flags)
 418                 if mobj:
 419                     break
 420
 421         if os.name != 'nt' and sys.stderr.isatty():
 422             _name = '\033[0;34m%s\033[0m' % name
 423         else:
 424             _name = name
 425
 426         if mobj:
 427             # return the first matching group
 428             return next(g for g in mobj.groups() if g is not None)
 429         elif default is not _NO_DEFAULT:
 430             return default
 431         elif fatal:
 432             raise RegexNotFoundError('Unable to extract %s' % _name)
 433         else:
 434             self._downloader.report_warning('unable to extract %s; '
 435                 'please report this issue on http://yt-dl.org/bug' % _name)
 436             return None
 437
 438     def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
 439         """
 440         Like _search_regex, but strips HTML tags and unescapes entities.
 441         """
 442         res = self._search_regex(pattern, string, name, default, fatal, flags)
 443         if res:
 444             return clean_html(res).strip()
 445         else:
 446             return res
 447
 448     def _get_login_info(self):
 449         """
 450         Get the the login info as (username, password)
 451         It will look in the netrc file using the _NETRC_MACHINE value
 452         If there's no info available, return (None, None)
 453         """
 454         if self._downloader is None:
 455             return (None, None)
 456
 457         username = None
 458         password = None
 459         downloader_params = self._downloader.params
 460
 461         # Attempt to use provided username and password or .netrc data
 462         if downloader_params.get('username', None) is not None:
 463             username = downloader_params['username']
 464             password = downloader_params['password']
 465         elif downloader_params.get('usenetrc', False):
 466             try:
 467                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 468                 if info is not None:
 469                     username = info[0]
 470                     password = info[2]
 471                 else:
 472                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 473             except (IOError, netrc.NetrcParseError) as err:
 474                 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
 475
 476         return (username, password)
 477
 478     def _get_tfa_info(self):
 479         """
 480         Get the two-factor authentication info
 481         TODO - asking the user will be required for sms/phone verify
 482         currently just uses the command line option
 483         If there's no info available, return None
 484         """
 485         if self._downloader is None:
 486             return None
 487         downloader_params = self._downloader.params
 488
 489         if downloader_params.get('twofactor', None) is not None:
 490             return downloader_params['twofactor']
 491
 492         return None
 493
 494     # Helper functions for extracting OpenGraph info
 495     @staticmethod
 496     def _og_regexes(prop):
 497         content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
 498         property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
 499         template = r'<meta[^>]+?%s[^>]+?%s'
 500         return [
 501             template % (property_re, content_re),
 502             template % (content_re, property_re),
 503         ]
 504
 505     def _og_search_property(self, prop, html, name=None, **kargs):
 506         if name is None:
 507             name = 'OpenGraph %s' % prop
 508         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
 509         if escaped is None:
 510             return None
 511         return unescapeHTML(escaped)
 512
 513     def _og_search_thumbnail(self, html, **kargs):
 514         return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
 515
 516     def _og_search_description(self, html, **kargs):
 517         return self._og_search_property('description', html, fatal=False, **kargs)
 518
 519     def _og_search_title(self, html, **kargs):
 520         return self._og_search_property('title', html, **kargs)
 521
 522     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 523         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 524         if secure:
 525             regexes = self._og_regexes('video:secure_url') + regexes
 526         return self._html_search_regex(regexes, html, name, **kargs)
 527
 528     def _og_search_url(self, html, **kargs):
 529         return self._og_search_property('url', html, **kargs)
 530
 531     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 532         if display_name is None:
 533             display_name = name
 534         return self._html_search_regex(
 535             r'''(?ix)<meta
 536                     (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
 537                     [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
 538             html, display_name, fatal=fatal, **kwargs)
 539
 540     def _dc_search_uploader(self, html):
 541         return self._html_search_meta('dc.creator', html, 'uploader')
 542
 543     def _rta_search(self, html):
 544         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 545         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 546                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 547                      html):
 548             return 18
 549         return 0
 550
 551     def _media_rating_search(self, html):
 552         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 553         rating = self._html_search_meta('rating', html)
 554
 555         if not rating:
 556             return None
 557
 558         RATING_TABLE = {
 559             'safe for kids': 0,
 560             'general': 8,
 561             '14 years': 14,
 562             'mature': 17,
 563             'restricted': 19,
 564         }
 565         return RATING_TABLE.get(rating.lower(), None)
 566
 567     def _twitter_search_player(self, html):
 568         return self._html_search_meta('twitter:player', html,
 569             'twitter card player')
 570
 571     def _sort_formats(self, formats):
 572         if not formats:
 573             raise ExtractorError('No video formats found')
 574
 575         def _formats_key(f):
 576             # TODO remove the following workaround
 577             from ..utils import determine_ext
 578             if not f.get('ext') and 'url' in f:
 579                 f['ext'] = determine_ext(f['url'])
 580
 581             preference = f.get('preference')
 582             if preference is None:
 583                 proto = f.get('protocol')
 584                 if proto is None:
 585                     proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
 586
 587                 preference = 0 if proto in ['http', 'https'] else -0.1
 588                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
 589                     preference -= 0.5
 590
 591             if f.get('vcodec') == 'none':  # audio only
 592                 if self._downloader.params.get('prefer_free_formats'):
 593                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
 594                 else:
 595                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
 596                 ext_preference = 0
 597                 try:
 598                     audio_ext_preference = ORDER.index(f['ext'])
 599                 except ValueError:
 600                     audio_ext_preference = -1
 601             else:
 602                 if self._downloader.params.get('prefer_free_formats'):
 603                     ORDER = ['flv', 'mp4', 'webm']
 604                 else:
 605                     ORDER = ['webm', 'flv', 'mp4']
 606                 try:
 607                     ext_preference = ORDER.index(f['ext'])
 608                 except ValueError:
 609                     ext_preference = -1
 610                 audio_ext_preference = 0
 611
 612             return (
 613                 preference,
 614                 f.get('quality') if f.get('quality') is not None else -1,
 615                 f.get('height') if f.get('height') is not None else -1,
 616                 f.get('width') if f.get('width') is not None else -1,
 617                 ext_preference,
 618                 f.get('tbr') if f.get('tbr') is not None else -1,
 619                 f.get('vbr') if f.get('vbr') is not None else -1,
 620                 f.get('abr') if f.get('abr') is not None else -1,
 621                 audio_ext_preference,
 622                 f.get('fps') if f.get('fps') is not None else -1,
 623                 f.get('filesize') if f.get('filesize') is not None else -1,
 624                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
 625                 f.get('source_preference') if f.get('source_preference') is not None else -1,
 626                 f.get('format_id'),
 627             )
 628         formats.sort(key=_formats_key)
 629
 630     def http_scheme(self):
 631         """ Either "http:" or "https:", depending on the user's preferences """
 632         return (
 633             'http:'
 634             if self._downloader.params.get('prefer_insecure', False)
 635             else 'https:')
 636
 637     def _proto_relative_url(self, url, scheme=None):
 638         if url is None:
 639             return url
 640         if url.startswith('//'):
 641             if scheme is None:
 642                 scheme = self.http_scheme()
 643             return scheme + url
 644         else:
 645             return url
 646
 647     def _sleep(self, timeout, video_id, msg_template=None):
 648         if msg_template is None:
 649             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
 650         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
 651         self.to_screen(msg)
 652         time.sleep(timeout)
 653
 654     def _extract_f4m_formats(self, manifest_url, video_id):
 655         manifest = self._download_xml(
 656             manifest_url, video_id, 'Downloading f4m manifest',
 657             'Unable to download f4m manifest')
 658
 659         formats = []
 660         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
 661         for i, media_el in enumerate(media_nodes):
 662             tbr = int_or_none(media_el.attrib.get('bitrate'))
 663             format_id = 'f4m-%d' % (i if tbr is None else tbr)
 664             formats.append({
 665                 'format_id': format_id,
 666                 'url': manifest_url,
 667                 'ext': 'flv',
 668                 'tbr': tbr,
 669                 'width': int_or_none(media_el.attrib.get('width')),
 670                 'height': int_or_none(media_el.attrib.get('height')),
 671             })
 672         self._sort_formats(formats)
 673
 674         return formats
 675
 676     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
 677                               entry_protocol='m3u8', preference=None):
 678
 679         formats = [{
 680             'format_id': 'm3u8-meta',
 681             'url': m3u8_url,
 682             'ext': ext,
 683             'protocol': 'm3u8',
 684             'preference': -1,
 685             'resolution': 'multiple',
 686             'format_note': 'Quality selection URL',
 687         }]
 688
 689         format_url = lambda u: (
 690             u
 691             if re.match(r'^https?://', u)
 692             else compat_urlparse.urljoin(m3u8_url, u))
 693
 694         m3u8_doc = self._download_webpage(
 695             m3u8_url, video_id,
 696             note='Downloading m3u8 information',
 697             errnote='Failed to download m3u8 information')
 698         last_info = None
 699         kv_rex = re.compile(
 700             r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
 701         for line in m3u8_doc.splitlines():
 702             if line.startswith('#EXT-X-STREAM-INF:'):
 703                 last_info = {}
 704                 for m in kv_rex.finditer(line):
 705                     v = m.group('val')
 706                     if v.startswith('"'):
 707                         v = v[1:-1]
 708                     last_info[m.group('key')] = v
 709             elif line.startswith('#') or not line.strip():
 710                 continue
 711             else:
 712                 if last_info is None:
 713                     formats.append({'url': format_url(line)})
 714                     continue
 715                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
 716
 717                 f = {
 718                     'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
 719                     'url': format_url(line.strip()),
 720                     'tbr': tbr,
 721                     'ext': ext,
 722                     'protocol': entry_protocol,
 723                     'preference': preference,
 724                 }
 725                 codecs = last_info.get('CODECS')
 726                 if codecs:
 727                     # TODO: looks like video codec is not always necessarily goes first
 728                     va_codecs = codecs.split(',')
 729                     if va_codecs[0]:
 730                         f['vcodec'] = va_codecs[0].partition('.')[0]
 731                     if len(va_codecs) > 1 and va_codecs[1]:
 732                         f['acodec'] = va_codecs[1].partition('.')[0]
 733                 resolution = last_info.get('RESOLUTION')
 734                 if resolution:
 735                     width_str, height_str = resolution.split('x')
 736                     f['width'] = int(width_str)
 737                     f['height'] = int(height_str)
 738                 formats.append(f)
 739                 last_info = {}
 740         self._sort_formats(formats)
 741         return formats
 742
 743     def _live_title(self, name):
 744         """ Generate the title for a live video """
 745         now = datetime.datetime.now()
 746         now_str = now.strftime("%Y-%m-%d %H:%M")
 747         return name + ' ' + now_str
 748
 749     def _int(self, v, name, fatal=False, **kwargs):
 750         res = int_or_none(v, **kwargs)
 751         if 'get_attr' in kwargs:
 752             print(getattr(v, kwargs['get_attr']))
 753         if res is None:
 754             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 755             if fatal:
 756                 raise ExtractorError(msg)
 757             else:
 758                 self._downloader.report_warning(msg)
 759         return res
 760
 761     def _float(self, v, name, fatal=False, **kwargs):
 762         res = float_or_none(v, **kwargs)
 763         if res is None:
 764             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
 765             if fatal:
 766                 raise ExtractorError(msg)
 767             else:
 768                 self._downloader.report_warning(msg)
 769         return res
 770
 771
 772 class SearchInfoExtractor(InfoExtractor):
 773     """
 774     Base class for paged search queries extractors.
 775     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
 776     Instances should define _SEARCH_KEY and _MAX_RESULTS.
 777     """
 778
 779     @classmethod
 780     def _make_valid_url(cls):
 781         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
 782
 783     @classmethod
 784     def suitable(cls, url):
 785         return re.match(cls._make_valid_url(), url) is not None
 786
 787     def _real_extract(self, query):
 788         mobj = re.match(self._make_valid_url(), query)
 789         if mobj is None:
 790             raise ExtractorError('Invalid search query "%s"' % query)
 791
 792         prefix = mobj.group('prefix')
 793         query = mobj.group('query')
 794         if prefix == '':
 795             return self._get_n_results(query, 1)
 796         elif prefix == 'all':
 797             return self._get_n_results(query, self._MAX_RESULTS)
 798         else:
 799             n = int(prefix)
 800             if n <= 0:
 801                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
 802             elif n > self._MAX_RESULTS:
 803                 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
 804                 n = self._MAX_RESULTS
 805             return self._get_n_results(query, n)
 806
 807     def _get_n_results(self, query, n):
 808         """Get a specified number of results for a query"""
 809         raise NotImplementedError("This method must be implemented by subclasses")
 810
 811     @property
 812     def SEARCH_KEY(self):
 813         return self._SEARCH_KEY