]> Raphaƫl G. Git Repositories - youtubedl/blobdiff - youtube_dl/extractor/common.py
Add note about mplayer2 and backporters.
[youtubedl] / youtube_dl / extractor / common.py
index df1a4417bb99fe6cf415b293ce068787d1568dc8..5e263f8b5a2cf46fbb26e928f5df85c87c42dfde 100644 (file)
@@ -10,30 +10,39 @@ import re
 import socket
 import sys
 import time
-import xml.etree.ElementTree
 
 from ..compat import (
     compat_cookiejar,
-    compat_HTTPError,
+    compat_cookies,
+    compat_getpass,
     compat_http_client,
     compat_urllib_error,
+    compat_urllib_parse,
     compat_urllib_parse_urlparse,
+    compat_urllib_request,
     compat_urlparse,
     compat_str,
+    compat_etree_fromstring,
 )
 from ..utils import (
+    NO_DEFAULT,
     age_restricted,
+    bug_reports_message,
     clean_html,
     compiled_regex_type,
+    determine_ext,
     ExtractorError,
+    fix_xml_ampersands,
     float_or_none,
-    HEADRequest,
     int_or_none,
     RegexNotFoundError,
     sanitize_filename,
     unescapeHTML,
+    unified_strdate,
+    url_basename,
+    xpath_text,
+    xpath_with_ns,
 )
-_NO_DEFAULT = object()
 
 
 class InfoExtractor(object):
@@ -47,7 +56,7 @@ class InfoExtractor(object):
     information possibly downloading the video to the file system, among
     other possible outcomes.
 
-    The type field determines the the type of the result.
+    The type field determines the type of the result.
     By far the most common value (and the default if _type is missing) is
     "video", which indicates a single video.
 
@@ -63,7 +72,7 @@ class InfoExtractor(object):
 
                     Potential fields:
                     * url        Mandatory. The URL of the video file
-                    * ext        Will be calculated from url if missing
+                    * ext        Will be calculated from URL if missing
                     * format     A human-readable description of the format
                                  ("mp4 container with h264/opus").
                                  Calculated from the format_id, width, height.
@@ -111,11 +120,8 @@ class InfoExtractor(object):
                                   (quality takes higher priority)
                                  -1 for default (order by other properties),
                                  -2 or smaller for less than default.
-                    * http_method  HTTP method to use for the download.
                     * http_headers  A dictionary of additional HTTP headers
                                  to add to the request.
-                    * http_post_data  Additional data to send with a POST
-                                 request.
                     * stretched_ratio  If given and not 1, indicates that the
                                  video's pixels are not square.
                                  width : height ratio as float.
@@ -146,17 +152,27 @@ class InfoExtractor(object):
     description:    Full video description.
     uploader:       Full name of the video uploader.
     creator:        The main artist who created the video.
+    release_date:   The date (YYYYMMDD) when the video was released.
     timestamp:      UNIX timestamp of the moment the video became available.
     upload_date:    Video upload date (YYYYMMDD).
                     If not explicitly set, calculated from timestamp.
     uploader_id:    Nickname or id of the video uploader.
     location:       Physical location where the video was filmed.
-    subtitles:      The subtitle file contents as a dictionary in the format
-                    {language: subtitles}.
+    subtitles:      The available subtitles as a dictionary in the format
+                    {language: subformats}. "subformats" is a list sorted from
+                    lower to higher preference, each element is a dictionary
+                    with the "ext" entry and one of:
+                        * "data": The subtitles file contents
+                        * "url": A URL pointing to the subtitles file
+                    "ext" will be calculated from URL if missing
+    automatic_captions: Like 'subtitles', used by the YoutubeIE for
+                    automatically generated captions
     duration:       Length of the video in seconds, as an integer.
     view_count:     How many users have watched the video on the platform.
     like_count:     Number of positive ratings of the video
     dislike_count:  Number of negative ratings of the video
+    repost_count:   Number of reposts of the video
+    average_rating: Average rating give by users, the scale used depends on the webpage
     comment_count:  Number of comments on the video
     comments:       A list of comments, each with one or more of the following
                     properties (all but one of text or html optional):
@@ -170,13 +186,18 @@ class InfoExtractor(object):
                                      Set to "root" to indicate that this is a
                                      comment to the original video.
     age_limit:      Age restriction for the video, as an integer (years)
-    webpage_url:    The url to the video webpage, if given to youtube-dl it
+    webpage_url:    The URL to the video webpage, if given to youtube-dl it
                     should allow to get the same result again. (It will be set
                     by YoutubeDL if it's missing)
     categories:     A list of categories that the video falls in, for example
                     ["Sports", "Berlin"]
+    tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
     is_live:        True, False, or None (=unknown). Whether this video is a
                     live stream that goes on instead of a fixed-length video.
+    start_time:     Time in seconds where the reproduction should start, as
+                    specified in the URL.
+    end_time:       Time in seconds where the reproduction should end, as
+                    specified in the URL.
 
     Unless mentioned otherwise, the fields should be Unicode strings.
 
@@ -187,8 +208,8 @@ class InfoExtractor(object):
     There must be a key "entries", which is a list, an iterable, or a PagedList
     object, each element of which is a valid dictionary by this specification.
 
-    Additionally, playlists can have "title" and "id" attributes with the same
-    semantics as videos (see above).
+    Additionally, playlists can have "title", "description" and "id" attributes
+    with the same semantics as videos (see above).
 
 
     _type "multi_video" indicates that there are multiple videos that
@@ -264,8 +285,15 @@ class InfoExtractor(object):
 
     def extract(self, url):
         """Extracts URL information and returns it in list of dicts."""
-        self.initialize()
-        return self._real_extract(url)
+        try:
+            self.initialize()
+            return self._real_extract(url)
+        except ExtractorError:
+            raise
+        except compat_http_client.IncompleteRead as e:
+            raise ExtractorError('A network error has occured.', cause=e, expected=True)
+        except (KeyError, StopIteration) as e:
+            raise ExtractorError('An extractor error has occured.', cause=e)
 
     def set_downloader(self, downloader):
         """Sets the downloader for this IE."""
@@ -282,11 +310,11 @@ class InfoExtractor(object):
     @classmethod
     def ie_key(cls):
         """A string for getting the InfoExtractor with get_info_extractor"""
-        return cls.__name__[:-2]
+        return compat_str(cls.__name__[:-2])
 
     @property
     def IE_NAME(self):
-        return type(self).__name__[:-2]
+        return compat_str(type(self).__name__[:-2])
 
     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
         """ Returns the response handle """
@@ -311,7 +339,7 @@ class InfoExtractor(object):
                 self._downloader.report_warning(errmsg)
                 return False
 
-    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
+    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
         """ Returns a tuple (page content as string, URL handle) """
         # Strip hashes from the URL (#1038)
         if isinstance(url_or_request, (compat_str, str)):
@@ -321,14 +349,11 @@ class InfoExtractor(object):
         if urlh is False:
             assert not fatal
             return False
-        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
+        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
         return (content, urlh)
 
-    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None):
-        content_type = urlh.headers.get('Content-Type', '')
-        webpage_bytes = urlh.read()
-        if prefix is not None:
-            webpage_bytes = prefix + webpage_bytes
+    @staticmethod
+    def _guess_encoding_from_content(content_type, webpage_bytes):
         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
         if m:
             encoding = m.group(1)
@@ -341,6 +366,16 @@ class InfoExtractor(object):
                 encoding = 'utf-16'
             else:
                 encoding = 'utf-8'
+
+        return encoding
+
+    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
+        content_type = urlh.headers.get('Content-Type', '')
+        webpage_bytes = urlh.read()
+        if prefix is not None:
+            webpage_bytes = prefix + webpage_bytes
+        if not encoding:
+            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
         if self._downloader.params.get('dump_intermediate_pages', False):
             try:
                 url = url_or_request.get_full_url()
@@ -384,16 +419,26 @@ class InfoExtractor(object):
             if blocked_iframe:
                 msg += ' Visit %s for more details' % blocked_iframe
             raise ExtractorError(msg, expected=True)
+        if '<title>The URL you requested has been blocked</title>' in content[:512]:
+            msg = (
+                'Access to this webpage has been blocked by Indian censorship. '
+                'Use a VPN or proxy server (with --proxy) to route around it.')
+            block_msg = self._html_search_regex(
+                r'</h1><p>(.*?)</p>',
+                content, 'block message', default=None)
+            if block_msg:
+                msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
+            raise ExtractorError(msg, expected=True)
 
         return content
 
-    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5):
+    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
         """ Returns the data of the page as a string """
         success = False
         try_count = 0
         while success is False:
             try:
-                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
+                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
                 success = True
             except compat_http_client.IncompleteRead as e:
                 try_count += 1
@@ -408,23 +453,24 @@ class InfoExtractor(object):
 
     def _download_xml(self, url_or_request, video_id,
                       note='Downloading XML', errnote='Unable to download XML',
-                      transform_source=None, fatal=True):
+                      transform_source=None, fatal=True, encoding=None):
         """Return the xml as an xml.etree.ElementTree.Element"""
         xml_string = self._download_webpage(
-            url_or_request, video_id, note, errnote, fatal=fatal)
+            url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
         if xml_string is False:
             return xml_string
         if transform_source:
             xml_string = transform_source(xml_string)
-        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
+        return compat_etree_fromstring(xml_string.encode('utf-8'))
 
     def _download_json(self, url_or_request, video_id,
                        note='Downloading JSON metadata',
                        errnote='Unable to download JSON metadata',
                        transform_source=None,
-                       fatal=True):
+                       fatal=True, encoding=None):
         json_string = self._download_webpage(
-            url_or_request, video_id, note, errnote, fatal=fatal)
+            url_or_request, video_id, note, errnote, fatal=fatal,
+            encoding=encoding)
         if (not fatal) and json_string is False:
             return None
         return self._parse_json(
@@ -467,16 +513,30 @@ class InfoExtractor(object):
         """Report attempt to log in."""
         self.to_screen('Logging in')
 
+    @staticmethod
+    def raise_login_required(msg='This video is only available for registered users'):
+        raise ExtractorError(
+            '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
+            expected=True)
+
+    @staticmethod
+    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
+        raise ExtractorError(
+            '%s. You might want to use --proxy to workaround.' % msg,
+            expected=True)
+
     # Methods for following #608
     @staticmethod
-    def url_result(url, ie=None, video_id=None):
-        """Returns a url that points to a page that should be processed"""
+    def url_result(url, ie=None, video_id=None, video_title=None):
+        """Returns a URL that points to a page that should be processed"""
         # TODO: ie should be the class used for getting the info
         video_info = {'_type': 'url',
                       'url': url,
                       'ie_key': ie}
         if video_id is not None:
             video_info['id'] = video_id
+        if video_title is not None:
+            video_info['title'] = video_title
         return video_info
 
     @staticmethod
@@ -492,7 +552,7 @@ class InfoExtractor(object):
             video_info['description'] = playlist_description
         return video_info
 
-    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
+    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
         """
         Perform a regex search on the given string, using a single or a list of
         patterns returning the first matching group.
@@ -507,7 +567,7 @@ class InfoExtractor(object):
                 if mobj:
                     break
 
-        if os.name != 'nt' and sys.stderr.isatty():
+        if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
             _name = '\033[0;34m%s\033[0m' % name
         else:
             _name = name
@@ -518,16 +578,15 @@ class InfoExtractor(object):
                 return next(g for g in mobj.groups() if g is not None)
             else:
                 return mobj.group(group)
-        elif default is not _NO_DEFAULT:
+        elif default is not NO_DEFAULT:
             return default
         elif fatal:
             raise RegexNotFoundError('Unable to extract %s' % _name)
         else:
-            self._downloader.report_warning('unable to extract %s; '
-                                            'please report this issue on http://yt-dl.org/bug' % _name)
+            self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
             return None
 
-    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
+    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
         """
         Like _search_regex, but strips HTML tags and unescapes entities.
         """
@@ -539,7 +598,7 @@ class InfoExtractor(object):
 
     def _get_login_info(self):
         """
-        Get the the login info as (username, password)
+        Get the login info as (username, password)
         It will look in the netrc file using the _NETRC_MACHINE value
         If there's no info available, return (None, None)
         """
@@ -567,7 +626,7 @@ class InfoExtractor(object):
 
         return (username, password)
 
-    def _get_tfa_info(self):
+    def _get_tfa_info(self, note='two-factor verification code'):
         """
         Get the two-factor authentication info
         TODO - asking the user will be required for sms/phone verify
@@ -581,19 +640,26 @@ class InfoExtractor(object):
         if downloader_params.get('twofactor', None) is not None:
             return downloader_params['twofactor']
 
-        return None
+        return compat_getpass('Type %s and press [Return]: ' % note)
 
     # Helper functions for extracting OpenGraph info
     @staticmethod
     def _og_regexes(prop):
-        content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
-        property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
+        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
+        property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
+                       % {'prop': re.escape(prop)})
         template = r'<meta[^>]+?%s[^>]+?%s'
         return [
             template % (property_re, content_re),
             template % (content_re, property_re),
         ]
 
+    @staticmethod
+    def _meta_regex(prop):
+        return r'''(?isx)<meta
+                    (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
+                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
+
     def _og_search_property(self, prop, html, name=None, **kargs):
         if name is None:
             name = 'OpenGraph %s' % prop
@@ -603,7 +669,7 @@ class InfoExtractor(object):
         return unescapeHTML(escaped)
 
     def _og_search_thumbnail(self, html, **kargs):
-        return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
+        return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 
     def _og_search_description(self, html, **kargs):
         return self._og_search_property('description', html, fatal=False, **kargs)
@@ -624,9 +690,7 @@ class InfoExtractor(object):
         if display_name is None:
             display_name = name
         return self._html_search_regex(
-            r'''(?isx)<meta
-                    (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
-                    [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(name),
+            self._meta_regex(name),
             html, display_name, fatal=fatal, group='content', **kwargs)
 
     def _dc_search_uploader(self, html):
@@ -656,11 +720,48 @@ class InfoExtractor(object):
         }
         return RATING_TABLE.get(rating.lower(), None)
 
+    def _family_friendly_search(self, html):
+        # See http://schema.org/VideoObject
+        family_friendly = self._html_search_meta('isFamilyFriendly', html)
+
+        if not family_friendly:
+            return None
+
+        RATING_TABLE = {
+            '1': 0,
+            'true': 0,
+            '0': 18,
+            'false': 18,
+        }
+        return RATING_TABLE.get(family_friendly.lower(), None)
+
     def _twitter_search_player(self, html):
         return self._html_search_meta('twitter:player', html,
                                       'twitter card player')
 
-    def _sort_formats(self, formats):
+    @staticmethod
+    def _hidden_inputs(html):
+        html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
+        hidden_inputs = {}
+        for input in re.findall(r'(?i)<input([^>]+)>', html):
+            if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
+                continue
+            name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
+            if not name:
+                continue
+            value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
+            if not value:
+                continue
+            hidden_inputs[name.group('value')] = value.group('value')
+        return hidden_inputs
+
+    def _form_hidden_inputs(self, form_id, html):
+        form = self._search_regex(
+            r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
+            html, '%s form' % form_id, group='form')
+        return self._hidden_inputs(form)
+
+    def _sort_formats(self, formats, field_preference=None):
         if not formats:
             raise ExtractorError('No video formats found')
 
@@ -670,6 +771,9 @@ class InfoExtractor(object):
             if not f.get('ext') and 'url' in f:
                 f['ext'] = determine_ext(f['url'])
 
+            if isinstance(field_preference, (list, tuple)):
+                return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
+
             preference = f.get('preference')
             if preference is None:
                 proto = f.get('protocol')
@@ -706,17 +810,17 @@ class InfoExtractor(object):
                 f.get('language_preference') if f.get('language_preference') is not None else -1,
                 f.get('quality') if f.get('quality') is not None else -1,
                 f.get('tbr') if f.get('tbr') is not None else -1,
+                f.get('filesize') if f.get('filesize') is not None else -1,
                 f.get('vbr') if f.get('vbr') is not None else -1,
-                ext_preference,
                 f.get('height') if f.get('height') is not None else -1,
                 f.get('width') if f.get('width') is not None else -1,
+                ext_preference,
                 f.get('abr') if f.get('abr') is not None else -1,
                 audio_ext_preference,
                 f.get('fps') if f.get('fps') is not None else -1,
-                f.get('filesize') if f.get('filesize') is not None else -1,
                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                 f.get('source_preference') if f.get('source_preference') is not None else -1,
-                f.get('format_id'),
+                f.get('format_id') if f.get('format_id') is not None else '',
             )
         formats.sort(key=_formats_key)
 
@@ -729,15 +833,17 @@ class InfoExtractor(object):
                 formats)
 
     def _is_valid_url(self, url, video_id, item='video'):
+        url = self._proto_relative_url(url, scheme='http:')
+        # For now assume non HTTP(S) URLs always valid
+        if not (url.startswith('http://') or url.startswith('https://')):
+            return True
         try:
-            self._request_webpage(
-                HEADRequest(url), video_id,
-                'Checking %s URL' % item)
+            self._request_webpage(url, video_id, 'Checking %s URL' % item)
             return True
         except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError):
-                self.report_warning(
-                    '%s URL is invalid, skipping' % item, video_id)
+            if isinstance(e.cause, compat_urllib_error.URLError):
+                self.to_screen(
+                    '%s: %s URL is invalid, skipping' % (video_id, item))
                 return False
             raise
 
@@ -765,10 +871,19 @@ class InfoExtractor(object):
         self.to_screen(msg)
         time.sleep(timeout)
 
-    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None):
+    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
+                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
+                             fatal=True):
         manifest = self._download_xml(
             manifest_url, video_id, 'Downloading f4m manifest',
-            'Unable to download f4m manifest')
+            'Unable to download f4m manifest',
+            # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
+            # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
+            transform_source=transform_source,
+            fatal=fatal)
+
+        if manifest is False:
+            return manifest
 
         formats = []
         manifest_version = '1.0'
@@ -778,11 +893,25 @@ class InfoExtractor(object):
             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
         for i, media_el in enumerate(media_nodes):
             if manifest_version == '2.0':
-                manifest_url = ('/'.join(manifest_url.split('/')[:-1]) + '/'
-                                + (media_el.attrib.get('href') or media_el.attrib.get('url')))
+                media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
+                if not media_url:
+                    continue
+                manifest_url = (
+                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
+                    else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url))
+                # If media_url is itself a f4m manifest do the recursive extraction
+                # since bitrates in parent manifest (this one) and media_url manifest
+                # may differ leading to inability to resolve the format by requested
+                # bitrate in f4m downloader
+                if determine_ext(manifest_url) == 'f4m':
+                    f4m_formats = self._extract_f4m_formats(
+                        manifest_url, video_id, preference, f4m_id, fatal=fatal)
+                    if f4m_formats:
+                        formats.extend(f4m_formats)
+                    continue
             tbr = int_or_none(media_el.attrib.get('bitrate'))
             formats.append({
-                'format_id': '-'.join(filter(None, [f4m_id, 'f4m-%d' % (i if tbr is None else tbr)])),
+                'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
                 'url': manifest_url,
                 'ext': 'flv',
                 'tbr': tbr,
@@ -796,14 +925,15 @@ class InfoExtractor(object):
 
     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                               entry_protocol='m3u8', preference=None,
-                              m3u8_id=None):
+                              m3u8_id=None, note=None, errnote=None,
+                              fatal=True):
 
         formats = [{
-            'format_id': '-'.join(filter(None, [m3u8_id, 'm3u8-meta'])),
+            'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
             'url': m3u8_url,
             'ext': ext,
             'protocol': 'm3u8',
-            'preference': -1,
+            'preference': preference - 1 if preference else -1,
             'resolution': 'multiple',
             'format_note': 'Quality selection URL',
         }]
@@ -813,11 +943,17 @@ class InfoExtractor(object):
             if re.match(r'^https?://', u)
             else compat_urlparse.urljoin(m3u8_url, u))
 
-        m3u8_doc = self._download_webpage(
+        res = self._download_webpage_handle(
             m3u8_url, video_id,
-            note='Downloading m3u8 information',
-            errnote='Failed to download m3u8 information')
+            note=note or 'Downloading m3u8 information',
+            errnote=errnote or 'Failed to download m3u8 information',
+            fatal=fatal)
+        if res is False:
+            return res
+        m3u8_doc, urlh = res
+        m3u8_url = urlh.geturl()
         last_info = None
+        last_media = None
         kv_rex = re.compile(
             r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
         for line in m3u8_doc.splitlines():
@@ -828,6 +964,13 @@ class InfoExtractor(object):
                     if v.startswith('"'):
                         v = v[1:-1]
                     last_info[m.group('key')] = v
+            elif line.startswith('#EXT-X-MEDIA:'):
+                last_media = {}
+                for m in kv_rex.finditer(line):
+                    v = m.group('val')
+                    if v.startswith('"'):
+                        v = v[1:-1]
+                    last_media[m.group('key')] = v
             elif line.startswith('#') or not line.strip():
                 continue
             else:
@@ -835,8 +978,13 @@ class InfoExtractor(object):
                     formats.append({'url': format_url(line)})
                     continue
                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
+                format_id = []
+                if m3u8_id:
+                    format_id.append(m3u8_id)
+                last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
+                format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
                 f = {
-                    'format_id': '-'.join(filter(None, [m3u8_id, 'm3u8-%d' % (tbr if tbr else len(formats))])),
+                    'format_id': '-'.join(format_id),
                     'url': format_url(line.strip()),
                     'tbr': tbr,
                     'ext': ext,
@@ -856,57 +1004,246 @@ class InfoExtractor(object):
                     width_str, height_str = resolution.split('x')
                     f['width'] = int(width_str)
                     f['height'] = int(height_str)
+                if last_media is not None:
+                    f['m3u8_media'] = last_media
+                    last_media = None
                 formats.append(f)
                 last_info = {}
         self._sort_formats(formats)
         return formats
 
-    # TODO: improve extraction
-    def _extract_smil_formats(self, smil_url, video_id, fatal=True):
-        smil = self._download_xml(
-            smil_url, video_id, 'Downloading SMIL file',
-            'Unable to download SMIL file', fatal=fatal)
+    @staticmethod
+    def _xpath_ns(path, namespace=None):
+        if not namespace:
+            return path
+        out = []
+        for c in path.split('/'):
+            if not c or c == '.':
+                out.append(c)
+            else:
+                out.append('{%s}%s' % (namespace, c))
+        return '/'.join(out)
+
+    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
+        smil = self._download_smil(smil_url, video_id, fatal=fatal)
+
         if smil is False:
             assert not fatal
             return []
 
-        base = smil.find('./head/meta').get('base')
+        namespace = self._parse_smil_namespace(smil)
+
+        return self._parse_smil_formats(
+            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
+
+    def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
+        smil = self._download_smil(smil_url, video_id, fatal=fatal)
+        if smil is False:
+            return {}
+        return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
+
+    def _download_smil(self, smil_url, video_id, fatal=True):
+        return self._download_xml(
+            smil_url, video_id, 'Downloading SMIL file',
+            'Unable to download SMIL file', fatal=fatal)
+
+    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
+        namespace = self._parse_smil_namespace(smil)
+
+        formats = self._parse_smil_formats(
+            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
+        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
+
+        video_id = os.path.splitext(url_basename(smil_url))[0]
+        title = None
+        description = None
+        upload_date = None
+        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
+            name = meta.attrib.get('name')
+            content = meta.attrib.get('content')
+            if not name or not content:
+                continue
+            if not title and name == 'title':
+                title = content
+            elif not description and name in ('description', 'abstract'):
+                description = content
+            elif not upload_date and name == 'date':
+                upload_date = unified_strdate(content)
+
+        thumbnails = [{
+            'id': image.get('type'),
+            'url': image.get('src'),
+            'width': int_or_none(image.get('width')),
+            'height': int_or_none(image.get('height')),
+        } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
+
+        return {
+            'id': video_id,
+            'title': title or video_id,
+            'description': description,
+            'upload_date': upload_date,
+            'thumbnails': thumbnails,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+    def _parse_smil_namespace(self, smil):
+        return self._search_regex(
+            r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
+
+    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
+        base = smil_url
+        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
+            b = meta.get('base') or meta.get('httpBase')
+            if b:
+                base = b
+                break
 
         formats = []
         rtmp_count = 0
-        for video in smil.findall('./body/switch/video'):
+        http_count = 0
+
+        videos = smil.findall(self._xpath_ns('.//video', namespace))
+        for video in videos:
             src = video.get('src')
             if not src:
                 continue
-            bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
+
+            bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
+            filesize = int_or_none(video.get('size') or video.get('fileSize'))
             width = int_or_none(video.get('width'))
             height = int_or_none(video.get('height'))
             proto = video.get('proto')
-            if not proto:
-                if base:
-                    if base.startswith('rtmp'):
-                        proto = 'rtmp'
-                    elif base.startswith('http'):
-                        proto = 'http'
             ext = video.get('ext')
-            if proto == 'm3u8':
-                formats.extend(self._extract_m3u8_formats(src, video_id, ext))
-            elif proto == 'rtmp':
+            src_ext = determine_ext(src)
+            streamer = video.get('streamer') or base
+
+            if proto == 'rtmp' or streamer.startswith('rtmp'):
                 rtmp_count += 1
-                streamer = video.get('streamer') or base
                 formats.append({
                     'url': streamer,
                     'play_path': src,
                     'ext': 'flv',
                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                     'tbr': bitrate,
+                    'filesize': filesize,
+                    'width': width,
+                    'height': height,
+                })
+                if transform_rtmp_url:
+                    streamer, src = transform_rtmp_url(streamer, src)
+                    formats[-1].update({
+                        'url': streamer,
+                        'play_path': src,
+                    })
+                continue
+
+            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
+
+            if proto == 'm3u8' or src_ext == 'm3u8':
+                m3u8_formats = self._extract_m3u8_formats(
+                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
+                if m3u8_formats:
+                    formats.extend(m3u8_formats)
+                continue
+
+            if src_ext == 'f4m':
+                f4m_url = src_url
+                if not f4m_params:
+                    f4m_params = {
+                        'hdcore': '3.2.0',
+                        'plugin': 'flowplayer-3.2.0.1',
+                    }
+                f4m_url += '&' if '?' in f4m_url else '?'
+                f4m_url += compat_urllib_parse.urlencode(f4m_params)
+                f4m_formats = self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)
+                if f4m_formats:
+                    formats.extend(f4m_formats)
+                continue
+
+            if src_url.startswith('http') and self._is_valid_url(src, video_id):
+                http_count += 1
+                formats.append({
+                    'url': src_url,
+                    'ext': ext or src_ext or 'flv',
+                    'format_id': 'http-%d' % (bitrate or http_count),
+                    'tbr': bitrate,
+                    'filesize': filesize,
                     'width': width,
                     'height': height,
                 })
+                continue
+
         self._sort_formats(formats)
 
         return formats
 
+    def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
+        subtitles = {}
+        for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
+            src = textstream.get('src')
+            if not src:
+                continue
+            ext = textstream.get('ext') or determine_ext(src)
+            if not ext:
+                type_ = textstream.get('type')
+                SUBTITLES_TYPES = {
+                    'text/vtt': 'vtt',
+                    'text/srt': 'srt',
+                    'application/smptett+xml': 'tt',
+                }
+                if type_ in SUBTITLES_TYPES:
+                    ext = SUBTITLES_TYPES[type_]
+            lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
+            subtitles.setdefault(lang, []).append({
+                'url': src,
+                'ext': ext,
+            })
+        return subtitles
+
+    def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
+        xspf = self._download_xml(
+            playlist_url, playlist_id, 'Downloading xpsf playlist',
+            'Unable to download xspf manifest', fatal=fatal)
+        if xspf is False:
+            return []
+        return self._parse_xspf(xspf, playlist_id)
+
+    def _parse_xspf(self, playlist, playlist_id):
+        NS_MAP = {
+            'xspf': 'http://xspf.org/ns/0/',
+            's1': 'http://static.streamone.nl/player/ns/0',
+        }
+
+        entries = []
+        for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
+            title = xpath_text(
+                track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
+            description = xpath_text(
+                track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
+            thumbnail = xpath_text(
+                track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
+            duration = float_or_none(
+                xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
+
+            formats = [{
+                'url': location.text,
+                'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
+                'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
+                'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
+            } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
+            self._sort_formats(formats)
+
+            entries.append({
+                'id': playlist_id,
+                'title': title,
+                'description': description,
+                'thumbnail': thumbnail,
+                'duration': duration,
+                'formats': formats,
+            })
+        return entries
+
     def _live_title(self, name):
         """ Generate the title for a live video """
         now = datetime.datetime.now()
@@ -941,6 +1278,12 @@ class InfoExtractor(object):
             None, '/', True, False, expire_time, '', None, None, None)
         self._downloader.cookiejar.set_cookie(cookie)
 
+    def _get_cookies(self, url):
+        """ Return a compat_cookies.SimpleCookie with the cookies for the url """
+        req = compat_urllib_request.Request(url)
+        self._downloader.cookiejar.add_cookie_header(req)
+        return compat_cookies.SimpleCookie(req.get_header('Cookie'))
+
     def get_testcases(self, include_onlymatching=False):
         t = getattr(self, '_TEST', None)
         if t:
@@ -970,11 +1313,46 @@ class InfoExtractor(object):
             any_restricted = any_restricted or is_restricted
         return not any_restricted
 
+    def extract_subtitles(self, *args, **kwargs):
+        if (self._downloader.params.get('writesubtitles', False) or
+                self._downloader.params.get('listsubtitles')):
+            return self._get_subtitles(*args, **kwargs)
+        return {}
+
+    def _get_subtitles(self, *args, **kwargs):
+        raise NotImplementedError("This method must be implemented by subclasses")
+
+    @staticmethod
+    def _merge_subtitle_items(subtitle_list1, subtitle_list2):
+        """ Merge subtitle items for one language. Items with duplicated URLs
+        will be dropped. """
+        list1_urls = set([item['url'] for item in subtitle_list1])
+        ret = list(subtitle_list1)
+        ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
+        return ret
+
+    @classmethod
+    def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
+        """ Merge two subtitle dictionaries, language by language. """
+        ret = dict(subtitle_dict1)
+        for lang in subtitle_dict2:
+            ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
+        return ret
+
+    def extract_automatic_captions(self, *args, **kwargs):
+        if (self._downloader.params.get('writeautomaticsub', False) or
+                self._downloader.params.get('listsubtitles')):
+            return self._get_automatic_captions(*args, **kwargs)
+        return {}
+
+    def _get_automatic_captions(self, *args, **kwargs):
+        raise NotImplementedError("This method must be implemented by subclasses")
+
 
 class SearchInfoExtractor(InfoExtractor):
     """
     Base class for paged search queries extractors.
-    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
+    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
     Instances should define _SEARCH_KEY and _MAX_RESULTS.
     """