Update changelog.

[youtubedl] / youtube_dl / extractor / common.py
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index 5e263f8b5a2cf46fbb26e928f5df85c87c42dfde..14f57563529ad5782b3c70c24d4760c78c3b334d 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -10,6 +10,7 @@ import re
  import socket
  import sys
  import time
  import socket
  import sys
  import time
+import math
  
  from ..compat import (
      compat_cookiejar,
  
  from ..compat import (
      compat_cookiejar,
@@ -18,8 +19,6 @@ from ..compat import (
      compat_http_client,
      compat_urllib_error,
      compat_urllib_parse,
      compat_http_client,
      compat_urllib_error,
      compat_urllib_parse,
-    compat_urllib_parse_urlparse,
-    compat_urllib_request,
      compat_urlparse,
      compat_str,
      compat_etree_fromstring,
      compat_urlparse,
      compat_str,
      compat_etree_fromstring,
@@ -31,17 +30,23 @@ from ..utils import (
      clean_html,
      compiled_regex_type,
      determine_ext,
      clean_html,
      compiled_regex_type,
      determine_ext,
+    error_to_compat_str,
      ExtractorError,
      fix_xml_ampersands,
      float_or_none,
      int_or_none,
      ExtractorError,
      fix_xml_ampersands,
      float_or_none,
      int_or_none,
+    parse_iso8601,
      RegexNotFoundError,
      sanitize_filename,
      RegexNotFoundError,
      sanitize_filename,
+    sanitized_Request,
      unescapeHTML,
      unified_strdate,
      url_basename,
      xpath_text,
      xpath_with_ns,
      unescapeHTML,
      unified_strdate,
      url_basename,
      xpath_text,
      xpath_with_ns,
+    determine_protocol,
+    parse_duration,
+    mimetype2ext,
  )
  
  
  )
  
  
@@ -107,8 +112,9 @@ class InfoExtractor(object):
                                   -2 or smaller for less than default.
                                   < -1000 to hide the format (if there is
                                      another one which is strictly better)
                                   -2 or smaller for less than default.
                                   < -1000 to hide the format (if there is
                                      another one which is strictly better)
-                    * language_preference  Is this in the correct requested
-                                 language?
+                    * language   Language code, e.g. "de" or "en-US".
+                    * language_preference  Is this in the language mentioned in
+                                 the URL?
                                   10 if it's what the URL is about,
                                   -1 for default (don't know),
                                   -10 otherwise, other values reserved for now.
                                   10 if it's what the URL is about,
                                   -1 for default (don't know),
                                   -10 otherwise, other values reserved for now.
@@ -167,7 +173,7 @@ class InfoExtractor(object):
                      "ext" will be calculated from URL if missing
      automatic_captions: Like 'subtitles', used by the YoutubeIE for
                      automatically generated captions
                      "ext" will be calculated from URL if missing
      automatic_captions: Like 'subtitles', used by the YoutubeIE for
                      automatically generated captions
-    duration:       Length of the video in seconds, as an integer.
+    duration:       Length of the video in seconds, as an integer or float.
      view_count:     How many users have watched the video on the platform.
      like_count:     Number of positive ratings of the video
      dislike_count:  Number of negative ratings of the video
      view_count:     How many users have watched the video on the platform.
      like_count:     Number of positive ratings of the video
      dislike_count:  Number of negative ratings of the video
@@ -199,6 +205,26 @@ class InfoExtractor(object):
      end_time:       Time in seconds where the reproduction should end, as
                      specified in the URL.
  
      end_time:       Time in seconds where the reproduction should end, as
                      specified in the URL.
  
+    The following fields should only be used when the video belongs to some logical
+    chapter or section:
+
+    chapter:        Name or title of the chapter the video belongs to.
+    chapter_number: Number of the chapter the video belongs to, as an integer.
+    chapter_id:     Id of the chapter the video belongs to, as a unicode string.
+
+    The following fields should only be used when the video is an episode of some
+    series or programme:
+
+    series:         Title of the series or programme the video episode belongs to.
+    season:         Title of the season the video episode belongs to.
+    season_number:  Number of the season the video episode belongs to, as an integer.
+    season_id:      Id of the season the video episode belongs to, as a unicode string.
+    episode:        Title of the video episode. Unlike the mandatory video title field,
+                    this field should denote the exact title of the video episode
+                    without any kind of decoration.
+    episode_number: Number of the video episode within a season, as an integer.
+    episode_id:     Id of the video episode, as a unicode string.
+
      Unless mentioned otherwise, the fields should be Unicode strings.
  
      Unless mentioned otherwise, None is equivalent to absence of information.
      Unless mentioned otherwise, the fields should be Unicode strings.
  
      Unless mentioned otherwise, None is equivalent to absence of information.
@@ -291,9 +317,9 @@ class InfoExtractor(object):
          except ExtractorError:
              raise
          except compat_http_client.IncompleteRead as e:
          except ExtractorError:
              raise
          except compat_http_client.IncompleteRead as e:
-            raise ExtractorError('A network error has occured.', cause=e, expected=True)
+            raise ExtractorError('A network error has occurred.', cause=e, expected=True)
          except (KeyError, StopIteration) as e:
          except (KeyError, StopIteration) as e:
-            raise ExtractorError('An extractor error has occured.', cause=e)
+            raise ExtractorError('An extractor error has occurred.', cause=e)
  
      def set_downloader(self, downloader):
          """Sets the downloader for this IE."""
  
      def set_downloader(self, downloader):
          """Sets the downloader for this IE."""
@@ -332,7 +358,8 @@ class InfoExtractor(object):
                  return False
              if errnote is None:
                  errnote = 'Unable to download webpage'
                  return False
              if errnote is None:
                  errnote = 'Unable to download webpage'
-            errmsg = '%s: %s' % (errnote, compat_str(err))
+
+            errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
              if fatal:
                  raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
              else:
              if fatal:
                  raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
              else:
@@ -610,7 +637,7 @@ class InfoExtractor(object):
          downloader_params = self._downloader.params
  
          # Attempt to use provided username and password or .netrc data
          downloader_params = self._downloader.params
  
          # Attempt to use provided username and password or .netrc data
-        if downloader_params.get('username', None) is not None:
+        if downloader_params.get('username') is not None:
              username = downloader_params['username']
              password = downloader_params['password']
          elif downloader_params.get('usenetrc', False):
              username = downloader_params['username']
              password = downloader_params['password']
          elif downloader_params.get('usenetrc', False):
@@ -622,7 +649,7 @@ class InfoExtractor(object):
                  else:
                      raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
              except (IOError, netrc.NetrcParseError) as err:
                  else:
                      raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
              except (IOError, netrc.NetrcParseError) as err:
-                self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
+                self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
  
          return (username, password)
  
  
          return (username, password)
  
@@ -637,7 +664,7 @@ class InfoExtractor(object):
              return None
          downloader_params = self._downloader.params
  
              return None
          downloader_params = self._downloader.params
  
-        if downloader_params.get('twofactor', None) is not None:
+        if downloader_params.get('twofactor') is not None:
              return downloader_params['twofactor']
  
          return compat_getpass('Type %s and press [Return]: ' % note)
              return downloader_params['twofactor']
  
          return compat_getpass('Type %s and press [Return]: ' % note)
@@ -718,7 +745,7 @@ class InfoExtractor(object):
              'mature': 17,
              'restricted': 19,
          }
              'mature': 17,
              'restricted': 19,
          }
-        return RATING_TABLE.get(rating.lower(), None)
+        return RATING_TABLE.get(rating.lower())
  
      def _family_friendly_search(self, html):
          # See http://schema.org/VideoObject
  
      def _family_friendly_search(self, html):
          # See http://schema.org/VideoObject
@@ -733,12 +760,48 @@ class InfoExtractor(object):
              '0': 18,
              'false': 18,
          }
              '0': 18,
              'false': 18,
          }
-        return RATING_TABLE.get(family_friendly.lower(), None)
+        return RATING_TABLE.get(family_friendly.lower())
  
      def _twitter_search_player(self, html):
          return self._html_search_meta('twitter:player', html,
                                        'twitter card player')
  
  
      def _twitter_search_player(self, html):
          return self._html_search_meta('twitter:player', html,
                                        'twitter card player')
  
+    def _search_json_ld(self, html, video_id, **kwargs):
+        json_ld = self._search_regex(
+            r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
+            html, 'JSON-LD', group='json_ld', **kwargs)
+        if not json_ld:
+            return {}
+        return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
+
+    def _json_ld(self, json_ld, video_id, fatal=True):
+        if isinstance(json_ld, compat_str):
+            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
+        if not json_ld:
+            return {}
+        info = {}
+        if json_ld.get('@context') == 'http://schema.org':
+            item_type = json_ld.get('@type')
+            if item_type == 'TVEpisode':
+                info.update({
+                    'episode': unescapeHTML(json_ld.get('name')),
+                    'episode_number': int_or_none(json_ld.get('episodeNumber')),
+                    'description': unescapeHTML(json_ld.get('description')),
+                })
+                part_of_season = json_ld.get('partOfSeason')
+                if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
+                    info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
+                part_of_series = json_ld.get('partOfSeries')
+                if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
+                    info['series'] = unescapeHTML(part_of_series.get('name'))
+            elif item_type == 'Article':
+                info.update({
+                    'timestamp': parse_iso8601(json_ld.get('datePublished')),
+                    'title': unescapeHTML(json_ld.get('headline')),
+                    'description': unescapeHTML(json_ld.get('articleBody')),
+                })
+        return dict((k, v) for k, v in info.items() if v is not None)
+
      @staticmethod
      def _hidden_inputs(html):
          html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
      @staticmethod
      def _hidden_inputs(html):
          html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
@@ -765,6 +828,12 @@ class InfoExtractor(object):
          if not formats:
              raise ExtractorError('No video formats found')
  
          if not formats:
              raise ExtractorError('No video formats found')
  
+        for f in formats:
+            # Automatically determine tbr from abr and vbr when it is missing
+            # (improves format sorting in some cases)
+            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
+                f['tbr'] = f['abr'] + f['vbr']
+
          def _formats_key(f):
              # TODO remove the following workaround
              from ..utils import determine_ext
          def _formats_key(f):
              # TODO remove the following workaround
              from ..utils import determine_ext
@@ -776,14 +845,12 @@ class InfoExtractor(object):
  
              preference = f.get('preference')
              if preference is None:
  
              preference = f.get('preference')
              if preference is None:
-                proto = f.get('protocol')
-                if proto is None:
-                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
-
-                preference = 0 if proto in ['http', 'https'] else -0.1
+                preference = 0
                  if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                      preference -= 0.5
  
                  if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                      preference -= 0.5
  
+            proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
+
              if f.get('vcodec') == 'none':  # audio only
                  if self._downloader.params.get('prefer_free_formats'):
                      ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
              if f.get('vcodec') == 'none':  # audio only
                  if self._downloader.params.get('prefer_free_formats'):
                      ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
@@ -814,6 +881,7 @@ class InfoExtractor(object):
                  f.get('vbr') if f.get('vbr') is not None else -1,
                  f.get('height') if f.get('height') is not None else -1,
                  f.get('width') if f.get('width') is not None else -1,
                  f.get('vbr') if f.get('vbr') is not None else -1,
                  f.get('height') if f.get('height') is not None else -1,
                  f.get('width') if f.get('width') is not None else -1,
+                proto_preference,
                  ext_preference,
                  f.get('abr') if f.get('abr') is not None else -1,
                  audio_ext_preference,
                  ext_preference,
                  f.get('abr') if f.get('abr') is not None else -1,
                  audio_ext_preference,
@@ -832,6 +900,16 @@ class InfoExtractor(object):
                      item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
                  formats)
  
                      item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
                  formats)
  
+    @staticmethod
+    def _remove_duplicate_formats(formats):
+        format_urls = set()
+        unique_formats = []
+        for f in formats:
+            if f['url'] not in format_urls:
+                format_urls.add(f['url'])
+                unique_formats.append(f)
+        formats[:] = unique_formats
+
      def _is_valid_url(self, url, video_id, item='video'):
          url = self._proto_relative_url(url, scheme='http:')
          # For now assume non HTTP(S) URLs always valid
      def _is_valid_url(self, url, video_id, item='video'):
          url = self._proto_relative_url(url, scheme='http:')
          # For now assume non HTTP(S) URLs always valid
@@ -883,7 +961,7 @@ class InfoExtractor(object):
              fatal=fatal)
  
          if manifest is False:
              fatal=fatal)
  
          if manifest is False:
-            return manifest
+            return []
  
          formats = []
          manifest_version = '1.0'
  
          formats = []
          manifest_version = '1.0'
@@ -891,6 +969,11 @@ class InfoExtractor(object):
          if not media_nodes:
              manifest_version = '2.0'
              media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
          if not media_nodes:
              manifest_version = '2.0'
              media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
+        base_url = xpath_text(
+            manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
+            'base URL', default=None)
+        if base_url:
+            base_url = base_url.strip()
          for i, media_el in enumerate(media_nodes):
              if manifest_version == '2.0':
                  media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
          for i, media_el in enumerate(media_nodes):
              if manifest_version == '2.0':
                  media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
@@ -898,16 +981,14 @@ class InfoExtractor(object):
                      continue
                  manifest_url = (
                      media_url if media_url.startswith('http://') or media_url.startswith('https://')
                      continue
                  manifest_url = (
                      media_url if media_url.startswith('http://') or media_url.startswith('https://')
-                    else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url))
+                    else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                  # If media_url is itself a f4m manifest do the recursive extraction
                  # since bitrates in parent manifest (this one) and media_url manifest
                  # may differ leading to inability to resolve the format by requested
                  # bitrate in f4m downloader
                  if determine_ext(manifest_url) == 'f4m':
                  # If media_url is itself a f4m manifest do the recursive extraction
                  # since bitrates in parent manifest (this one) and media_url manifest
                  # may differ leading to inability to resolve the format by requested
                  # bitrate in f4m downloader
                  if determine_ext(manifest_url) == 'f4m':
-                    f4m_formats = self._extract_f4m_formats(
-                        manifest_url, video_id, preference, f4m_id, fatal=fatal)
-                    if f4m_formats:
-                        formats.extend(f4m_formats)
+                    formats.extend(self._extract_f4m_formats(
+                        manifest_url, video_id, preference, f4m_id, fatal=fatal))
                      continue
              tbr = int_or_none(media_el.attrib.get('bitrate'))
              formats.append({
                      continue
              tbr = int_or_none(media_el.attrib.get('bitrate'))
              formats.append({
@@ -949,9 +1030,21 @@ class InfoExtractor(object):
              errnote=errnote or 'Failed to download m3u8 information',
              fatal=fatal)
          if res is False:
              errnote=errnote or 'Failed to download m3u8 information',
              fatal=fatal)
          if res is False:
-            return res
+            return []
          m3u8_doc, urlh = res
          m3u8_url = urlh.geturl()
          m3u8_doc, urlh = res
          m3u8_url = urlh.geturl()
+        # A Media Playlist Tag MUST NOT appear in a Master Playlist
+        # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
+        # The EXT-X-TARGETDURATION tag is REQUIRED for every M3U8 Media Playlist
+        # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
+        if '#EXT-X-TARGETDURATION' in m3u8_doc:
+            return [{
+                'url': m3u8_url,
+                'format_id': m3u8_id,
+                'ext': ext,
+                'protocol': entry_protocol,
+                'preference': preference,
+            }]
          last_info = None
          last_media = None
          kv_rex = re.compile(
          last_info = None
          last_media = None
          kv_rex = re.compile(
@@ -996,9 +1089,9 @@ class InfoExtractor(object):
                      # TODO: looks like video codec is not always necessarily goes first
                      va_codecs = codecs.split(',')
                      if va_codecs[0]:
                      # TODO: looks like video codec is not always necessarily goes first
                      va_codecs = codecs.split(',')
                      if va_codecs[0]:
-                        f['vcodec'] = va_codecs[0].partition('.')[0]
+                        f['vcodec'] = va_codecs[0]
                      if len(va_codecs) > 1 and va_codecs[1]:
                      if len(va_codecs) > 1 and va_codecs[1]:
-                        f['acodec'] = va_codecs[1].partition('.')[0]
+                        f['acodec'] = va_codecs[1]
                  resolution = last_info.get('RESOLUTION')
                  if resolution:
                      width_str, height_str = resolution.split('x')
                  resolution = last_info.get('RESOLUTION')
                  if resolution:
                      width_str, height_str = resolution.split('x')
@@ -1102,12 +1195,15 @@ class InfoExtractor(object):
          formats = []
          rtmp_count = 0
          http_count = 0
          formats = []
          rtmp_count = 0
          http_count = 0
+        m3u8_count = 0
  
  
+        srcs = []
          videos = smil.findall(self._xpath_ns('.//video', namespace))
          for video in videos:
              src = video.get('src')
          videos = smil.findall(self._xpath_ns('.//video', namespace))
          for video in videos:
              src = video.get('src')
-            if not src:
+            if not src or src in srcs:
                  continue
                  continue
+            srcs.append(src)
  
              bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
              filesize = int_or_none(video.get('size') or video.get('fileSize'))
  
              bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
              filesize = int_or_none(video.get('size') or video.get('fileSize'))
@@ -1139,12 +1235,20 @@ class InfoExtractor(object):
                  continue
  
              src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
                  continue
  
              src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
+            src_url = src_url.strip()
  
              if proto == 'm3u8' or src_ext == 'm3u8':
                  m3u8_formats = self._extract_m3u8_formats(
                      src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
  
              if proto == 'm3u8' or src_ext == 'm3u8':
                  m3u8_formats = self._extract_m3u8_formats(
                      src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
-                if m3u8_formats:
-                    formats.extend(m3u8_formats)
+                if len(m3u8_formats) == 1:
+                    m3u8_count += 1
+                    m3u8_formats[0].update({
+                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
+                        'tbr': bitrate,
+                        'width': width,
+                        'height': height,
+                    })
+                formats.extend(m3u8_formats)
                  continue
  
              if src_ext == 'f4m':
                  continue
  
              if src_ext == 'f4m':
@@ -1156,9 +1260,7 @@ class InfoExtractor(object):
                      }
                  f4m_url += '&' if '?' in f4m_url else '?'
                  f4m_url += compat_urllib_parse.urlencode(f4m_params)
                      }
                  f4m_url += '&' if '?' in f4m_url else '?'
                  f4m_url += compat_urllib_parse.urlencode(f4m_params)
-                f4m_formats = self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)
-                if f4m_formats:
-                    formats.extend(f4m_formats)
+                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
                  continue
  
              if src_url.startswith('http') and self._is_valid_url(src, video_id):
                  continue
  
              if src_url.startswith('http') and self._is_valid_url(src, video_id):
@@ -1179,21 +1281,14 @@ class InfoExtractor(object):
          return formats
  
      def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
          return formats
  
      def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
+        urls = []
          subtitles = {}
          for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
              src = textstream.get('src')
          subtitles = {}
          for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
              src = textstream.get('src')
-            if not src:
+            if not src or src in urls:
                  continue
                  continue
-            ext = textstream.get('ext') or determine_ext(src)
-            if not ext:
-                type_ = textstream.get('type')
-                SUBTITLES_TYPES = {
-                    'text/vtt': 'vtt',
-                    'text/srt': 'srt',
-                    'application/smptett+xml': 'tt',
-                }
-                if type_ in SUBTITLES_TYPES:
-                    ext = SUBTITLES_TYPES[type_]
+            urls.append(src)
+            ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type'))
              lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
              subtitles.setdefault(lang, []).append({
                  'url': src,
              lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
              subtitles.setdefault(lang, []).append({
                  'url': src,
@@ -1244,10 +1339,167 @@ class InfoExtractor(object):
              })
          return entries
  
              })
          return entries
  
+    def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
+        res = self._download_webpage_handle(
+            mpd_url, video_id,
+            note=note or 'Downloading MPD manifest',
+            errnote=errnote or 'Failed to download MPD manifest',
+            fatal=fatal)
+        if res is False:
+            return []
+        mpd, urlh = res
+        mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
+
+        return self._parse_mpd_formats(
+            compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
+
+    def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
+        if mpd_doc.get('type') == 'dynamic':
+            return []
+
+        namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
+
+        def _add_ns(path):
+            return self._xpath_ns(path, namespace)
+
+        def is_drm_protected(element):
+            return element.find(_add_ns('ContentProtection')) is not None
+
+        def extract_multisegment_info(element, ms_parent_info):
+            ms_info = ms_parent_info.copy()
+            segment_list = element.find(_add_ns('SegmentList'))
+            if segment_list is not None:
+                segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
+                if segment_urls_e:
+                    ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
+                initialization = segment_list.find(_add_ns('Initialization'))
+                if initialization is not None:
+                    ms_info['initialization_url'] = initialization.attrib['sourceURL']
+            else:
+                segment_template = element.find(_add_ns('SegmentTemplate'))
+                if segment_template is not None:
+                    start_number = segment_template.get('startNumber')
+                    if start_number:
+                        ms_info['start_number'] = int(start_number)
+                    segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
+                    if segment_timeline is not None:
+                        s_e = segment_timeline.findall(_add_ns('S'))
+                        if s_e:
+                            ms_info['total_number'] = 0
+                            for s in s_e:
+                                ms_info['total_number'] += 1 + int(s.get('r', '0'))
+                    else:
+                        timescale = segment_template.get('timescale')
+                        if timescale:
+                            ms_info['timescale'] = int(timescale)
+                        segment_duration = segment_template.get('duration')
+                        if segment_duration:
+                            ms_info['segment_duration'] = int(segment_duration)
+                    media_template = segment_template.get('media')
+                    if media_template:
+                        ms_info['media_template'] = media_template
+                    initialization = segment_template.get('initialization')
+                    if initialization:
+                        ms_info['initialization_url'] = initialization
+                    else:
+                        initialization = segment_template.find(_add_ns('Initialization'))
+                        if initialization is not None:
+                            ms_info['initialization_url'] = initialization.attrib['sourceURL']
+            return ms_info
+
+        mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
+        formats = []
+        for period in mpd_doc.findall(_add_ns('Period')):
+            period_duration = parse_duration(period.get('duration')) or mpd_duration
+            period_ms_info = extract_multisegment_info(period, {
+                'start_number': 1,
+                'timescale': 1,
+            })
+            for adaptation_set in period.findall(_add_ns('AdaptationSet')):
+                if is_drm_protected(adaptation_set):
+                    continue
+                adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
+                for representation in adaptation_set.findall(_add_ns('Representation')):
+                    if is_drm_protected(representation):
+                        continue
+                    representation_attrib = adaptation_set.attrib.copy()
+                    representation_attrib.update(representation.attrib)
+                    mime_type = representation_attrib.get('mimeType')
+                    content_type = mime_type.split('/')[0] if mime_type else representation_attrib.get('contentType')
+                    if content_type == 'text':
+                        # TODO implement WebVTT downloading
+                        pass
+                    elif content_type == 'video' or content_type == 'audio':
+                        base_url = ''
+                        for element in (representation, adaptation_set, period, mpd_doc):
+                            base_url_e = element.find(_add_ns('BaseURL'))
+                            if base_url_e is not None:
+                                base_url = base_url_e.text + base_url
+                                if re.match(r'^https?://', base_url):
+                                    break
+                        if mpd_base_url and not re.match(r'^https?://', base_url):
+                            if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
+                                mpd_base_url += '/'
+                            base_url = mpd_base_url + base_url
+                        representation_id = representation_attrib.get('id')
+                        lang = representation_attrib.get('lang')
+                        url_el = representation.find(_add_ns('BaseURL'))
+                        filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
+                        f = {
+                            'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
+                            'url': base_url,
+                            'width': int_or_none(representation_attrib.get('width')),
+                            'height': int_or_none(representation_attrib.get('height')),
+                            'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
+                            'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
+                            'fps': int_or_none(representation_attrib.get('frameRate')),
+                            'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
+                            'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
+                            'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
+                            'format_note': 'DASH %s' % content_type,
+                            'filesize': filesize,
+                        }
+                        representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
+                        if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
+                            if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
+                                segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
+                                representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
+                            media_template = representation_ms_info['media_template']
+                            media_template = media_template.replace('$RepresentationID$', representation_id)
+                            media_template = re.sub(r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template)
+                            media_template = media_template.replace('$$', '$')
+                            representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth')} for segment_number in range(representation_ms_info['start_number'], representation_ms_info['total_number'] + representation_ms_info['start_number'])]
+                        if 'segment_urls' in representation_ms_info:
+                            f.update({
+                                'segment_urls': representation_ms_info['segment_urls'],
+                                'protocol': 'http_dash_segments',
+                            })
+                            if 'initialization_url' in representation_ms_info:
+                                initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
+                                f.update({
+                                    'initialization_url': initialization_url,
+                                })
+                                if not f.get('url'):
+                                    f['url'] = initialization_url
+                        try:
+                            existing_format = next(
+                                fo for fo in formats
+                                if fo['format_id'] == representation_id)
+                        except StopIteration:
+                            full_info = formats_dict.get(representation_id, {}).copy()
+                            full_info.update(f)
+                            formats.append(full_info)
+                        else:
+                            existing_format.update(f)
+                    else:
+                        self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
+        self._sort_formats(formats)
+        return formats
+
      def _live_title(self, name):
          """ Generate the title for a live video """
          now = datetime.datetime.now()
      def _live_title(self, name):
          """ Generate the title for a live video """
          now = datetime.datetime.now()
-        now_str = now.strftime("%Y-%m-%d %H:%M")
+        now_str = now.strftime('%Y-%m-%d %H:%M')
          return name + ' ' + now_str
  
      def _int(self, v, name, fatal=False, **kwargs):
          return name + ' ' + now_str
  
      def _int(self, v, name, fatal=False, **kwargs):
@@ -1280,7 +1532,7 @@ class InfoExtractor(object):
  
      def _get_cookies(self, url):
          """ Return a compat_cookies.SimpleCookie with the cookies for the url """
  
      def _get_cookies(self, url):
          """ Return a compat_cookies.SimpleCookie with the cookies for the url """
-        req = compat_urllib_request.Request(url)
+        req = sanitized_Request(url)
          self._downloader.cookiejar.add_cookie_header(req)
          return compat_cookies.SimpleCookie(req.get_header('Cookie'))
  
          self._downloader.cookiejar.add_cookie_header(req)
          return compat_cookies.SimpleCookie(req.get_header('Cookie'))
  
@@ -1320,7 +1572,7 @@ class InfoExtractor(object):
          return {}
  
      def _get_subtitles(self, *args, **kwargs):
          return {}
  
      def _get_subtitles(self, *args, **kwargs):
-        raise NotImplementedError("This method must be implemented by subclasses")
+        raise NotImplementedError('This method must be implemented by subclasses')
  
      @staticmethod
      def _merge_subtitle_items(subtitle_list1, subtitle_list2):
  
      @staticmethod
      def _merge_subtitle_items(subtitle_list1, subtitle_list2):
@@ -1346,7 +1598,7 @@ class InfoExtractor(object):
          return {}
  
      def _get_automatic_captions(self, *args, **kwargs):
          return {}
  
      def _get_automatic_captions(self, *args, **kwargs):
-        raise NotImplementedError("This method must be implemented by subclasses")
+        raise NotImplementedError('This method must be implemented by subclasses')
  
  
  class SearchInfoExtractor(InfoExtractor):
  
  
  class SearchInfoExtractor(InfoExtractor):
@@ -1386,7 +1638,7 @@ class SearchInfoExtractor(InfoExtractor):
  
      def _get_n_results(self, query, n):
          """Get a specified number of results for a query"""
  
      def _get_n_results(self, query, n):
          """Get a specified number of results for a query"""
-        raise NotImplementedError("This method must be implemented by subclasses")
+        raise NotImplementedError('This method must be implemented by subclasses')
  
      @property
      def SEARCH_KEY(self):
  
      @property
      def SEARCH_KEY(self):