Initiate new release.

[youtubedl] / youtube_dl / extractor / common.py
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index 7977fa8d00faa01e95665e347fda4c492ab91ec0..b9014fc23e53eaf335d65ee56c3db560d218d642 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -22,17 +22,20 @@ from ..compat import (
      compat_str,
  )
  from ..utils import (
+    NO_DEFAULT,
      age_restricted,
+    bug_reports_message,
      clean_html,
      compiled_regex_type,
+    determine_ext,
      ExtractorError,
+    fix_xml_ampersands,
      float_or_none,
      int_or_none,
      RegexNotFoundError,
      sanitize_filename,
      unescapeHTML,
  )
-_NO_DEFAULT = object()
  
  
  class InfoExtractor(object):
@@ -46,7 +49,7 @@ class InfoExtractor(object):
      information possibly downloading the video to the file system, among
      other possible outcomes.
  
-    The type field determines the the type of the result.
+    The type field determines the type of the result.
      By far the most common value (and the default if _type is missing) is
      "video", which indicates a single video.
  
@@ -110,11 +113,8 @@ class InfoExtractor(object):
                                    (quality takes higher priority)
                                   -1 for default (order by other properties),
                                   -2 or smaller for less than default.
-                    * http_method  HTTP method to use for the download.
                      * http_headers  A dictionary of additional HTTP headers
                                   to add to the request.
-                    * http_post_data  Additional data to send with a POST
-                                 request.
                      * stretched_ratio  If given and not 1, indicates that the
                                   video's pixels are not square.
                                   width : height ratio as float.
@@ -324,7 +324,7 @@ class InfoExtractor(object):
                  self._downloader.report_warning(errmsg)
                  return False
  
-    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
+    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None):
          """ Returns a tuple (page content as string, URL handle) """
          # Strip hashes from the URL (#1038)
          if isinstance(url_or_request, (compat_str, str)):
@@ -334,14 +334,11 @@ class InfoExtractor(object):
          if urlh is False:
              assert not fatal
              return False
-        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
+        content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
          return (content, urlh)
  
-    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None):
-        content_type = urlh.headers.get('Content-Type', '')
-        webpage_bytes = urlh.read()
-        if prefix is not None:
-            webpage_bytes = prefix + webpage_bytes
+    @staticmethod
+    def _guess_encoding_from_content(content_type, webpage_bytes):
          m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
          if m:
              encoding = m.group(1)
@@ -354,6 +351,16 @@ class InfoExtractor(object):
                  encoding = 'utf-16'
              else:
                  encoding = 'utf-8'
+
+        return encoding
+
+    def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
+        content_type = urlh.headers.get('Content-Type', '')
+        webpage_bytes = urlh.read()
+        if prefix is not None:
+            webpage_bytes = prefix + webpage_bytes
+        if not encoding:
+            encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
          if self._downloader.params.get('dump_intermediate_pages', False):
              try:
                  url = url_or_request.get_full_url()
@@ -410,13 +417,13 @@ class InfoExtractor(object):
  
          return content
  
-    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5):
+    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
          """ Returns the data of the page as a string """
          success = False
          try_count = 0
          while success is False:
              try:
-                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
+                res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding)
                  success = True
              except compat_http_client.IncompleteRead as e:
                  try_count += 1
@@ -431,10 +438,10 @@ class InfoExtractor(object):
  
      def _download_xml(self, url_or_request, video_id,
                        note='Downloading XML', errnote='Unable to download XML',
-                      transform_source=None, fatal=True):
+                      transform_source=None, fatal=True, encoding=None):
          """Return the xml as an xml.etree.ElementTree.Element"""
          xml_string = self._download_webpage(
-            url_or_request, video_id, note, errnote, fatal=fatal)
+            url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding)
          if xml_string is False:
              return xml_string
          if transform_source:
@@ -445,9 +452,10 @@ class InfoExtractor(object):
                         note='Downloading JSON metadata',
                         errnote='Unable to download JSON metadata',
                         transform_source=None,
-                       fatal=True):
+                       fatal=True, encoding=None):
          json_string = self._download_webpage(
-            url_or_request, video_id, note, errnote, fatal=fatal)
+            url_or_request, video_id, note, errnote, fatal=fatal,
+            encoding=encoding)
          if (not fatal) and json_string is False:
              return None
          return self._parse_json(
@@ -492,7 +500,7 @@ class InfoExtractor(object):
  
      # Methods for following #608
      @staticmethod
-    def url_result(url, ie=None, video_id=None):
+    def url_result(url, ie=None, video_id=None, video_title=None):
          """Returns a url that points to a page that should be processed"""
          # TODO: ie should be the class used for getting the info
          video_info = {'_type': 'url',
@@ -500,6 +508,8 @@ class InfoExtractor(object):
                        'ie_key': ie}
          if video_id is not None:
              video_info['id'] = video_id
+        if video_title is not None:
+            video_info['title'] = video_title
          return video_info
  
      @staticmethod
@@ -515,7 +525,7 @@ class InfoExtractor(object):
              video_info['description'] = playlist_description
          return video_info
  
-    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
+    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
          """
          Perform a regex search on the given string, using a single or a list of
          patterns returning the first matching group.
@@ -541,16 +551,15 @@ class InfoExtractor(object):
                  return next(g for g in mobj.groups() if g is not None)
              else:
                  return mobj.group(group)
-        elif default is not _NO_DEFAULT:
+        elif default is not NO_DEFAULT:
              return default
          elif fatal:
              raise RegexNotFoundError('Unable to extract %s' % _name)
          else:
-            self._downloader.report_warning('unable to extract %s; '
-                                            'please report this issue on http://yt-dl.org/bug' % _name)
+            self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
              return None
  
-    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
+    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
          """
          Like _search_regex, but strips HTML tags and unescapes entities.
          """
@@ -562,7 +571,7 @@ class InfoExtractor(object):
  
      def _get_login_info(self):
          """
-        Get the the login info as (username, password)
+        Get the login info as (username, password)
          It will look in the netrc file using the _NETRC_MACHINE value
          If there's no info available, return (None, None)
          """
@@ -698,7 +707,26 @@ class InfoExtractor(object):
          return self._html_search_meta('twitter:player', html,
                                        'twitter card player')
  
-    def _sort_formats(self, formats):
+    @staticmethod
+    def _hidden_inputs(html):
+        return dict([
+            (input.group('name'), input.group('value')) for input in re.finditer(
+                r'''(?x)
+                    <input\s+
+                        type=(?P<q_hidden>["\'])hidden(?P=q_hidden)\s+
+                        name=(?P<q_name>["\'])(?P<name>.+?)(?P=q_name)\s+
+                        (?:id=(?P<q_id>["\']).+?(?P=q_id)\s+)?
+                        value=(?P<q_value>["\'])(?P<value>.*?)(?P=q_value)
+                ''', html)
+        ])
+
+    def _form_hidden_inputs(self, form_id, html):
+        form = self._search_regex(
+            r'(?s)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
+            html, '%s form' % form_id, group='form')
+        return self._hidden_inputs(form)
+
+    def _sort_formats(self, formats, field_preference=None):
          if not formats:
              raise ExtractorError('No video formats found')
  
@@ -708,6 +736,9 @@ class InfoExtractor(object):
              if not f.get('ext') and 'url' in f:
                  f['ext'] = determine_ext(f['url'])
  
+            if isinstance(field_preference, (list, tuple)):
+                return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
+
              preference = f.get('preference')
              if preference is None:
                  proto = f.get('protocol')
@@ -754,7 +785,7 @@ class InfoExtractor(object):
                  f.get('fps') if f.get('fps') is not None else -1,
                  f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                  f.get('source_preference') if f.get('source_preference') is not None else -1,
-                f.get('format_id'),
+                f.get('format_id') if f.get('format_id') is not None else '',
              )
          formats.sort(key=_formats_key)
  
@@ -767,13 +798,17 @@ class InfoExtractor(object):
                  formats)
  
      def _is_valid_url(self, url, video_id, item='video'):
+        url = self._proto_relative_url(url, scheme='http:')
+        # For now, assume that non-HTTP(S) URLs are always valid
+        if not (url.startswith('http://') or url.startswith('https://')):
+            return True
          try:
              self._request_webpage(url, video_id, 'Checking %s URL' % item)
              return True
          except ExtractorError as e:
              if isinstance(e.cause, compat_HTTPError):
-                self.report_warning(
-                    '%s URL is invalid, skipping' % item, video_id)
+                self.to_screen(
+                    '%s: %s URL is invalid, skipping' % (video_id, item))
                  return False
              raise
  
@@ -801,10 +836,14 @@ class InfoExtractor(object):
          self.to_screen(msg)
          time.sleep(timeout)
  
-    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None):
+    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
+                             transform_source=lambda s: fix_xml_ampersands(s).strip()):
          manifest = self._download_xml(
              manifest_url, video_id, 'Downloading f4m manifest',
-            'Unable to download f4m manifest')
+            'Unable to download f4m manifest',
+            # Some manifests may be malformed, e.g. those generated by prosiebensat1
+            # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
+            transform_source=transform_source)
  
          formats = []
          manifest_version = '1.0'
@@ -814,11 +853,22 @@ class InfoExtractor(object):
              media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
          for i, media_el in enumerate(media_nodes):
              if manifest_version == '2.0':
-                manifest_url = ('/'.join(manifest_url.split('/')[:-1]) + '/' +
-                                (media_el.attrib.get('href') or media_el.attrib.get('url')))
+                media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
+                if not media_url:
+                    continue
+                manifest_url = (
+                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
+                    else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url))
+                # If media_url is itself an f4m manifest, extract it recursively:
+                # bitrates in the parent manifest (this one) and in the media_url
+                # manifest may differ, which would make the f4m downloader unable
+                # to resolve the format by the requested bitrate.
+                if determine_ext(manifest_url) == 'f4m':
+                    formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id))
+                    continue
              tbr = int_or_none(media_el.attrib.get('bitrate'))
              formats.append({
-                'format_id': '-'.join(filter(None, [f4m_id, 'f4m-%d' % (i if tbr is None else tbr)])),
+                'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
                  'url': manifest_url,
                  'ext': 'flv',
                  'tbr': tbr,
@@ -832,10 +882,11 @@ class InfoExtractor(object):
  
      def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                                entry_protocol='m3u8', preference=None,
-                              m3u8_id=None):
+                              m3u8_id=None, note=None, errnote=None,
+                              fatal=True):
  
          formats = [{
-            'format_id': '-'.join(filter(None, [m3u8_id, 'm3u8-meta'])),
+            'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
              'url': m3u8_url,
              'ext': ext,
              'protocol': 'm3u8',
@@ -851,8 +902,11 @@ class InfoExtractor(object):
  
          m3u8_doc = self._download_webpage(
              m3u8_url, video_id,
-            note='Downloading m3u8 information',
-            errnote='Failed to download m3u8 information')
+            note=note or 'Downloading m3u8 information',
+            errnote=errnote or 'Failed to download m3u8 information',
+            fatal=fatal)
+        if m3u8_doc is False:
+            return m3u8_doc
          last_info = None
          last_media = None
          kv_rex = re.compile(
@@ -879,8 +933,13 @@ class InfoExtractor(object):
                      formats.append({'url': format_url(line)})
                      continue
                  tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
+                format_id = []
+                if m3u8_id:
+                    format_id.append(m3u8_id)
+                last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
+                format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
                  f = {
-                    'format_id': '-'.join(filter(None, [m3u8_id, 'm3u8-%d' % (tbr if tbr else len(formats))])),
+                    'format_id': '-'.join(format_id),
                      'url': format_url(line.strip()),
                      'tbr': tbr,
                      'ext': ext,
@@ -937,7 +996,7 @@ class InfoExtractor(object):
      def _parse_smil_video(self, video, video_id, base, rtmp_count):
          src = video.get('src')
          if not src:
-            return ([], rtmp_count)
+            return [], rtmp_count
          bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
          width = int_or_none(video.get('width'))
          height = int_or_none(video.get('height'))
@@ -950,7 +1009,7 @@ class InfoExtractor(object):
                      proto = 'http'
          ext = video.get('ext')
          if proto == 'm3u8':
-            return (self._extract_m3u8_formats(src, video_id, ext), rtmp_count)
+            return self._extract_m3u8_formats(src, video_id, ext), rtmp_count
          elif proto == 'rtmp':
              rtmp_count += 1
              streamer = video.get('streamer') or base