Initiate release.

[youtubedl] / youtube_dl / extractor / common.py
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index ce3d1690304c8a0b488c2f674c48bb4ef34c86b5..342bfb8b3b53bcb76951613002090be8737bbe29 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -1,10 +1,12 @@
  import base64
+import hashlib
  import json
+import netrc
  import os
  import re
  import socket
  import sys
-import netrc
+import time
  import xml.etree.ElementTree
  
  from ..utils import (
@@ -16,6 +18,7 @@ from ..utils import (
      clean_html,
      compiled_regex_type,
      ExtractorError,
+    int_or_none,
      RegexNotFoundError,
      sanitize_filename,
      unescapeHTML,
@@ -62,16 +65,19 @@ class InfoExtractor(object):
                      * tbr        Average bitrate of audio and video in KBit/s
                      * abr        Average audio bitrate in KBit/s
                      * acodec     Name of the audio codec in use
+                    * asr        Audio sampling rate in Hertz
                      * vbr        Average video bitrate in KBit/s
                      * vcodec     Name of the video codec in use
+                    * container  Name of the container format
                      * filesize   The number of bytes, if known in advance
+                    * filesize_approx  An estimate for the number of bytes
                      * player_url SWF Player URL (used for rtmpdump).
                      * protocol   The protocol that will be used for the actual
                                   download, lower-case.
-                                 "http", "https", "rtsp", "rtmp" or so.
+                                 "http", "https", "rtsp", "rtmp", "m3u8" or so.
                      * preference Order number of this format. If this field is
                                   present and not None, the formats get sorted
-                                 by this field.
+                                 by this field, regardless of all other values.
                                   -1 for default (order by other properties),
                                   -2 or smaller for less than default.
                      * quality    Order number of the video quality of this
@@ -85,12 +91,22 @@ class InfoExtractor(object):
  
      The following fields are optional:
  
-    thumbnails:     A list of dictionaries (with the entries "resolution" and
-                    "url") for the varying thumbnails
+    display_id      An alternative identifier for the video, not necessarily
+                    unique, but available before title. Typically, id is
+                    something like "4234987", title "Dancing naked mole rats",
+                    and display_id "dancing-naked-mole-rats"
+    thumbnails:     A list of dictionaries, with the following entries:
+                        * "url"
+                        * "width" (optional, int)
+                        * "height" (optional, int)
+                        * "resolution" (optional, string "{width}x{height}",
+                                        deprecated)
      thumbnail:      Full URL to a video thumbnail image.
      description:    One-line video description.
      uploader:       Full name of the video uploader.
+    timestamp:      UNIX timestamp of the moment the video became available.
      upload_date:    Video upload date (YYYYMMDD).
+                    If not explicitly set, calculated from timestamp.
      uploader_id:    Nickname or id of the video uploader.
      location:       Physical location of the video.
      subtitles:      The subtitle file contents as a dictionary in the format
@@ -104,6 +120,8 @@ class InfoExtractor(object):
      webpage_url:    The url to the video webpage, if given to youtube-dl it
                      should allow to get the same result again. (It will be set
                      by YoutubeDL if it's missing)
+    categories:     A list of categories that the video falls in, for example
+                    ["Sports", "Berlin"]
  
      Unless mentioned otherwise, the fields should be Unicode strings.
  
@@ -111,9 +129,6 @@ class InfoExtractor(object):
      _real_extract() methods and define a _VALID_URL regexp.
      Probably, they should also be added to the list of extractors.
  
-    _real_extract() must return a *list* of information dictionaries as
-    described above.
-
      Finally, the _WORKING attribute should be set to False for broken IEs
      in order to warn the users and skip the tests.
      """
@@ -219,6 +234,8 @@ class InfoExtractor(object):
                            webpage_bytes[:1024])
              if m:
                  encoding = m.group(1).decode('ascii')
+            elif webpage_bytes.startswith(b'\xff\xfe'):
+                encoding = 'utf-16'
              else:
                  encoding = 'utf-8'
          if self._downloader.params.get('dump_intermediate_pages', False):
@@ -234,13 +251,31 @@ class InfoExtractor(object):
                  url = url_or_request.get_full_url()
              except AttributeError:
                  url = url_or_request
-            raw_filename = ('%s_%s.dump' % (video_id, url))
+            basen = '%s_%s' % (video_id, url)
+            if len(basen) > 240:
+                h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
+                basen = basen[:240 - len(h)] + h
+            raw_filename = basen + '.dump'
              filename = sanitize_filename(raw_filename, restricted=True)
              self.to_screen(u'Saving request to ' + filename)
              with open(filename, 'wb') as outf:
                  outf.write(webpage_bytes)
  
-        content = webpage_bytes.decode(encoding, 'replace')
+        try:
+            content = webpage_bytes.decode(encoding, 'replace')
+        except LookupError:
+            content = webpage_bytes.decode('utf-8', 'replace')
+
+        if (u'<title>Access to this site is blocked</title>' in content and
+                u'Websense' in content[:512]):
+            msg = u'Access to this webpage has been blocked by Websense filtering software in your network.'
+            blocked_iframe = self._html_search_regex(
+                r'<iframe src="([^"]+)"', content,
+                u'Websense information URL', default=None)
+            if blocked_iframe:
+                msg += u' Visit %s for more details' % blocked_iframe
+            raise ExtractorError(msg, expected=True)
+
          return (content, urlh)
  
      def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
@@ -254,17 +289,27 @@ class InfoExtractor(object):
  
      def _download_xml(self, url_or_request, video_id,
                        note=u'Downloading XML', errnote=u'Unable to download XML',
-                      transform_source=None):
+                      transform_source=None, fatal=True):
          """Return the xml as an xml.etree.ElementTree.Element"""
-        xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
+        xml_string = self._download_webpage(
+            url_or_request, video_id, note, errnote, fatal=fatal)
+        if xml_string is False:
+            return xml_string
          if transform_source:
              xml_string = transform_source(xml_string)
          return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
  
      def _download_json(self, url_or_request, video_id,
                         note=u'Downloading JSON metadata',
-                       errnote=u'Unable to download JSON metadata'):
-        json_string = self._download_webpage(url_or_request, video_id, note, errnote)
+                       errnote=u'Unable to download JSON metadata',
+                       transform_source=None,
+                       fatal=True):
+        json_string = self._download_webpage(
+            url_or_request, video_id, note, errnote, fatal=fatal)
+        if (not fatal) and json_string is False:
+            return None
+        if transform_source:
+            json_string = transform_source(json_string)
          try:
              return json.loads(json_string)
          except ValueError as ve:
@@ -329,7 +374,8 @@ class InfoExtractor(object):
          else:
              for p in pattern:
                  mobj = re.search(p, string, flags)
-                if mobj: break
+                if mobj:
+                    break
  
          if os.name != 'nt' and sys.stderr.isatty():
              _name = u'\033[0;34m%s\033[0m' % name
@@ -391,7 +437,7 @@ class InfoExtractor(object):
      # Helper functions for extracting OpenGraph info
      @staticmethod
      def _og_regexes(prop):
-        content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
+        content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
          property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
          template = r'<meta[^>]+?%s[^>]+?%s'
          return [
@@ -421,14 +467,17 @@ class InfoExtractor(object):
          if secure: regexes = self._og_regexes('video:secure_url') + regexes
          return self._html_search_regex(regexes, html, name, **kargs)
  
-    def _html_search_meta(self, name, html, display_name=None):
+    def _og_search_url(self, html, **kargs):
+        return self._og_search_property('url', html, **kargs)
+
+    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
          if display_name is None:
              display_name = name
          return self._html_search_regex(
              r'''(?ix)<meta
-                    (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
+                    (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
                      [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
-            html, display_name, fatal=False)
+            html, display_name, fatal=fatal, **kwargs)
  
      def _dc_search_uploader(self, html):
          return self._html_search_meta('dc.creator', html, 'uploader')
@@ -457,7 +506,14 @@ class InfoExtractor(object):
          }
          return RATING_TABLE.get(rating.lower(), None)
  
+    def _twitter_search_player(self, html):
+        return self._html_search_meta('twitter:player', html,
+            'twitter card player')
+
      def _sort_formats(self, formats):
+        if not formats:
+            raise ExtractorError(u'No video formats found')
+
          def _formats_key(f):
              # TODO remove the following workaround
              from ..utils import determine_ext
@@ -506,10 +562,53 @@ class InfoExtractor(object):
                  f.get('abr') if f.get('abr') is not None else -1,
                  audio_ext_preference,
                  f.get('filesize') if f.get('filesize') is not None else -1,
+                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                  f.get('format_id'),
              )
          formats.sort(key=_formats_key)
  
+    def http_scheme(self):
+        """ Either "http:" or "https:", depending on the user's preferences """
+        return (
+            'http:'
+            if self._downloader.params.get('prefer_insecure', False)
+            else 'https:')
+
+    def _proto_relative_url(self, url, scheme=None):
+        if url is None:
+            return url
+        if url.startswith('//'):
+            if scheme is None:
+                scheme = self.http_scheme()
+            return scheme + url
+        else:
+            return url
+
+    def _sleep(self, timeout, video_id, msg_template=None):
+        if msg_template is None:
+            msg_template = u'%(video_id)s: Waiting for %(timeout)s seconds'
+        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
+        self.to_screen(msg)
+        time.sleep(timeout)
+
+    def _extract_f4m_formats(self, manifest_url, video_id):
+        manifest = self._download_xml(
+            manifest_url, video_id, 'Downloading f4m manifest',
+            'Unable to download f4m manifest')
+
+        formats = []
+        for media_el in manifest.findall('{http://ns.adobe.com/f4m/1.0}media'):
+            formats.append({
+                'url': manifest_url,
+                'ext': 'flv',
+                'tbr': int_or_none(media_el.attrib.get('bitrate')),
+                'width': int_or_none(media_el.attrib.get('width')),
+                'height': int_or_none(media_el.attrib.get('height')),
+            })
+        self._sort_formats(formats)
+
+        return formats
+
  
  class SearchInfoExtractor(InfoExtractor):
      """