Raphaël G. Git Repositories - youtubedl/blobdiff - youtube_dl/extractor/appletrailers.py
Merge tag 'upstream/2015.02.06'
[youtubedl] / youtube_dl / extractor / appletrailers.py
index 4befff3942cd5f17fddb48bfb3b4c7f7623af1d6..287f71e076e91a44ea331c995410fbe8b40d178d 100644 (file)
@@ -1,59 +1,64 @@
+from __future__ import unicode_literals
+
 import re
 import re
-import xml.etree.ElementTree
 import json
 
 from .common import InfoExtractor
 import json
 
 from .common import InfoExtractor
+from ..compat import compat_urlparse
 from ..utils import (
 from ..utils import (
-    compat_urlparse,
-    determine_ext,
+    int_or_none,
 )
 
 
 class AppleTrailersIE(InfoExtractor):
 )
 
 
 class AppleTrailersIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?trailers.apple.com/trailers/(?P<company>[^/]+)/(?P<movie>[^/]+)'
+    _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/trailers/(?P<company>[^/]+)/(?P<movie>[^/]+)'
     _TEST = {
     _TEST = {
-        u"url": u"http://trailers.apple.com/trailers/wb/manofsteel/",
-        u"playlist": [
+        "url": "http://trailers.apple.com/trailers/wb/manofsteel/",
+        "playlist": [
             {
             {
-                u"file": u"manofsteel-trailer4.mov",
-                u"md5": u"d97a8e575432dbcb81b7c3acb741f8a8",
-                u"info_dict": {
-                    u"duration": 111,
-                    u"title": u"Trailer 4",
-                    u"upload_date": u"20130523",
-                    u"uploader_id": u"wb",
+                "md5": "d97a8e575432dbcb81b7c3acb741f8a8",
+                "info_dict": {
+                    "id": "manofsteel-trailer4",
+                    "ext": "mov",
+                    "duration": 111,
+                    "title": "Trailer 4",
+                    "upload_date": "20130523",
+                    "uploader_id": "wb",
                 },
             },
             {
                 },
             },
             {
-                u"file": u"manofsteel-trailer3.mov",
-                u"md5": u"b8017b7131b721fb4e8d6f49e1df908c",
-                u"info_dict": {
-                    u"duration": 182,
-                    u"title": u"Trailer 3",
-                    u"upload_date": u"20130417",
-                    u"uploader_id": u"wb",
+                "md5": "b8017b7131b721fb4e8d6f49e1df908c",
+                "info_dict": {
+                    "id": "manofsteel-trailer3",
+                    "ext": "mov",
+                    "duration": 182,
+                    "title": "Trailer 3",
+                    "upload_date": "20130417",
+                    "uploader_id": "wb",
                 },
             },
             {
                 },
             },
             {
-                u"file": u"manofsteel-trailer.mov",
-                u"md5": u"d0f1e1150989b9924679b441f3404d48",
-                u"info_dict": {
-                    u"duration": 148,
-                    u"title": u"Trailer",
-                    u"upload_date": u"20121212",
-                    u"uploader_id": u"wb",
+                "md5": "d0f1e1150989b9924679b441f3404d48",
+                "info_dict": {
+                    "id": "manofsteel-trailer",
+                    "ext": "mov",
+                    "duration": 148,
+                    "title": "Trailer",
+                    "upload_date": "20121212",
+                    "uploader_id": "wb",
                 },
             },
             {
                 },
             },
             {
-                u"file": u"manofsteel-teaser.mov",
-                u"md5": u"5fe08795b943eb2e757fa95cb6def1cb",
-                u"info_dict": {
-                    u"duration": 93,
-                    u"title": u"Teaser",
-                    u"upload_date": u"20120721",
-                    u"uploader_id": u"wb",
+                "md5": "5fe08795b943eb2e757fa95cb6def1cb",
+                "info_dict": {
+                    "id": "manofsteel-teaser",
+                    "ext": "mov",
+                    "duration": 93,
+                    "title": "Teaser",
+                    "upload_date": "20120721",
+                    "uploader_id": "wb",
                 },
                 },
-            }
+            },
         ]
     }
 
         ]
     }
 
@@ -64,24 +69,26 @@ class AppleTrailersIE(InfoExtractor):
         movie = mobj.group('movie')
         uploader_id = mobj.group('company')
 
         movie = mobj.group('movie')
         uploader_id = mobj.group('company')
 
-        playlist_url = compat_urlparse.urljoin(url, u'includes/playlists/itunes.inc')
-        playlist_snippet = self._download_webpage(playlist_url, movie)
-        playlist_cleaned = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', playlist_snippet)
-        playlist_cleaned = re.sub(r'<img ([^<]*?)>', r'<img \1/>', playlist_cleaned)
-        # The ' in the onClick attributes are not escaped, it couldn't be parsed
-        # with xml.etree.ElementTree.fromstring
-        # like: http://trailers.apple.com/trailers/wb/gravity/
-        def _clean_json(m):
-            return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
-        playlist_cleaned = re.sub(self._JSON_RE, _clean_json, playlist_cleaned)
-        playlist_html = u'<html>' + playlist_cleaned + u'</html>'
-
-        doc = xml.etree.ElementTree.fromstring(playlist_html)
+        playlist_url = compat_urlparse.urljoin(url, 'includes/playlists/itunes.inc')
+
+        def fix_html(s):
+            s = re.sub(r'(?s)<script[^<]*?>.*?</script>', '', s)
+            s = re.sub(r'<img ([^<]*?)>', r'<img \1/>', s)
+            # The ' in the onClick attributes are not escaped, it couldn't be parsed
+            # like: http://trailers.apple.com/trailers/wb/gravity/
+
+            def _clean_json(m):
+                return 'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
+            s = re.sub(self._JSON_RE, _clean_json, s)
+            s = '<html>%s</html>' % s
+            return s
+        doc = self._download_xml(playlist_url, movie, transform_source=fix_html)
+
         playlist = []
         for li in doc.findall('./div/ul/li'):
             on_click = li.find('.//a').attrib['onClick']
             trailer_info_json = self._search_regex(self._JSON_RE,
         playlist = []
         for li in doc.findall('./div/ul/li'):
             on_click = li.find('.//a').attrib['onClick']
             trailer_info_json = self._search_regex(self._JSON_RE,
-                on_click, u'trailer info')
+                                                   on_click, 'trailer info')
             trailer_info = json.loads(trailer_info_json)
             title = trailer_info['title']
             video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower()
             trailer_info = json.loads(trailer_info_json)
             title = trailer_info['title']
             video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower()
@@ -97,8 +104,7 @@ class AppleTrailersIE(InfoExtractor):
             first_url = trailer_info['url']
             trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower()
             settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id)
             first_url = trailer_info['url']
             trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower()
             settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id)
-            settings_json = self._download_webpage(settings_json_url, trailer_id, u'Downloading settings json')
-            settings = json.loads(settings_json)
+            settings = self._download_json(settings_json_url, trailer_id, 'Downloading settings json')
 
             formats = []
             for format in settings['metadata']['sizes']:
 
             formats = []
             for format in settings['metadata']['sizes']:
@@ -106,24 +112,25 @@ class AppleTrailersIE(InfoExtractor):
                 format_url = re.sub(r'_(\d*p.mov)', r'_h\1', format['src'])
                 formats.append({
                     'url': format_url,
                 format_url = re.sub(r'_(\d*p.mov)', r'_h\1', format['src'])
                 formats.append({
                     'url': format_url,
-                    'ext': determine_ext(format_url),
                     'format': format['type'],
                     'format': format['type'],
-                    'width': format['width'],
-                    'height': int(format['height']),
+                    'width': int_or_none(format['width']),
+                    'height': int_or_none(format['height']),
                 })
                 })
-            formats = sorted(formats, key=lambda f: (f['height'], f['width']))
+
+            self._sort_formats(formats)
 
             playlist.append({
                 '_type': 'video',
                 'id': video_id,
 
             playlist.append({
                 '_type': 'video',
                 'id': video_id,
-                'title': title,
                 'formats': formats,
                 'title': title,
                 'duration': duration,
                 'thumbnail': thumbnail,
                 'upload_date': upload_date,
                 'uploader_id': uploader_id,
                 'formats': formats,
                 'title': title,
                 'duration': duration,
                 'thumbnail': thumbnail,
                 'upload_date': upload_date,
                 'uploader_id': uploader_id,
-                'user_agent': 'QuickTime compatible (youtube-dl)',
+                'http_headers': {
+                    'User-Agent': 'QuickTime compatible (youtube-dl)',
+                },
             })
 
         return {
             })
 
         return {