X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/a316b1d93c357b5edf19d5e3100526a94191c029..d2e1a98478a93d5d191b83a9aa545df270ebf323:/youtube_dl/extractor/appletrailers.py?ds=sidebyside diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index 8b191c1..576f03b 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -1,163 +1,143 @@ +from __future__ import unicode_literals + import re -import xml.etree.ElementTree +import json from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( - determine_ext, + int_or_none, ) class AppleTrailersIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?trailers.apple.com/trailers/(?P[^/]+)/(?P[^/]+)' - _TEST = { - u"url": u"http://trailers.apple.com/trailers/wb/manofsteel/", - u"playlist": [ + _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/(?:trailers|ca)/(?P[^/]+)/(?P[^/]+)' + _TESTS = [{ + "url": "http://trailers.apple.com/trailers/wb/manofsteel/", + 'info_dict': { + 'id': 'manofsteel', + }, + "playlist": [ { - u"file": u"manofsteel-trailer4.mov", - u"md5": u"11874af099d480cc09e103b189805d5f", - u"info_dict": { - u"duration": 111, - u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_11624.jpg", - u"title": u"Trailer 4", - u"upload_date": u"20130523", - u"uploader_id": u"wb", + "md5": "d97a8e575432dbcb81b7c3acb741f8a8", + "info_dict": { + "id": "manofsteel-trailer4", + "ext": "mov", + "duration": 111, + "title": "Trailer 4", + "upload_date": "20130523", + "uploader_id": "wb", }, }, { - u"file": u"manofsteel-trailer3.mov", - u"md5": u"07a0a262aae5afe68120eed61137ab34", - u"info_dict": { - u"duration": 182, - u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_10793.jpg", - u"title": u"Trailer 3", - u"upload_date": u"20130417", - u"uploader_id": u"wb", + "md5": "b8017b7131b721fb4e8d6f49e1df908c", + "info_dict": { + "id": "manofsteel-trailer3", + "ext": "mov", + "duration": 182, + "title": "Trailer 3", + "upload_date": "20130417", + "uploader_id": "wb", }, }, { - u"file": u"manofsteel-trailer.mov", - u"md5": u"e401fde0813008e3307e54b6f384cff1", - u"info_dict": { - u"duration": 148, - u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_8703.jpg", - u"title": u"Trailer", - u"upload_date": u"20121212", - u"uploader_id": u"wb", + "md5": "d0f1e1150989b9924679b441f3404d48", + "info_dict": { + "id": "manofsteel-trailer", + "ext": "mov", + "duration": 148, + "title": "Trailer", + "upload_date": "20121212", + "uploader_id": "wb", }, }, { - u"file": u"manofsteel-teaser.mov", - u"md5": u"76b392f2ae9e7c98b22913c10a639c97", - u"info_dict": { - u"duration": 93, - u"thumbnail": u"http://trailers.apple.com/trailers/wb/manofsteel/images/thumbnail_6899.jpg", - u"title": u"Teaser", - u"upload_date": u"20120721", - u"uploader_id": u"wb", + "md5": "5fe08795b943eb2e757fa95cb6def1cb", + "info_dict": { + "id": "manofsteel-teaser", + "ext": "mov", + "duration": 93, + "title": "Teaser", + "upload_date": "20120721", + "uploader_id": "wb", }, - } + }, ] - } + }, { + 'url': 'http://trailers.apple.com/ca/metropole/autrui/', + 'only_matching': True, + }] + + _JSON_RE = r'iTunes.playURL\((.*?)\);' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) movie = mobj.group('movie') uploader_id = mobj.group('company') - playlist_url = url.partition(u'?')[0] + u'/includes/playlists/web.inc' - playlist_snippet = self._download_webpage(playlist_url, movie) - playlist_cleaned = re.sub(r'(?s)', u'', playlist_snippet) - playlist_html = u'' + playlist_cleaned + u'' + playlist_url = compat_urlparse.urljoin(url, 'includes/playlists/itunes.inc') - size_cache = {} + def fix_html(s): + s = re.sub(r'(?s).*?', '', s) + s = re.sub(r'', r'', s) + # The ' in the onClick attributes are not escaped, it couldn't be parsed + # like: http://trailers.apple.com/trailers/wb/gravity/ + + def _clean_json(m): + return 'iTunes.playURL(%s);' % m.group(1).replace('\'', ''') + s = re.sub(self._JSON_RE, _clean_json, s) + s = '%s' % s + return s + doc = self._download_xml(playlist_url, movie, transform_source=fix_html) - doc = xml.etree.ElementTree.fromstring(playlist_html) playlist = [] for li in doc.findall('./div/ul/li'): - title = li.find('.//h3').text + on_click = li.find('.//a').attrib['onClick'] + trailer_info_json = self._search_regex(self._JSON_RE, + on_click, 'trailer info') + trailer_info = json.loads(trailer_info_json) + title = trailer_info['title'] video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower() thumbnail = li.find('.//img').attrib['src'] + upload_date = trailer_info['posted'].replace('-', '') - date_el = li.find('.//p') - upload_date = None - m = re.search(r':\s?(?P[0-9]{2})/(?P[0-9]{2})/(?P[0-9]{2})', date_el.text) - if m: - upload_date = u'20' + m.group('year') + m.group('month') + m.group('day') - runtime_el = date_el.find('./br') - m = re.search(r':\s?(?P[0-9]+):(?P[0-9]{1,2})', runtime_el.tail) + runtime = trailer_info['runtime'] + m = re.search(r'(?P[0-9]+):(?P[0-9]{1,2})', runtime) duration = None if m: duration = 60 * int(m.group('minutes')) + int(m.group('seconds')) + first_url = trailer_info['url'] + trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower() + settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id) + settings = self._download_json(settings_json_url, trailer_id, 'Downloading settings json') + formats = [] - for formats_el in li.findall('.//a'): - if formats_el.attrib['class'] != 'OverlayPanel': - continue - target = formats_el.attrib['target'] - - format_code = formats_el.text - if 'Automatic' in format_code: - continue - - size_q = formats_el.attrib['href'] - size_id = size_q.rpartition('#videos-')[2] - if size_id not in size_cache: - size_url = url + size_q - sizepage_html = self._download_webpage( - size_url, movie, - note=u'Downloading size info %s' % size_id, - errnote=u'Error while downloading size info %s' % size_id, - ) - _doc = xml.etree.ElementTree.fromstring(sizepage_html) - size_cache[size_id] = _doc - - sizepage_doc = size_cache[size_id] - links = sizepage_doc.findall('.//{http://www.w3.org/1999/xhtml}ul/{http://www.w3.org/1999/xhtml}li/{http://www.w3.org/1999/xhtml}a') - for vid_a in links: - href = vid_a.get('href') - if not href.endswith(target): - continue - detail_q = href.partition('#')[0] - detail_url = url + '/' + detail_q - - m = re.match(r'includes/(?P[^/]+)/', detail_q) - detail_id = m.group('detail_id') - - detail_html = self._download_webpage( - detail_url, movie, - note=u'Downloading detail %s %s' % (detail_id, size_id), - errnote=u'Error while downloading detail %s %s' % (detail_id, size_id) - ) - detail_doc = xml.etree.ElementTree.fromstring(detail_html) - movie_link_el = detail_doc.find('.//{http://www.w3.org/1999/xhtml}a') - assert movie_link_el.get('class') == 'movieLink' - movie_link = movie_link_el.get('href').partition('?')[0].replace('_', '_h') - ext = determine_ext(movie_link) - assert ext == 'mov' - - formats.append({ - 'format': format_code, - 'ext': ext, - 'url': movie_link, - }) - - info = { + for format in settings['metadata']['sizes']: + # The src is a file pointing to the real video file + format_url = re.sub(r'_(\d*p.mov)', r'_h\1', format['src']) + formats.append({ + 'url': format_url, + 'format': format['type'], + 'width': int_or_none(format['width']), + 'height': int_or_none(format['height']), + }) + + self._sort_formats(formats) + + playlist.append({ '_type': 'video', 'id': video_id, - 'title': title, 'formats': formats, 'title': title, 'duration': duration, 'thumbnail': thumbnail, 'upload_date': upload_date, 'uploader_id': uploader_id, - 'user_agent': 'QuickTime compatible (youtube-dl)', - } - # TODO: Remove when #980 has been merged - info['url'] = formats[-1]['url'] - info['ext'] = formats[-1]['ext'] - - playlist.append(info) + 'http_headers': { + 'User-Agent': 'QuickTime compatible (youtube-dl)', + }, + }) return { '_type': 'playlist',