X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/848723ea972c09f28787db91d8c06e98a274ab89..d018d3313032e12968a6add6800e51d412e2f602:/youtube_dl/extractor/firstpost.py?ds=sidebyside diff --git a/youtube_dl/extractor/firstpost.py b/youtube_dl/extractor/firstpost.py index 7e3d1af..298227d 100644 --- a/youtube_dl/extractor/firstpost.py +++ b/youtube_dl/extractor/firstpost.py @@ -1,12 +1,9 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor class FirstpostIE(InfoExtractor): - IE_NAME = 'Firstpost.com' _VALID_URL = r'http://(?:www\.)?firstpost\.com/[^/]+/.*-(?P<id>[0-9]+)\.html' _TEST = { @@ -16,23 +13,38 @@ class FirstpostIE(InfoExtractor): 'id': '1025403', 'ext': 'mp4', 'title': 'India to launch indigenous aircraft carrier INS Vikrant today', - 'description': 'Its flight deck is over twice the size of a football field, its power unit can light up the entire Kochi city and the cabling is enough to cover the distance between here to Delhi.', + 'description': 'md5:feef3041cb09724e0bdc02843348f5f4', } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) + page = self._download_webpage(url, video_id) + + title = self._html_search_meta('twitter:title', page, 'title', fatal=True) + description = self._html_search_meta('twitter:description', page, 'title') + + data = self._download_xml( + 'http://www.firstpost.com/getvideoxml-%s.xml' % video_id, video_id, + 'Downloading video XML') + + item = data.find('./playlist/item') + thumbnail = item.find('./image').text - webpage = self._download_webpage(url, video_id) - video_url = self._html_search_regex( - r'', - webpage, 'video URL') + formats = [ + { + 'url': details.find('./file').text, + 'format_id': details.find('./label').text.strip(), + 'width': int(details.find('./width').text.strip()), + 'height': int(details.find('./height').text.strip()), + } for details in item.findall('./source/file_details') if
details.find('./file').text + ] + self._sort_formats(formats) return { 'id': video_id, - 'url': video_url, - 'title': self._og_search_title(webpage), - 'description': self._og_search_description(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'formats': formats, }