From: Rogério Brito Date: Wed, 10 Jul 2013 21:15:06 +0000 (-0300) Subject: Imported Upstream version 2013.07.10 X-Git-Url: https://git.rapsys.eu/youtubedl/commitdiff_plain/975f9519f0ae48e780767bf543204c6626a493b0?hp=-c Imported Upstream version 2013.07.10 --- 975f9519f0ae48e780767bf543204c6626a493b0 diff --git a/devscripts/youtube_genalgo.py b/devscripts/youtube_genalgo.py index c3d69e6..150c88d 100644 --- a/devscripts/youtube_genalgo.py +++ b/devscripts/youtube_genalgo.py @@ -20,9 +20,9 @@ tests = [ # 84 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?>.<", "<.>?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOIUYTREWe098765432rmnbvcxzasdfghjklpoiuyt1"), - # 83 + # 83 - vfl26ng3K 2013/07/10 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<", - "D.>/?;}[{=+_)(*&^%$#!MNBVCXeAS"), # 82 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.<", "Q>/?;}[{=+-(*<^%$#@!MNBVCXZASDFGHKLPOIUY8REWT0q&7654321mnbvcxzasdfghjklpoiuytrew9"), diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 39a5ee3..c73d0e4 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -61,6 +61,17 @@ class TestAllURLsMatching(unittest.TestCase): else: self.assertFalse(ie.suitable(url), '%s should not match URL %r' % (type(ie).__name__, url)) + def test_keywords(self): + ies = gen_extractors() + matching_ies = lambda url: [ie.IE_NAME for ie in ies + if ie.suitable(url) and ie.IE_NAME != 'generic'] + self.assertEqual(matching_ies(':ytsubs'), ['youtube:subscriptions']) + self.assertEqual(matching_ies(':ytsubscriptions'), ['youtube:subscriptions']) + self.assertEqual(matching_ies(':thedailyshow'), ['ComedyCentral']) + self.assertEqual(matching_ies(':tds'), ['ComedyCentral']) + self.assertEqual(matching_ies(':colbertreport'), ['ComedyCentral']) + self.assertEqual(matching_ies(':cr'), ['ComedyCentral']) + if __name__ == '__main__': unittest.main() diff --git a/test/test_youtube_sig.py 
b/test/test_youtube_sig.py index e87b625..e766042 100755 --- a/test/test_youtube_sig.py +++ b/test/test_youtube_sig.py @@ -45,7 +45,7 @@ class TestYoutubeSig(unittest.TestCase): def test_83(self): wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<" - right = "D.>/?;}[{=+_)(*&^%$#!MNBVCXeAS[^?/]+)(?:[?].*)?$' + _TEST = { + u"url": u"http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect", + u'file': u'XD300-23_68HighlightsAResearchCntAugHumanIntellect.ogv', + u'md5': u'8af1d4cf447933ed3c7f4871162602db', + u'info_dict': { + u"title": u"1968 Demo - FJCC Conference Presentation Reel #1", + u"description": u"Reel 1 of 3: Also known as the \"Mother of All Demos\", Doug Engelbart's presentation at the Fall Joint Computer Conference in San Francisco, December 9, 1968 titled \"A Research Center for Augmenting Human Intellect.\" For this presentation, Doug and his team astonished the audience by not only relating their research, but demonstrating it live. This was the debut of the mouse, interactive computing, hypermedia, computer supported software engineering, video teleconferencing, etc. See also Doug's 1968 Demo page for more background, highlights, links, and the detailed paper published in this conference proceedings. Filmed on 3 reels: Reel 1 | Reel 2 | Reel 3", + u"upload_date": u"19681210", + u"uploader": u"SRI International" + } + } + + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + json_url = url + (u'?' if u'?' 
in url else '&') + u'output=json' + json_data = self._download_webpage(json_url, video_id) + data = json.loads(json_data) + + title = data['metadata']['title'][0] + description = data['metadata']['description'][0] + uploader = data['metadata']['creator'][0] + upload_date = unified_strdate(data['metadata']['date'][0]) + + formats = [{ + 'format': fdata['format'], + 'url': 'http://' + data['server'] + data['dir'] + fn, + 'file_size': int(fdata['size']), + } + for fn,fdata in data['files'].items() + if 'Video' in fdata['format']] + formats.sort(key=lambda fdata: fdata['file_size']) + + info = { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': description, + 'uploader': uploader, + 'upload_date': upload_date, + } + thumbnail = data.get('misc', {}).get('image') + if thumbnail: + info['thumbnail'] = thumbnail + + # TODO: Remove when #980 has been merged + info['url'] = formats[-1]['url'] + info['ext'] = determine_ext(formats[-1]['url']) + + return self.video_result(info) \ No newline at end of file diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index 5793a41..dbf8eed 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -32,7 +32,7 @@ class ARDIE(InfoExtractor): # determine title and media streams from webpage html = self._download_webpage(url, video_id) title = re.search(self._TITLE, html).group('title') - streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)] + streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)] if not streams: assert '"fsk"' in html raise ExtractorError(u'This video is only available after 8:00 pm') diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 183274e..e7a91a1 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -1,11 +1,9 @@ import re import json +import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( - # This is used by the not implemented 
extractLiveStream method - compat_urllib_parse, - ExtractorError, unified_strdate, ) @@ -16,8 +14,8 @@ class ArteTvIE(InfoExtractor): www.arte.tv/guide, the extraction process is different for each one. The videos expire in 7 days, so we can't add tests. """ - _EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?:fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?' - _VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?:fr|de)/.*-(?P<id>.*?).html' + _EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?' + _VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?P<lang>fr|de)/.*-(?P<id>.*?).html' _LIVE_URL = r'index-[0-9]+\.html$' IE_NAME = u'arte.tv' @@ -27,6 +25,7 @@ class ArteTvIE(InfoExtractor): return any(re.match(regex, url) for regex in (cls._EMISSION_URL, cls._VIDEOS_URL)) # TODO implement Live Stream + # from ..utils import compat_urllib_parse # def extractLiveStream(self, url): # video_lang = url.split('/')[-4] # info = self.grep_webpage( @@ -56,23 +55,24 @@ class ArteTvIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._EMISSION_URL, url) if mobj is not None: - name = mobj.group('name') + lang = mobj.group('lang') # This is not a real id, it can be for example AJT for the news # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal video_id = mobj.group('id') - return self._extract_emission(url, video_id) + return self._extract_emission(url, video_id, lang) mobj = re.match(self._VIDEOS_URL, url) if mobj is not None: id = mobj.group('id') - return self._extract_video(url, id) + lang = mobj.group('lang') + return self._extract_video(url, id, lang) if re.search(self._LIVE_URL, video_id) is not None: raise ExtractorError(u'Arte live streams are not yet supported, sorry') # self.extractLiveStream(url) # return - def _extract_emission(self, url, video_id): + def _extract_emission(self, url, video_id, lang): """Extract from www.arte.tv/guide""" webpage = self._download_webpage(url, video_id) json_url = 
self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url') @@ -91,6 +91,16 @@ class ArteTvIE(InfoExtractor): } formats = player_info['VSR'].values() + def _match_lang(f): + # Return true if that format is in the language of the url + if lang == 'fr': + l = 'F' + elif lang == 'de': + l = 'A' + regexes = [r'VO?%s' % l, r'V%s-ST.' % l] + return any(re.match(r, f['versionCode']) for r in regexes) + # Some formats may not be in the same language as the url + formats = filter(_match_lang, formats) # We order the formats by quality formats = sorted(formats, key=lambda f: int(f['height'])) # Pick the best quality @@ -103,13 +113,15 @@ class ArteTvIE(InfoExtractor): return info_dict - def _extract_video(self, url, video_id): + def _extract_video(self, url, video_id, lang): """Extract from videos.arte.tv""" - config_xml_url = url.replace('/videos/', '/do_delegate/videos/') - config_xml_url = config_xml_url.replace('.html', ',view,asPlayerXml.xml') - config_xml = self._download_webpage(config_xml_url, video_id) - config_xml_url = self._html_search_regex(r'