Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/tudou.py

   1 # coding: utf-8
   2
   3 from __future__ import unicode_literals
   4
   5 from .common import InfoExtractor
   6 from ..compat import compat_str
   7
   8
   9 class TudouIE(InfoExtractor):
  10     _VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:listplay|programs(?:/view)?|albumplay)/([^/]+/)*(?P<id>[^/?#]+?)(?:\.html)?/?(?:$|[?#])'
  11     _TESTS = [{
  12         'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html',
  13         'md5': '140a49ed444bd22f93330985d8475fcb',
  14         'info_dict': {
  15             'id': '159448201',
  16             'ext': 'f4v',
  17             'title': '卡马乔国足开大脚长传冲吊集锦',
  18             'thumbnail': 're:^https?://.*\.jpg$',
  19         }
  20     }, {
  21         'url': 'http://www.tudou.com/programs/view/ajX3gyhL0pc/',
  22         'info_dict': {
  23             'id': '117049447',
  24             'ext': 'f4v',
  25             'title': 'La Sylphide-Bolshoi-Ekaterina Krysanova & Vyacheslav Lopatin 2012',
  26             'thumbnail': 're:^https?://.*\.jpg$',
  27         }
  28     }, {
  29         'url': 'http://www.tudou.com/albumplay/cJAHGih4yYg.html',
  30         'only_matching': True,
  31     }]
  32
  33     _PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf'
  34
  35     def _url_for_id(self, video_id, quality=None):
  36         info_url = 'http://v2.tudou.com/f?id=' + compat_str(video_id)
  37         if quality:
  38             info_url += '&hd' + quality
  39         xml_data = self._download_xml(info_url, video_id, "Opening the info XML page")
  40         final_url = xml_data.text
  41         return final_url
  42
  43     def _real_extract(self, url):
  44         video_id = self._match_id(url)
  45         webpage = self._download_webpage(url, video_id)
  46
  47         youku_vcode = self._search_regex(
  48             r'vcode\s*:\s*[\'"]([^\'"]*)[\'"]', webpage, 'youku vcode', default=None)
  49         if youku_vcode:
  50             return self.url_result('youku:' + youku_vcode, ie='Youku')
  51
  52         title = self._search_regex(
  53             r',kw\s*:\s*[\'"]([^\'"]+)[\'"]', webpage, 'title')
  54         thumbnail_url = self._search_regex(
  55             r',pic\s*:\s*[\'"]([^\'"]+)[\'"]', webpage, 'thumbnail URL', fatal=False)
  56
  57         player_url = self._search_regex(
  58             r'playerUrl\s*:\s*[\'"]([^\'"]+\.swf)[\'"]',
  59             webpage, 'player URL', default=self._PLAYER_URL)
  60
  61         segments = self._parse_json(self._search_regex(
  62             r'segs: \'([^\']+)\'', webpage, 'segments'), video_id)
  63         # It looks like the keys are the arguments that have to be passed as
  64         # the hd field in the request url, we pick the higher
  65         # Also, filter non-number qualities (see issue #3643).
  66         quality = sorted(filter(lambda k: k.isdigit(), segments.keys()),
  67                          key=lambda k: int(k))[-1]
  68         parts = segments[quality]
  69         result = []
  70         len_parts = len(parts)
  71         if len_parts > 1:
  72             self.to_screen('%s: found %s parts' % (video_id, len_parts))
  73         for part in parts:
  74             part_id = part['k']
  75             final_url = self._url_for_id(part_id, quality)
  76             ext = (final_url.split('?')[0]).split('.')[-1]
  77             part_info = {
  78                 'id': '%s' % part_id,
  79                 'url': final_url,
  80                 'ext': ext,
  81                 'title': title,
  82                 'thumbnail': thumbnail_url,
  83                 'http_headers': {
  84                     'Referer': player_url,
  85                 },
  86             }
  87             result.append(part_info)
  88
  89         return {
  90             '_type': 'multi_video',
  91             'entries': result,
  92             'id': video_id,
  93             'title': title,
  94         }