Fix extraction from Tudou.

[youtubedl] / youtube_dl / extractor / tudou.py
diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py

index 5f7ac4b35b6c4576e5fb0998d56eb21ec522b58a..7421378a8c771543b3418a7d1f54b40cacb324e1 100644 (file)
--- a/youtube_dl/extractor/tudou.py
+++ b/youtube_dl/extractor/tudou.py
@@ -3,92 +3,47 @@
  from __future__ import unicode_literals
  
  from .common import InfoExtractor
-from ..compat import compat_str
  
  
-class TudouIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?tudou\.com/(?:listplay|programs(?:/view)?|albumplay)/([^/]+/)*(?P<id>[^/?#]+?)(?:\.html)?/?(?:$|[?#])'
+class TudouPlaylistIE(InfoExtractor):
+    IE_NAME = 'tudou:playlist'
+    _VALID_URL = r'https?://(?:www\.)?tudou\.com/listplay/(?P<id>[\w-]{11})\.html'
      _TESTS = [{
-        'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html',
-        'md5': '140a49ed444bd22f93330985d8475fcb',
+        'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo.html',
          'info_dict': {
-            'id': '159448201',
-            'ext': 'f4v',
-            'title': '卡马乔国足开大脚长传冲吊集锦',
-            'thumbnail': 're:^https?://.*\.jpg$',
-        }
-    }, {
-        'url': 'http://www.tudou.com/programs/view/ajX3gyhL0pc/',
-        'info_dict': {
-            'id': '117049447',
-            'ext': 'f4v',
-            'title': 'La Sylphide-Bolshoi-Ekaterina Krysanova & Vyacheslav Lopatin 2012',
-            'thumbnail': 're:^https?://.*\.jpg$',
-        }
-    }, {
-        'url': 'http://www.tudou.com/albumplay/cJAHGih4yYg.html',
-        'only_matching': True,
+            'id': 'zzdE77v6Mmo',
+        },
+        'playlist_mincount': 209,
      }]
  
-    _PLAYER_URL = 'http://js.tudouui.com/bin/lingtong/PortalPlayer_177.swf'
-
-    def _url_for_id(self, video_id, quality=None):
-        info_url = 'http://v2.tudou.com/f?id=' + compat_str(video_id)
-        if quality:
-            info_url += '&hd' + quality
-        xml_data = self._download_xml(info_url, video_id, "Opening the info XML page")
-        final_url = xml_data.text
-        return final_url
-
      def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-
-        youku_vcode = self._search_regex(
-            r'vcode\s*:\s*[\'"]([^\'"]*)[\'"]', webpage, 'youku vcode', default=None)
-        if youku_vcode:
-            return self.url_result('youku:' + youku_vcode, ie='Youku')
-
-        title = self._search_regex(
-            r',kw\s*:\s*[\'"]([^\'"]+)[\'"]', webpage, 'title')
-        thumbnail_url = self._search_regex(
-            r',pic\s*:\s*[\'"]([^\'"]+)[\'"]', webpage, 'thumbnail URL', fatal=False)
-
-        player_url = self._search_regex(
-            r'playerUrl\s*:\s*[\'"]([^\'"]+\.swf)[\'"]',
-            webpage, 'player URL', default=self._PLAYER_URL)
-
-        segments = self._parse_json(self._search_regex(
-            r'segs: \'([^\']+)\'', webpage, 'segments'), video_id)
-        # It looks like the keys are the arguments that have to be passed as
-        # the hd field in the request url, we pick the higher
-        # Also, filter non-number qualities (see issue #3643).
-        quality = sorted(filter(lambda k: k.isdigit(), segments.keys()),
-                         key=lambda k: int(k))[-1]
-        parts = segments[quality]
-        result = []
-        len_parts = len(parts)
-        if len_parts > 1:
-            self.to_screen('%s: found %s parts' % (video_id, len_parts))
-        for part in parts:
-            part_id = part['k']
-            final_url = self._url_for_id(part_id, quality)
-            ext = (final_url.split('?')[0]).split('.')[-1]
-            part_info = {
-                'id': '%s' % part_id,
-                'url': final_url,
-                'ext': ext,
-                'title': title,
-                'thumbnail': thumbnail_url,
-                'http_headers': {
-                    'Referer': player_url,
-                },
-            }
-            result.append(part_info)
+        playlist_id = self._match_id(url)
+        playlist_data = self._download_json(
+            'http://www.tudou.com/tvp/plist.action?lcode=%s' % playlist_id, playlist_id)
+        entries = [self.url_result(
+            'http://www.tudou.com/programs/view/%s' % item['icode'],
+            'Tudou', item['icode'],
+            item['kw']) for item in playlist_data['items']]
+        return self.playlist_result(entries, playlist_id)
+
+
+class TudouAlbumIE(InfoExtractor):
+    IE_NAME = 'tudou:album'
+    _VALID_URL = r'https?://(?:www\.)?tudou\.com/album(?:cover|play)/(?P<id>[\w-]{11})'
+    _TESTS = [{
+        'url': 'http://www.tudou.com/albumplay/v5qckFJvNJg.html',
+        'info_dict': {
+            'id': 'v5qckFJvNJg',
+        },
+        'playlist_mincount': 45,
+    }]
  
-        return {
-            '_type': 'multi_video',
-            'entries': result,
-            'id': video_id,
-            'title': title,
-        }
+    def _real_extract(self, url):
+        album_id = self._match_id(url)
+        album_data = self._download_json(
+            'http://www.tudou.com/tvp/alist.action?acode=%s' % album_id, album_id)
+        entries = [self.url_result(
+            'http://www.tudou.com/programs/view/%s' % item['icode'],
+            'Tudou', item['icode'],
+            item['kw']) for item in album_data['items']]
+        return self.playlist_result(entries, album_id)