]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/youku.py
5b0b248cdd031588a958056e85f3efa9a6579eb4
   2 from __future__ 
import unicode_literals
 
   9 from .common 
import InfoExtractor
 
  19 class YoukuIE(InfoExtractor
): 
  25                 (?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)| 
  26                 video\.tudou\.com/v/)| 
  28         (?P<id>[A-Za-z0-9]+)(?:\.html|/v\.swf|) 
  33         'url': 'http://v.youku.com/v_show/id_XMTc1ODE5Njcy.html', 
  35             'id': 'XMTc1ODE5Njcy', 
  36             'title': '★Smile﹗♡ Git Fresh -Booty Music舞蹈.', 
  39             'thumbnail': r
're:^https?://.*', 
  41             'uploader_id': '36017967', 
  42             'uploader_url': 'http://i.youku.com/u/UMTQ0MDcxODY4', 
  46         'url': 'http://player.youku.com/player.php/sid/XNDgyMDQ2NTQw/v.swf', 
  47         'only_matching': True, 
  49         'url': 'http://v.youku.com/v_show/id_XODgxNjg1Mzk2_ev_1.html', 
  51             'id': 'XODgxNjg1Mzk2', 
  55             'thumbnail': r
're:^https?://.*', 
  57             'uploader_id': '62583473', 
  58             'uploader_url': 'http://i.youku.com/u/UMjUwMzMzODky', 
  62         'url': 'http://v.youku.com/v_show/id_XMTI1OTczNDM5Mg==.html', 
  64             'id': 'XMTI1OTczNDM5Mg', 
  68             'thumbnail': r
're:^https?://.*', 
  69             'uploader': '放剧场-花千骨', 
  70             'uploader_id': '772849359', 
  71             'uploader_url': 'http://i.youku.com/u/UMzA5MTM5NzQzNg==', 
  75         'url': 'http://v.youku.com/v_show/id_XNjA1NzA2Njgw.html', 
  76         'note': 'Video protected with password', 
  78             'id': 'XNjA1NzA2Njgw', 
  80             'title': '邢義田复旦讲座之想象中的胡人—从“左衽孔子”说起', 
  82             'thumbnail': r
're:^https?://.*', 
  83             'uploader': 'FoxJin1006', 
  84             'uploader_id': '322014285', 
  85             'uploader_url': 'http://i.youku.com/u/UMTI4ODA1NzE0MA==', 
  89             'videopassword': '100600', 
  92         # /play/get.json contains streams with "channel_type":"tail" 
  93         'url': 'http://v.youku.com/v_show/id_XOTUxMzg4NDMy.html', 
  95             'id': 'XOTUxMzg4NDMy', 
  97             'title': '我的世界☆明月庄主☆车震猎杀☆杀人艺术Minecraft', 
  99             'thumbnail': r
're:^https?://.*', 
 100             'uploader': '明月庄主moon', 
 101             'uploader_id': '38465621', 
 102             'uploader_url': 'http://i.youku.com/u/UMTUzODYyNDg0', 
 106         'url': 'http://video.tudou.com/v/XMjIyNzAzMTQ4NA==.html?f=46177805', 
 108             'id': 'XMjIyNzAzMTQ4NA', 
 110             'title': '卡马乔国足开大脚长传冲吊集锦', 
 112             'thumbnail': r
're:^https?://.*', 
 113             'uploader': '阿卜杜拉之星', 
 114             'uploader_id': '2382249', 
 115             'uploader_url': 'http://i.youku.com/u/UOTUyODk5Ng==', 
 119         'url': 'http://video.tudou.com/v/XMjE4ODI3OTg2MA==.html', 
 120         'only_matching': True, 
 125         return '%d%s' % (int(time
.time()), ''.join([ 
 126             random
.choice(string
.ascii_letters
) for i 
in range(3)])) 
 128     def get_format_name(self
, fm
): 
 143     def _real_extract(self
, url
): 
 144         video_id 
= self
._match
_id
(url
) 
 146         self
._set
_cookie
('youku.com', '__ysuid', self
.get_ysuid()) 
 147         self
._set
_cookie
('youku.com', 'xreferrer', 'http://www.youku.com') 
 149         _
, urlh 
= self
._download
_webpage
_handle
( 
 150             'https://log.mmstat.com/eg.js', video_id
, 'Retrieving cna info') 
 151         # The etag header is '"foobar"'; let's remove the double quotes 
 152         cna 
= urlh
.headers
['etag'][1:-1] 
 155         basic_data_params 
= { 
 158             'client_ip': '192.168.1.1', 
 160             'client_ts': time
.time() / 1000, 
 163         video_password 
= self
._downloader
.params
.get('videopassword') 
 165             basic_data_params
['password'] = video_password
 
 170         headers
.update(self
.geo_verification_headers()) 
 171         data 
= self
._download
_json
( 
 172             'https://ups.youku.com/ups/get.json', video_id
, 
 173             'Downloading JSON metadata', 
 174             query
=basic_data_params
, headers
=headers
)['data'] 
 176         error 
= data
.get('error') 
 178             error_note 
= error
.get('note') 
 179             if error_note 
is not None and '因版权原因无法观看此视频' in error_note
: 
 180                 raise ExtractorError( 
 181                     'Youku said: Sorry, this video is available in China only', expected
=True) 
 182             elif error_note 
and '该视频被设为私密' in error_note
: 
 183                 raise ExtractorError( 
 184                     'Youku said: Sorry, this video is private', expected
=True) 
 186                 msg 
= 'Youku server reported error %i' % error
.get('code') 
 187                 if error_note 
is not None: 
 188                     msg 
+= ': ' + error_note
 
 189                 raise ExtractorError(msg
) 
 192         video_data 
= data
['video'] 
 193         title 
= video_data
['title'] 
 196             'url': stream
['m3u8_url'], 
 197             'format_id': self
.get_format_name(stream
.get('stream_type')), 
 199             'protocol': 'm3u8_native', 
 200             'filesize': int(stream
.get('size')), 
 201             'width': stream
.get('width'), 
 202             'height': stream
.get('height'), 
 203         } for stream 
in data
['stream'] if stream
.get('channel_type') != 'tail'] 
 204         self
._sort
_formats
(formats
) 
 210             'duration': video_data
.get('seconds'), 
 211             'thumbnail': video_data
.get('logo'), 
 212             'uploader': video_data
.get('username'), 
 213             'uploader_id': str_or_none(video_data
.get('userid')), 
 214             'uploader_url': data
.get('uploader', {}).get('homepage'), 
 215             'tags': video_data
.get('tags'), 
 219 class YoukuShowIE(InfoExtractor
): 
 220     _VALID_URL 
= r
'https?://list\.youku\.com/show/id_(?P<id>[0-9a-z]+)\.html' 
 221     IE_NAME 
= 'youku:show' 
 224         'url': 'http://list.youku.com/show/id_zc7c670be07ff11e48b3f.html', 
 226             'id': 'zc7c670be07ff11e48b3f', 
 228             'description': 'md5:a1ae6f5618571bbeb5c9821f9c81b558', 
 230         'playlist_count': 50, 
 232         # Episode number not starting from 1 
 233         'url': 'http://list.youku.com/show/id_zefbfbd70efbfbd780bef.html', 
 235             'id': 'zefbfbd70efbfbd780bef', 
 237             'description': 'md5:275715156abebe5ccc2a1992e9d56b98', 
 239         'playlist_count': 24, 
 241         # Ongoing playlist. The initial page is the last one 
 242         'url': 'http://list.youku.com/show/id_za7c275ecd7b411e1a19e.html', 
 243         'only_matching': True, 
 246         'url': 'http://list.youku.com/show/id_zefbfbd61237fefbfbdef.html', 
 247         'only_matching': True, 
 249         #  Wrong number of reload_id. 
 250         'url': 'http://list.youku.com/show/id_z20eb4acaf5c211e3b2ad.html', 
 251         'only_matching': True, 
 254     def _extract_entries(self
, playlist_data_url
, show_id
, note
, query
): 
 255         query
['callback'] = 'cb' 
 256         playlist_data 
= self
._download
_json
( 
 257             playlist_data_url
, show_id
, query
=query
, note
=note
, 
 258             transform_source
=lambda s
: js_to_json(strip_jsonp(s
))).get('html') 
 259         if playlist_data 
is None: 
 261         drama_list 
= (get_element_by_class('p-drama-grid', playlist_data
) or 
 262                       get_element_by_class('p-drama-half-row', playlist_data
)) 
 263         if drama_list 
is None: 
 264             raise ExtractorError('No episodes found') 
 265         video_urls 
= re
.findall(r
'<a[^>]+href="([^"]+)"', drama_list
) 
 266         return playlist_data
, [ 
 267             self
.url_result(self
._proto
_relative
_url
(video_url
, 'http:'), YoukuIE
.ie_key()) 
 268             for video_url 
in video_urls
] 
 270     def _real_extract(self
, url
): 
 271         show_id 
= self
._match
_id
(url
) 
 272         webpage 
= self
._download
_webpage
(url
, show_id
) 
 275         page_config 
= self
._parse
_json
(self
._search
_regex
( 
 276             r
'var\s+PageConfig\s*=\s*({.+});', webpage
, 'page config'), 
 277             show_id
, transform_source
=js_to_json
) 
 278         first_page
, initial_entries 
= self
._extract
_entries
( 
 279             'http://list.youku.com/show/module', show_id
, 
 280             note
='Downloading initial playlist data page', 
 282                 'id': page_config
['showid'], 
 285         first_page_reload_id 
= self
._html
_search
_regex
( 
 286             r
'<div[^>]+id="(reload_\d+)', first_page
, 'first page reload id') 
 287         # The first reload_id has the same items as first_page 
 288         reload_ids 
= re
.findall('<li[^>]+data-id="([^"]+)">', first_page
) 
 289         entries
.extend(initial_entries
) 
 290         for idx
, reload_id 
in enumerate(reload_ids
): 
 291             if reload_id 
== first_page_reload_id
: 
 293             _
, new_entries 
= self
._extract
_entries
( 
 294                 'http://list.youku.com/show/episode', show_id
, 
 295                 note
='Downloading playlist data page %d' % (idx 
+ 1), 
 297                     'id': page_config
['showid'], 
 300             if new_entries 
is not None: 
 301                 entries
.extend(new_entries
) 
 302         desc 
= self
._html
_search
_meta
('description', webpage
, fatal
=False) 
 303         playlist_title 
= desc
.split(',')[0] if desc 
else None 
 304         detail_li 
= get_element_by_class('p-intro', webpage
) 
 305         playlist_description 
= get_element_by_class( 
 306             'intro-more', detail_li
) if detail_li 
else None 
 308         return self
.playlist_result( 
 309             entries
, show_id
, playlist_title
, playlist_description
)