]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/yahoo.py
   2 from __future__ 
import unicode_literals
 
   8 from .common 
import InfoExtractor
, SearchInfoExtractor
 
  23 from .brightcove 
import BrightcoveNewIE
 
  26 class YahooIE(InfoExtractor
): 
  27     IE_DESC 
= 'Yahoo screen and movies' 
  28     _VALID_URL 
= r
'(?P<url>https?://(?:(?P<country>[a-zA-Z]{2}(?:-[a-zA-Z]{2})?|malaysia)\.)?(?:[\da-zA-Z_-]+\.)?yahoo\.com/(?:[^/]+/)*(?P<id>[^?&#]*-[0-9]+(?:-[a-z]+)?)\.html)' 
  30         'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', 
  32             'id': '2d25e626-2378-391f-ada0-ddaf1417e588', 
  34             'title': 'Julian Smith & Travis Legg Watch Julian Smith', 
  35             'description': 'Julian and Travis watch Julian Smith', 
  37             'timestamp': 1369812016, 
  38             'upload_date': '20130529', 
  41         'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed', 
  42         'md5': '7993e572fac98e044588d0b5260f4352', 
  44             'id': '4fe78544-8d48-39d8-97cd-13f205d9fcdb', 
  46             'title': "Yahoo Saves 'Community'", 
  47             'description': 'md5:4d4145af2fd3de00cbb6c1d664105053', 
  49             'timestamp': 1406838636, 
  50             'upload_date': '20140731', 
  53         'url': 'https://uk.screen.yahoo.com/editor-picks/cute-raccoon-freed-drain-using-091756545.html', 
  54         'md5': '71298482f7c64cbb7fa064e4553ff1c1', 
  56             'id': 'b3affa53-2e14-3590-852b-0e0db6cd1a58', 
  58             'title': 'Cute Raccoon Freed From Drain\u00a0Using Angle Grinder', 
  59             'description': 'md5:f66c890e1490f4910a9953c941dee944', 
  61             'timestamp': 1414489862, 
  62             'upload_date': '20141028', 
  65         'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html', 
  66         'md5': '88e209b417f173d86186bef6e4d1f160', 
  68             'id': 'f885cf7f-43d4-3450-9fac-46ac30ece521', 
  70             'title': 'China Moses Is Crazy About the Blues', 
  71             'description': 'md5:9900ab8cd5808175c7b3fe55b979bed0', 
  73             'timestamp': 1385722202, 
  74             'upload_date': '20131129', 
  77         'url': 'https://www.yahoo.com/movies/v/true-story-trailer-173000497.html', 
  78         'md5': '2a9752f74cb898af5d1083ea9f661b58', 
  80             'id': '071c4013-ce30-3a93-a5b2-e0413cd4a9d1', 
  82             'title': '\'True Story\' Trailer', 
  83             'description': 'True Story', 
  85             'timestamp': 1418919206, 
  86             'upload_date': '20141218', 
  89         'url': 'https://gma.yahoo.com/pizza-delivery-man-surprised-huge-tip-college-kids-195200785.html', 
  90         'only_matching': True, 
  92         'note': 'NBC Sports embeds', 
  93         'url': 'http://sports.yahoo.com/blogs/ncaab-the-dagger/tyler-kalinoski-s-buzzer-beater-caps-davidson-s-comeback-win-185609842.html?guid=nbc_cbk_davidsonbuzzerbeater_150313', 
  97             'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d', 
  98             'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson', 
  99             'upload_date': '20150313', 
 100             'uploader': 'NBCU-SPORTS', 
 101             'timestamp': 1426270238, 
 104         'url': 'https://tw.news.yahoo.com/-100120367.html', 
 105         'only_matching': True, 
 107         # Query result is embedded in webpage, but explicit request to video API fails with geo restriction 
 108         'url': 'https://screen.yahoo.com/community/communitary-community-episode-1-ladders-154501237.html', 
 109         'md5': '4fbafb9c9b6f07aa8f870629f6671b35', 
 111             'id': '1f32853c-a271-3eef-8cb6-f6d6872cb504', 
 113             'title': 'Communitary - Community Episode 1: Ladders', 
 114             'description': 'md5:8fc39608213295748e1e289807838c97', 
 116             'timestamp': 1440436550, 
 117             'upload_date': '20150824', 
 118             'series': 'Communitary', 
 124         'url': 'https://tw.video.yahoo.com/movie-tw/單車天使-中文版預-092316541.html', 
 126             'id': 'ba133ff2-0793-3510-b636-59dfe9ff6cff', 
 128             'title': '單車天使 - 中文版預', 
 129             'description': '中文版預', 
 130             'timestamp': 1476696196, 
 131             'upload_date': '20161017', 
 134             'skip_download': True, 
 137         # Contains both a Yahoo hosted video and multiple Youtube embeds 
 138         'url': 'https://www.yahoo.com/entertainment/gwen-stefani-reveals-the-pop-hit-she-passed-on-assigns-it-to-her-voice-contestant-instead-033045672.html', 
 140             'id': '46c5d95a-528f-3d03-b732-732fcadd51de', 
 141             'title': 'Gwen Stefani reveals the pop hit she passed on, assigns it to her \'Voice\' contestant instead', 
 142             'description': 'Gwen decided not to record this hit herself, but she decided it was the perfect fit for Kyndall Inskeep.', 
 146                 'id': '966d4262-4fd1-3aaa-b45b-049ca6e38ba6', 
 148                 'title': 'Gwen Stefani reveals she turned down one of Sia\'s best songs', 
 149                 'description': 'On "The Voice" Tuesday, Gwen Stefani told Taylor Swift which Sia hit was almost hers.', 
 150                 'timestamp': 1572406500, 
 151                 'upload_date': '20191030', 
 157                 'title': 'Kyndal Inskeep "Performs the Hell Out of" Sia\'s "Elastic Heart" - The Voice Knockouts 2019', 
 158                 'description': 'md5:35b61e94c2ae214bc965ff4245f80d11', 
 159                 'uploader': 'The Voice', 
 160                 'uploader_id': 'NBCTheVoice', 
 161                 'upload_date': '20191029', 
 167         'expected_warnings': ['HTTP Error 404'], 
 169         'url': 'https://malaysia.news.yahoo.com/video/bystanders-help-ontario-policeman-bust-190932818.html', 
 170         'only_matching': True, 
 172         'url': 'https://es-us.noticias.yahoo.com/es-la-puerta-irrompible-que-110539379.html', 
 173         'only_matching': True, 
 175         'url': 'https://www.yahoo.com/entertainment/v/longtime-cbs-news-60-minutes-032036500-cbs.html', 
 176         'only_matching': True, 
 179     def _real_extract(self
, url
): 
 180         url
, country
, display_id 
= re
.match(self
._VALID
_URL
, url
).groups() 
 184             country 
= country
.split('-')[0] 
 185         api_base 
= 'https://%s.yahoo.com/_td/api/resource/' % country
 
 187         for i
, uuid 
in enumerate(['url=' + url
, 'ymedia-alias=' + display_id
]): 
 188             content 
= self
._download
_json
( 
 189                 api_base 
+ 'content;getDetailView=true;uuids=["%s"]' % uuid
, 
 190                 display_id
, 'Downloading content JSON metadata', fatal
=i 
== 1) 
 192                 item 
= content
['items'][0] 
 195         if item
.get('type') != 'video': 
 198             cover 
= item
.get('cover') or {} 
 199             if cover
.get('type') == 'yvideo': 
 200                 cover_url 
= cover
.get('url') 
 202                     entries
.append(self
.url_result( 
 203                         cover_url
, 'Yahoo', cover
.get('uuid'))) 
 205             for e 
in item
.get('body', []): 
 206                 if e
.get('type') == 'videoIframe': 
 207                     iframe_url 
= e
.get('url') 
 210                     entries
.append(self
.url_result(iframe_url
)) 
 212             return self
.playlist_result( 
 213                 entries
, item
.get('uuid'), 
 214                 item
.get('title'), item
.get('summary')) 
 216         video_id 
= item
['uuid'] 
 217         video 
= self
._download
_json
( 
 218             api_base 
+ 'VideoService.videos;view=full;video_ids=["%s"]' % video_id
, 
 219             video_id
, 'Downloading video JSON metadata')[0] 
 220         title 
= video
['title'] 
 222         if country 
== 'malaysia': 
 225         is_live 
= video
.get('live_state') == 'live' 
 226         fmts 
= ('m3u8',) if is_live 
else ('webm', 'mp4') 
 232             media_obj 
= self
._download
_json
( 
 233                 'https://video-api.yql.yahoo.com/v1/video/sapi/streams/' + video_id
, 
 234                 video_id
, 'Downloading %s JSON metadata' % fmt
, 
 235                 headers
=self
.geo_verification_headers(), query
={ 
 237                     'region': country
.upper(), 
 238                 })['query']['results']['mediaObj'][0] 
 239             msg 
= media_obj
.get('status', {}).get('msg') 
 241             for s 
in media_obj
.get('streams', []): 
 244                 if not host 
or not path
: 
 247                 if s
.get('format') == 'm3u8': 
 248                     formats
.extend(self
._extract
_m
3u8_formats
( 
 249                         s_url
, video_id
, 'mp4', m3u8_id
='hls', fatal
=False)) 
 251                 tbr 
= int_or_none(s
.get('bitrate')) 
 254                     'format_id': fmt 
+ ('-%d' % tbr 
if tbr 
else ''), 
 255                     'width': int_or_none(s
.get('width')), 
 256                     'height': int_or_none(s
.get('height')), 
 258                     'fps': int_or_none(s
.get('framerate')), 
 261             for cc 
in media_obj
.get('closedcaptions', []): 
 262                 cc_url 
= cc
.get('url') 
 263                 if not cc_url 
or cc_url 
in urls
: 
 266                 subtitles
.setdefault(cc
.get('lang') or 'en-US', []).append({ 
 268                     'ext': mimetype2ext(cc
.get('content_type')), 
 271         streaming_url 
= video
.get('streaming_url') 
 272         if streaming_url 
and not is_live
: 
 273             formats
.extend(self
._extract
_m
3u8_formats
( 
 274                 streaming_url
, video_id
, 'mp4', 
 275                 'm3u8_native', m3u8_id
='hls', fatal
=False)) 
 277         if not formats 
and msg 
== 'geo restricted': 
 278             self
.raise_geo_restricted() 
 280         self
._sort
_formats
(formats
) 
 283         for thumb 
in video
.get('thumbnails', []): 
 284             thumb_url 
= thumb
.get('url') 
 288                 'id': thumb
.get('tag'), 
 289                 'url': thumb
.get('url'), 
 290                 'width': int_or_none(thumb
.get('width')), 
 291                 'height': int_or_none(thumb
.get('height')), 
 294         series_info 
= video
.get('series_info') or {} 
 298             'title': self
._live
_title
(title
) if is_live 
else title
, 
 300             'display_id': display_id
, 
 301             'thumbnails': thumbnails
, 
 302             'description': clean_html(video
.get('description')), 
 303             'timestamp': parse_iso8601(video
.get('publish_time')), 
 304             'subtitles': subtitles
, 
 305             'duration': int_or_none(video
.get('duration')), 
 306             'view_count': int_or_none(video
.get('view_count')), 
 308             'series': video
.get('show_name'), 
 309             'season_number': int_or_none(series_info
.get('season_number')), 
 310             'episode_number': int_or_none(series_info
.get('episode_number')), 
 314 class YahooSearchIE(SearchInfoExtractor
): 
 315     IE_DESC 
= 'Yahoo screen search' 
 317     IE_NAME 
= 'screen.yahoo:search' 
 318     _SEARCH_KEY 
= 'yvsearch' 
 320     def _get_n_results(self
, query
, n
): 
 321         """Get a specified number of results for a query""" 
 323         for pagenum 
in itertools
.count(0): 
 324             result_url 
= 'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse
.quote_plus(query
), pagenum 
* 30) 
 325             info 
= self
._download
_json
(result_url
, query
, 
 326                                        note
='Downloading results page ' + str(pagenum 
+ 1)) 
 328             results 
= info
['results'] 
 330             for (i
, r
) in enumerate(results
): 
 331                 if (pagenum 
* 30) + i 
>= n
: 
 333                 mobj 
= re
.search(r
'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r
) 
 334                 e 
= self
.url_result('http://' + mobj
.group('url'), 'Yahoo') 
 336             if (pagenum 
* 30 + i 
>= n
) or (m
['last'] >= (m
['total'] - 1)): 
 346 class YahooGyaOPlayerIE(InfoExtractor
): 
 347     IE_NAME 
= 'yahoo:gyao:player' 
 348     _VALID_URL 
= r
'https?://(?:gyao\.yahoo\.co\.jp/(?:player|episode/[^/]+)|streaming\.yahoo\.co\.jp/c/y)/(?P<id>\d+/v\d+/v\d+|[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' 
 350         'url': 'https://gyao.yahoo.co.jp/player/00998/v00818/v0000000000000008564/', 
 352             'id': '5993125228001', 
 354             'title': 'フューリー 【字幕版】', 
 355             'description': 'md5:21e691c798a15330eda4db17a8fe45a5', 
 356             'uploader_id': '4235717419001', 
 357             'upload_date': '20190124', 
 358             'timestamp': 1548294365, 
 362             'skip_download': True, 
 365         'url': 'https://streaming.yahoo.co.jp/c/y/01034/v00133/v0000000000000000706/', 
 366         'only_matching': True, 
 368         'url': 'https://gyao.yahoo.co.jp/episode/%E3%81%8D%E3%81%AE%E3%81%86%E4%BD%95%E9%A3%9F%E3%81%B9%E3%81%9F%EF%BC%9F%20%E7%AC%AC2%E8%A9%B1%202019%2F4%2F12%E6%94%BE%E9%80%81%E5%88%86/5cb02352-b725-409e-9f8d-88f947a9f682', 
 369         'only_matching': True, 
 372     def _real_extract(self
, url
): 
 373         video_id 
= self
._match
_id
(url
).replace('/', ':') 
 374         video 
= self
._download
_json
( 
 375             'https://gyao.yahoo.co.jp/dam/v1/videos/' + video_id
, 
 377                 'fields': 'longDescription,title,videoId', 
 379                 'X-User-Agent': 'Unknown Pc GYAO!/2.0.0 Web', 
 382             '_type': 'url_transparent', 
 384             'title': video
['title'], 
 386                 'http://players.brightcove.net/4235717419001/SyG5P0gjb_default/index.html?videoId=' + video
['videoId'], 
 387                 {'geo_countries': ['JP']}), 
 388             'description': video
.get('longDescription'), 
 389             'ie_key': BrightcoveNewIE
.ie_key(), 
 393 class YahooGyaOIE(InfoExtractor
): 
 394     IE_NAME 
= 'yahoo:gyao' 
 395     _VALID_URL 
= r
'https?://(?:gyao\.yahoo\.co\.jp/(?:p|title/[^/]+)|streaming\.yahoo\.co\.jp/p/y)/(?P<id>\d+/v\d+|[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' 
 397         'url': 'https://gyao.yahoo.co.jp/p/00449/v03102/', 
 399             'id': '00449:v03102', 
 403         'url': 'https://streaming.yahoo.co.jp/p/y/01034/v00133/', 
 404         'only_matching': True, 
 406         'url': 'https://gyao.yahoo.co.jp/title/%E3%81%97%E3%82%83%E3%81%B9%E3%81%8F%E3%82%8A007/5b025a49-b2e5-4dc7-945c-09c6634afacf', 
 407         'only_matching': True, 
 410     def _real_extract(self
, url
): 
 411         program_id 
= self
._match
_id
(url
).replace('/', ':') 
 412         videos 
= self
._download
_json
( 
 413             'https://gyao.yahoo.co.jp/api/programs/%s/videos' % program_id
, program_id
)['videos'] 
 416             video_id 
= video
.get('id') 
 419             entries
.append(self
.url_result( 
 420                 'https://gyao.yahoo.co.jp/player/%s/' % video_id
.replace(':', '/'), 
 421                 YahooGyaOPlayerIE
.ie_key(), video_id
)) 
 422         return self
.playlist_result(entries
, program_id
) 
 425 class YahooJapanNewsIE(InfoExtractor
): 
 426     IE_NAME 
= 'yahoo:japannews' 
 427     IE_DESC 
= 'Yahoo! Japan News' 
 428     _VALID_URL 
= r
'https?://(?P<host>(?:news|headlines)\.yahoo\.co\.jp)[^\d]*(?P<id>\d[\d-]*\d)?' 
 429     _GEO_COUNTRIES 
= ['JP'] 
 431         'url': 'https://headlines.yahoo.co.jp/videonews/ann?a=20190716-00000071-ann-int', 
 435             'title': 'ムン大統領が対日批判を強化“現金化”効果は?(テレビ朝日系(ANN)) - Yahoo!ニュース', 
 436             'description': '韓国の元徴用工らを巡る裁判の原告が弁護士が差し押さえた三菱重工業の資産を売却して - Yahoo!ニュース(テレビ朝日系(ANN))', 
 437             'thumbnail': r
're:^https?://.*\.[a-zA-Z\d]{3,4}$', 
 440             'skip_download': True, 
 444         'url': 'https://headlines.yahoo.co.jp/hl?a=20190721-00000001-oxv-l04', 
 445         'only_matching': True, 
 447         'url': 'https://headlines.yahoo.co.jp/videonews/', 
 448         'only_matching': True, 
 450         'url': 'https://news.yahoo.co.jp', 
 451         'only_matching': True, 
 453         'url': 'https://news.yahoo.co.jp/byline/hashimotojunji/20190628-00131977/', 
 454         'only_matching': True, 
 456         'url': 'https://news.yahoo.co.jp/feature/1356', 
 457         'only_matching': True 
 460     def _extract_formats(self
, json_data
, content_id
): 
 463         video_data 
= try_get( 
 465             lambda x
: x
['ResultSet']['Result'][0]['VideoUrlSet']['VideoUrl'], 
 467         for vid 
in video_data 
or []: 
 468             delivery 
= vid
.get('delivery') 
 469             url 
= url_or_none(vid
.get('Url')) 
 470             if not delivery 
or not url
: 
 472             elif delivery 
== 'hls': 
 474                     self
._extract
_m
3u8_formats
( 
 475                         url
, content_id
, 'mp4', 'm3u8_native', 
 476                         m3u8_id
='hls', fatal
=False)) 
 480                     'format_id': 'http-%s' % compat_str(vid
.get('bitrate', '')), 
 481                     'height': int_or_none(vid
.get('height')), 
 482                     'width': int_or_none(vid
.get('width')), 
 483                     'tbr': int_or_none(vid
.get('bitrate')), 
 485         self
._remove
_duplicate
_formats
(formats
) 
 486         self
._sort
_formats
(formats
) 
 490     def _real_extract(self
, url
): 
 491         mobj 
= re
.match(self
._VALID
_URL
, url
) 
 492         host 
= mobj
.group('host') 
 493         display_id 
= mobj
.group('id') or host
 
 495         webpage 
= self
._download
_webpage
(url
, display_id
) 
 497         title 
= self
._html
_search
_meta
( 
 498             ['og:title', 'twitter:title'], webpage
, 'title', default
=None 
 499         ) or self
._html
_search
_regex
('<title>([^<]+)</title>', webpage
, 'title') 
 501         if display_id 
== host
: 
 502             # Headline page (w/ multiple BC playlists) ('news.yahoo.co.jp', 'headlines.yahoo.co.jp/videonews/', ...) 
 503             stream_plists 
= re
.findall(r
'plist=(\d+)', webpage
) or re
.findall(r
'plist["\']:\s
*["\']([^"\']+)', webpage) 
 507                         'http
://players
.brightcove
.net
/5690807595001/HyZNerRl7_default
/index
.html?playlistId
=%s' % plist_id, 
 508                         {'geo_countries
': ['JP
']}), 
 509                     ie='BrightcoveNew
', video_id=plist_id) 
 510                 for plist_id in stream_plists] 
 511             return self.playlist_result(entries, playlist_title=title) 
 514         description = self._html_search_meta( 
 515             ['og
:description
', 'description
', 'twitter
:description
'], 
 516             webpage, 'description
', default=None) 
 517         thumbnail = self._og_search_thumbnail( 
 518             webpage, default=None) or self._html_search_meta( 
 519             'twitter
:image
', webpage, 'thumbnail
', default=None) 
 520         space_id = self._search_regex([ 
 521             r'<script
[^
>]+class=["\']yvpub-player["\'][^
>]+spaceid
=([^
&"\']+)', 
 522             r'YAHOO\.JP\.srch\.\w+link\.onLoad[^;]+spaceID["\' ]*:["\' ]+([^"\']+)', 
 523             r'<!--\s
+SpaceID
=(\d
+)' 
 524         ], webpage, 'spaceid
') 
 526         content_id = self._search_regex( 
 527             r'<script
[^
>]+class=["\']yvpub-player["\'][^
>]+contentid
=(?P
<contentid
>[^
&"\']+)', 
 528             webpage, 'contentid', group='contentid') 
 530         json_data = self._download_json( 
 531             'https://feapi-yvpub.yahooapis.jp/v1/content/%s' % content_id, 
 534                 'appid': 'dj0zaiZpPVZMTVFJR0FwZWpiMyZzPWNvbnN1bWVyc2VjcmV0Jng9YjU-', 
 536                 'space_id': space_id, 
 538                 'ak': hashlib.md5('_'.join((space_id, host)).encode()).hexdigest(), 
 539                 'device_type': '1100', 
 541         formats = self._extract_formats(json_data, content_id) 
 546             'description': description, 
 547             'thumbnail': thumbnail,