]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/nbc.py
3645d3033f74ae174e3eaa85ad55bbe677d9daba
   1 from __future__ 
import unicode_literals
 
   5 from .common 
import InfoExtractor
 
  16 class NBCIE(InfoExtractor
): 
  17     _VALID_URL 
= r
'http://www\.nbc\.com/(?:[^/]+/)+(?P<id>n?\d+)' 
  21             'url': 'http://www.nbc.com/the-tonight-show/segments/112966', 
  22             # md5 checksum is not stable 
  26                 'title': 'Jimmy Fallon Surprises Fans at Ben & Jerry\'s', 
  27                 'description': 'Jimmy gives out free scoops of his new "Tonight Dough" ice cream flavor by surprising customers at the Ben & Jerry\'s scoop shop.', 
  31             'url': 'http://www.nbc.com/the-tonight-show/episodes/176', 
  35                 'title': 'Ricky Gervais, Steven Van Zandt, ILoveMakonnen', 
  36                 'description': 'A brand new episode of The Tonight Show welcomes Ricky Gervais, Steven Van Zandt and ILoveMakonnen.', 
  38             'skip': 'Only works from US', 
  42     def _real_extract(self
, url
): 
  43         video_id 
= self
._match
_id
(url
) 
  44         webpage 
= self
._download
_webpage
(url
, video_id
) 
  45         theplatform_url 
= self
._search
_regex
( 
  46             '(?:class="video-player video-player-full" data-mpx-url|class="player" src)="(.*?)"', 
  47             webpage
, 'theplatform url').replace('_no_endcard', '') 
  48         if theplatform_url
.startswith('//'): 
  49             theplatform_url 
= 'http:' + theplatform_url
 
  50         return self
.url_result(theplatform_url
) 
  53 class NBCNewsIE(InfoExtractor
): 
  54     _VALID_URL 
= r
'''(?x)https?://(?:www\.)?nbcnews\.com/ 
  55         (?:video/.+?/(?P<id>\d+)| 
  56         (?:feature|nightly-news)/[^/]+/(?P<title>.+)) 
  61             'url': 'http://www.nbcnews.com/video/nbc-news/52753292', 
  62             'md5': '47abaac93c6eaf9ad37ee6c4463a5179', 
  66                 'title': 'Crew emerges after four-month Mars food study', 
  67                 'description': 'md5:24e632ffac72b35f8b67a12d1b6ddfc1', 
  71             'url': 'http://www.nbcnews.com/feature/edward-snowden-interview/how-twitter-reacted-snowden-interview-n117236', 
  72             'md5': 'b2421750c9f260783721d898f4c42063', 
  76                 'title': 'How Twitter Reacted To The Snowden Interview', 
  77                 'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64', 
  79             'add_ie': ['ThePlatform'], 
  82             'url': 'http://www.nbcnews.com/feature/dateline-full-episodes/full-episode-family-business-n285156', 
  83             'md5': 'fdbf39ab73a72df5896b6234ff98518a', 
  87                 'title': 'FULL EPISODE: Family Business', 
  88                 'description': 'md5:757988edbaae9d7be1d585eb5d55cc04', 
  92             'url': 'http://www.nbcnews.com/nightly-news/video/nightly-news-with-brian-williams-full-broadcast-february-4-394064451844', 
  93             'md5': 'b5dda8cddd8650baa0dcb616dd2cf60d', 
  97                 'title': 'Nightly News with Brian Williams Full Broadcast (February 4)', 
  98                 'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5', 
 103     def _real_extract(self
, url
): 
 104         mobj 
= re
.match(self
._VALID
_URL
, url
) 
 105         video_id 
= mobj
.group('id') 
 106         if video_id 
is not None: 
 107             all_info 
= self
._download
_xml
('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id
, video_id
) 
 108             info 
= all_info
.find('video') 
 112                 'title': info
.find('headline').text
, 
 114                 'url': find_xpath_attr(info
, 'media', 'type', 'flashVideo').text
, 
 115                 'description': compat_str(info
.find('caption').text
), 
 116                 'thumbnail': find_xpath_attr(info
, 'media', 'type', 'thumbnail').text
, 
 119             # "feature" and "nightly-news" pages use theplatform.com 
 120             title 
= mobj
.group('title') 
 121             webpage 
= self
._download
_webpage
(url
, title
) 
 122             bootstrap_json 
= self
._search
_regex
( 
 123                 r
'var\s+(?:bootstrapJson|playlistData)\s*=\s*({.+});?\s*$', 
 124                 webpage
, 'bootstrap json', flags
=re
.MULTILINE
) 
 125             bootstrap 
= self
._parse
_json
(bootstrap_json
, video_id
) 
 126             info 
= bootstrap
['results'][0]['video'] 
 127             mpxid 
= info
['mpxId'] 
 130                 info
['fallbackPlaylistUrl'], 
 131                 info
['associatedPlaylistUrl'], 
 134             for base_url 
in base_urls
: 
 137                 playlist_url 
= base_url 
+ '?form=MPXNBCNewsAPI' 
 140                     all_videos 
= self
._download
_json
(playlist_url
, title
) 
 141                 except ExtractorError 
as ee
: 
 142                     if isinstance(ee
.cause
, compat_HTTPError
): 
 146                 if not all_videos 
or 'videos' not in all_videos
: 
 150                     info 
= next(v 
for v 
in all_videos
['videos'] if v
['mpxId'] == mpxid
) 
 152                 except StopIteration: 
 156                 raise ExtractorError('Could not find video in playlists') 
 160                 # We get the best quality video 
 161                 'url': info
['videoAssets'][-1]['publicUrl'], 
 162                 'ie_key': 'ThePlatform',