]>
 
 
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/nytimes.py 
 
 
 
 
 
 
 
 
   2  from  __future__ 
import  unicode_literals
 
   8  from  . common 
import  InfoExtractor
 
  20  class  NYTimesBaseIE ( InfoExtractor
):  
  21      _SECRET 
=  b
'pX(2MbU2);4N{7J8)>YwKRJ+/pQ3JkiU2Q^V>mFYv6g6gYvt6v'  
  23      def  _extract_video_from_id ( self
,  video_id
):  
  24          # Authorization generation algorithm is reverse engineered from `signer` in  
  25          # http://graphics8.nytimes.com/video/vhs/vhs-2.x.min.js  
  26          path 
=  '/svc/video/api/v3/video/'  +  video_id
 
  27          hm 
=  hmac
. new ( self
._ SECRET
, ( path 
+  ':vhs' ). encode (),  hashlib
. sha512
). hexdigest ()  
  28          video_data 
=  self
._ download
_ json
( 'http://www.nytimes.com'  +  path
,  video_id
,  'Downloading video JSON' ,  headers
={  
  29              'Authorization' :  'NYTV '  +  base64
. b64encode ( hm
. encode ()). decode (),  
  33              video_data 
=  self
._ download
_ json
(  
  34                  'http://www.nytimes.com/svc/video/api/v2/video/'  +  video_id
,  
  35                  video_id
,  'Downloading video JSON' )  
  37          title 
=  video_data
[ 'headline' ]  
  39          def  get_file_size ( file_size
):  
  40              if  isinstance ( file_size
,  int ):  
  42              elif  isinstance ( file_size
,  dict ):  
  43                  return  int ( file_size
. get ( 'value' ,  0 ))  
  49          for  video 
in  video_data
. get ( 'renditions' , []):  
  50              video_url 
=  video
. get ( 'url' )  
  51              format_id 
=  video
. get ( 'type' )  
  52              if not  video_url 
or  format_id 
==  'thumbs'  or  video_url 
in  urls
:  
  54              urls
. append ( video_url
)  
  55              ext 
=  mimetype2ext ( video
. get ( 'mimetype' ))  or  determine_ext ( video_url
)  
  57                  formats
. extend ( self
._ extract
_ m
3u8_ formats
(  
  58                      video_url
,  video_id
,  'mp4' ,  'm3u8_native' ,  
  59                      m3u8_id
= format_id 
or  'hls' ,  fatal
= False ))  
  62              #     formats.extend(self._extract_mpd_formats(  
  63              #         video_url, video_id, format_id or 'dash', fatal=False))  
  67                      'format_id' :  format_id
,  
  68                      'vcodec' :  video
. get ( 'videoencoding' )  or  video
. get ( 'video_codec' ),  
  69                      'width' :  int_or_none ( video
. get ( 'width' )),  
  70                      'height' :  int_or_none ( video
. get ( 'height' )),  
  71                      'filesize' :  get_file_size ( video
. get ( 'file_size' )  or  video
. get ( 'fileSize' )),  
  72                      'tbr' :  int_or_none ( video
. get ( 'bitrate' ),  1000 ),  
  75          self
._ sort
_ formats
( formats
)  
  78          for  image 
in  video_data
. get ( 'images' , []):  
  79              image_url 
=  image
. get ( 'url' )  
  83                  'url' :  'http://www.nytimes.com/'  +  image_url
,  
  84                  'width' :  int_or_none ( image
. get ( 'width' )),  
  85                  'height' :  int_or_none ( image
. get ( 'height' )),  
  88          publication_date 
=  video_data
. get ( 'publication_date' )  
  89          timestamp 
=  parse_iso8601 ( publication_date
[:- 8 ])  if  publication_date 
else None  
  94              'description' :  video_data
. get ( 'summary' ),  
  95              'timestamp' :  timestamp
,  
  96              'uploader' :  video_data
. get ( 'byline' ),  
  97              'duration' :  float_or_none ( video_data
. get ( 'duration' ),  1000 ),  
  99              'thumbnails' :  thumbnails
,  
 103  class  NYTimesIE ( NYTimesBaseIE
):  
 104      _VALID_URL 
=  r
'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P<id>\d+)'  
 107          'url' :  'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263' ,  
 108          'md5' :  'd665342765db043f7e225cff19df0f2d' ,  
 110              'id' :  '100000002847155' ,  
 112              'title' :  'Verbatim: What Is a Photocopier?' ,  
 113              'description' :  'md5:93603dada88ddbda9395632fdc5da260' ,  
 114              'timestamp' :  1398631707 ,  
 115              'upload_date' :  '20140427' ,  
 116              'uploader' :  'Brett Weiner' ,  
 120          'url' :  'http://www.nytimes.com/video/travel/100000003550828/36-hours-in-dubai.html' ,  
 121          'only_matching' :  True ,  
 124      def  _real_extract ( self
,  url
):  
 125          video_id 
=  self
._ match
_ id
( url
)  
 127          return  self
._ extract
_ video
_ from
_ id
( video_id
)  
 130  class  NYTimesArticleIE ( NYTimesBaseIE
):  
 131      _VALID_URL 
=  r
'https?://(?:www\.)?nytimes\.com/(.(?<!video))*?/(?:[^/]+/)*(?P<id>[^.]+)(?:\.html)?'  
 133          'url' :  'http://www.nytimes.com/2015/04/14/business/owner-of-gravity-payments-a-credit-card-processor-is-setting-a-new-minimum-wage-70000-a-year.html?_r=0' ,  
 134          'md5' :  'e2076d58b4da18e6a001d53fd56db3c9' ,  
 136              'id' :  '100000003628438' ,  
 138              'title' :  'New Minimum Wage: $70,000 a Year' ,  
 139              'description' :  'Dan Price, C.E.O. of Gravity Payments, surprised his 120-person staff by announcing that he planned over the next three years to raise the salary of every employee to $70,000 a year.' ,  
 140              'timestamp' :  1429033037 ,  
 141              'upload_date' :  '20150414' ,  
 142              'uploader' :  'Matthew Williams' ,  
 145          'url' :  'http://www.nytimes.com/2016/10/14/podcasts/revelations-from-the-final-weeks.html' ,  
 146          'md5' :  'e0d52040cafb07662acf3c9132db3575' ,  
 148              'id' :  '100000004709062' ,  
 149              'title' :  'The Run-Up: ‘He Was Like an Octopus’' ,  
 151              'description' :  'md5:fb5c6b93b12efc51649b4847fe066ee4' ,  
 152              'series' :  'The Run-Up' ,  
 153              'episode' :  '‘He Was Like an Octopus’' ,  
 154              'episode_number' :  20 ,  
 158          'url' :  'http://www.nytimes.com/2016/10/16/books/review/inside-the-new-york-times-book-review-the-rise-of-hitler.html' ,  
 160              'id' :  '100000004709479' ,  
 161              'title' :  'The Rise of Hitler' ,  
 163              'description' :  'md5:bce877fd9e3444990cb141875fab0028' ,  
 164              'creator' :  'Pamela Paul' ,  
 168              'skip_download' :  True ,  
 171          'url' :  'http://www.nytimes.com/news/minute/2014/03/17/times-minute-whats-next-in-crimea/?_php=true&_type=blogs&_php=true&_type=blogs&_r=1' ,  
 172          'only_matching' :  True ,  
 175      def  _extract_podcast_from_json ( self
,  json
,  page_id
,  webpage
):  
 176          podcast_audio 
=  self
._ parse
_ json
(  
 177              json
,  page_id
,  transform_source
= js_to_json
)  
 179          audio_data 
=  podcast_audio
[ 'data' ]  
 180          track 
=  audio_data
[ 'track' ]  
 182          episode_title 
=  track
[ 'title' ]  
 183          video_url 
=  track
[ 'source' ]  
 185          description 
=  track
. get ( 'description' )  or  self
._ html
_ search
_ meta
(  
 186              [ 'og:description' ,  'twitter:description' ],  webpage
)  
 188          podcast_title 
=  audio_data
. get ( 'podcast' , {}). get ( 'title' )  
 189          title 
= ( ' %s :  %s '  % ( podcast_title
,  episode_title
)  
 190                   if  podcast_title 
else  episode_title
)  
 192          episode 
=  audio_data
. get ( 'podcast' , {}). get ( 'episode' )  or  ''  
 193          episode_number 
=  int_or_none ( self
._ search
_ regex
(  
 194              r
'[Ee]pisode\s+(\d+)' ,  episode
,  'episode number' ,  default
= None ))  
 197              'id' :  remove_start ( podcast_audio
. get ( 'target' ),  'FT' )  or  page_id
,  
 200              'description' :  description
,  
 201              'creator' :  track
. get ( 'credit' ),  
 202              'series' :  podcast_title
,  
 203              'episode' :  episode_title
,  
 204              'episode_number' :  episode_number
,  
 205              'duration' :  int_or_none ( track
. get ( 'duration' )),  
 208      def  _real_extract ( self
,  url
):  
 209          page_id 
=  self
._ match
_ id
( url
)  
 211          webpage 
=  self
._ download
_ webpage
( url
,  page_id
)  
 213          video_id 
=  self
._ search
_ regex
(  
 214              r
'data-videoid=["\' ]( \d
+) ', webpage, ' video 
id ',  
 215              default=None, fatal=False)  
 216          if video_id is not None:  
 217              return self._extract_video_from_id(video_id)  
 219          podcast_data = self._search_regex(  
 220              (r' NYTD\
. FlexTypes\
. push\s
* \
( \s
*({.+ ?
}) \s
* \
) \s
*; \s
*</ script
',  
 221               r' NYTD\
. FlexTypes\
. push\s
* \
( \s
*({.+}) \s
* \
) \s
*; '),  
 222              webpage, ' podcast data
')  
 223          return self._extract_podcast_from_json(podcast_data, page_id, webpage)