]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/nytimes.py 
   2  from  __future__ 
import  unicode_literals
   8  from  . common 
import  InfoExtractor
  20  class  NYTimesBaseIE ( InfoExtractor
):   21      _SECRET 
=  b
'pX(2MbU2);4N{7J8)>YwKRJ+/pQ3JkiU2Q^V>mFYv6g6gYvt6v'   23      def  _extract_video_from_id ( self
,  video_id
):   24          # Authorization generation algorithm is reverse engineered from `signer` in   25          # http://graphics8.nytimes.com/video/vhs/vhs-2.x.min.js   26          path 
=  '/svc/video/api/v3/video/'  +  video_id
  27          hm 
=  hmac
. new ( self
._ SECRET
, ( path 
+  ':vhs' ). encode (),  hashlib
. sha512
). hexdigest ()   28          video_data 
=  self
._ download
_ json
( 'http://www.nytimes.com'  +  path
,  video_id
,  'Downloading video JSON' ,  headers
={   29              'Authorization' :  'NYTV '  +  base64
. b64encode ( hm
. encode ()). decode (),   33              video_data 
=  self
._ download
_ json
(   34                  'http://www.nytimes.com/svc/video/api/v2/video/'  +  video_id
,   35                  video_id
,  'Downloading video JSON' )   37          title 
=  video_data
[ 'headline' ]   39          def  get_file_size ( file_size
):   40              if  isinstance ( file_size
,  int ):   42              elif  isinstance ( file_size
,  dict ):   43                  return  int ( file_size
. get ( 'value' ,  0 ))   49          for  video 
in  video_data
. get ( 'renditions' , []):   50              video_url 
=  video
. get ( 'url' )   51              format_id 
=  video
. get ( 'type' )   52              if not  video_url 
or  format_id 
==  'thumbs'  or  video_url 
in  urls
:   54              urls
. append ( video_url
)   55              ext 
=  mimetype2ext ( video
. get ( 'mimetype' ))  or  determine_ext ( video_url
)   57                  formats
. extend ( self
._ extract
_ m
3u8_ formats
(   58                      video_url
,  video_id
,  'mp4' ,  'm3u8_native' ,   59                      m3u8_id
= format_id 
or  'hls' ,  fatal
= False ))   62              #     formats.extend(self._extract_mpd_formats(   63              #         video_url, video_id, format_id or 'dash', fatal=False))   67                      'format_id' :  format_id
,   68                      'vcodec' :  video
. get ( 'videoencoding' )  or  video
. get ( 'video_codec' ),   69                      'width' :  int_or_none ( video
. get ( 'width' )),   70                      'height' :  int_or_none ( video
. get ( 'height' )),   71                      'filesize' :  get_file_size ( video
. get ( 'file_size' )  or  video
. get ( 'fileSize' )),   72                      'tbr' :  int_or_none ( video
. get ( 'bitrate' ),  1000 ),   75          self
._ sort
_ formats
( formats
)   78          for  image 
in  video_data
. get ( 'images' , []):   79              image_url 
=  image
. get ( 'url' )   83                  'url' :  'http://www.nytimes.com/'  +  image_url
,   84                  'width' :  int_or_none ( image
. get ( 'width' )),   85                  'height' :  int_or_none ( image
. get ( 'height' )),   88          publication_date 
=  video_data
. get ( 'publication_date' )   89          timestamp 
=  parse_iso8601 ( publication_date
[:- 8 ])  if  publication_date 
else None   94              'description' :  video_data
. get ( 'summary' ),   95              'timestamp' :  timestamp
,   96              'uploader' :  video_data
. get ( 'byline' ),   97              'duration' :  float_or_none ( video_data
. get ( 'duration' ),  1000 ),   99              'thumbnails' :  thumbnails
,  103  class  NYTimesIE ( NYTimesBaseIE
):  104      _VALID_URL 
=  r
'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P<id>\d+)'  107          'url' :  'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263' ,  108          'md5' :  'd665342765db043f7e225cff19df0f2d' ,  110              'id' :  '100000002847155' ,  112              'title' :  'Verbatim: What Is a Photocopier?' ,  113              'description' :  'md5:93603dada88ddbda9395632fdc5da260' ,  114              'timestamp' :  1398631707 ,  115              'upload_date' :  '20140427' ,  116              'uploader' :  'Brett Weiner' ,  120          'url' :  'http://www.nytimes.com/video/travel/100000003550828/36-hours-in-dubai.html' ,  121          'only_matching' :  True ,  124      def  _real_extract ( self
,  url
):  125          video_id 
=  self
._ match
_ id
( url
)  127          return  self
._ extract
_ video
_ from
_ id
( video_id
)  130  class  NYTimesArticleIE ( NYTimesBaseIE
):  131      _VALID_URL 
=  r
'https?://(?:www\.)?nytimes\.com/(.(?<!video))*?/(?:[^/]+/)*(?P<id>[^.]+)(?:\.html)?'  133          'url' :  'http://www.nytimes.com/2015/04/14/business/owner-of-gravity-payments-a-credit-card-processor-is-setting-a-new-minimum-wage-70000-a-year.html?_r=0' ,  134          'md5' :  'e2076d58b4da18e6a001d53fd56db3c9' ,  136              'id' :  '100000003628438' ,  138              'title' :  'New Minimum Wage: $70,000 a Year' ,  139              'description' :  'Dan Price, C.E.O. of Gravity Payments, surprised his 120-person staff by announcing that he planned over the next three years to raise the salary of every employee to $70,000 a year.' ,  140              'timestamp' :  1429033037 ,  141              'upload_date' :  '20150414' ,  142              'uploader' :  'Matthew Williams' ,  145          'url' :  'http://www.nytimes.com/2016/10/14/podcasts/revelations-from-the-final-weeks.html' ,  146          'md5' :  'e0d52040cafb07662acf3c9132db3575' ,  148              'id' :  '100000004709062' ,  149              'title' :  'The Run-Up: ‘He Was Like an Octopus’' ,  151              'description' :  'md5:fb5c6b93b12efc51649b4847fe066ee4' ,  152              'series' :  'The Run-Up' ,  153              'episode' :  '‘He Was Like an Octopus’' ,  154              'episode_number' :  20 ,  158          'url' :  'http://www.nytimes.com/2016/10/16/books/review/inside-the-new-york-times-book-review-the-rise-of-hitler.html' ,  160              'id' :  '100000004709479' ,  161              'title' :  'The Rise of Hitler' ,  163              'description' :  'md5:bce877fd9e3444990cb141875fab0028' ,  164              'creator' :  'Pamela Paul' ,  168              'skip_download' :  True ,  171          'url' :  'http://www.nytimes.com/news/minute/2014/03/17/times-minute-whats-next-in-crimea/?_php=true&_type=blogs&_php=true&_type=blogs&_r=1' ,  172          'only_matching' :  True ,  175      def  _extract_podcast_from_json ( self
,  json
,  page_id
,  webpage
):  176          podcast_audio 
=  self
._ parse
_ json
(  177              json
,  page_id
,  transform_source
= js_to_json
)  179          audio_data 
=  podcast_audio
[ 'data' ]  180          track 
=  audio_data
[ 'track' ]  182          episode_title 
=  track
[ 'title' ]  183          video_url 
=  track
[ 'source' ]  185          description 
=  track
. get ( 'description' )  or  self
._ html
_ search
_ meta
(  186              [ 'og:description' ,  'twitter:description' ],  webpage
)  188          podcast_title 
=  audio_data
. get ( 'podcast' , {}). get ( 'title' )  189          title 
= ( ' %s :  %s '  % ( podcast_title
,  episode_title
)  190                   if  podcast_title 
else  episode_title
)  192          episode 
=  audio_data
. get ( 'podcast' , {}). get ( 'episode' )  or  ''  193          episode_number 
=  int_or_none ( self
._ search
_ regex
(  194              r
'[Ee]pisode\s+(\d+)' ,  episode
,  'episode number' ,  default
= None ))  197              'id' :  remove_start ( podcast_audio
. get ( 'target' ),  'FT' )  or  page_id
,  200              'description' :  description
,  201              'creator' :  track
. get ( 'credit' ),  202              'series' :  podcast_title
,  203              'episode' :  episode_title
,  204              'episode_number' :  episode_number
,  205              'duration' :  int_or_none ( track
. get ( 'duration' )),  208      def  _real_extract ( self
,  url
):  209          page_id 
=  self
._ match
_ id
( url
)  211          webpage 
=  self
._ download
_ webpage
( url
,  page_id
)  213          video_id 
=  self
._ search
_ regex
(  214              r
'data-videoid=["\' ]( \d
+) ', webpage, ' video 
id ',  215              default=None, fatal=False)  216          if video_id is not None:  217              return self._extract_video_from_id(video_id)  219          podcast_data = self._search_regex(  220              (r' NYTD\
. FlexTypes\
. push\s
* \
( \s
*({.+ ?
}) \s
* \
) \s
*; \s
*</ script
',  221               r' NYTD\
. FlexTypes\
. push\s
* \
( \s
*({.+}) \s
* \
) \s
*; '),  222              webpage, ' podcast data
')  223          return self._extract_podcast_from_json(podcast_data, page_id, webpage)