# coding: utf-8
# Origin: youtube_dl/extractor/nytimes.py (youtubedl/blob view, "Raphaël G. Git Repositories").
from __future__ import unicode_literals

import base64
import hashlib
import hmac

from .common import InfoExtractor
# NOTE(review): the helpers below are called by the extractors in this module,
# but their import lines were not present in the recovered listing; they are
# assumed to live in the package-level ``utils`` module -- confirm.
from ..utils import (
    determine_ext,
    float_or_none,
    int_or_none,
    js_to_json,
    mimetype2ext,
    parse_iso8601,
    remove_start,
)
class NYTimesBaseIE(InfoExtractor):
    """Base class holding the video-ID based extraction shared by the NYTimes extractors.

    NOTE(review): this block was recovered from a listing in which original
    line numbers were fused into the code, tokens were split across lines and
    several lines were dropped. Control-flow lines that did not survive (the
    fallback guard, the ``continue`` statements, the list initialisers, the
    m3u8/mpd/else dispatch and the ``'id'``/``'title'``/``'formats'`` entries
    of the returned dict) were reconstructed from how the surviving lines use
    those names -- verify against upstream before relying on them.
    """

    # Key for the HMAC-SHA512 signature sent in the Authorization header.
    _SECRET = b'pX(2MbU2);4N{7J8)>YwKRJ+/pQ3JkiU2Q^V>mFYv6g6gYvt6v'

    def _extract_video_from_id(self, video_id):
        """Download the video JSON for ``video_id`` and build an info dict.

        The signed v3 endpoint is tried first; if it yields nothing the
        unsigned v2 endpoint is used instead.

        Returns a dict with the keys ``id``, ``title``, ``description``,
        ``timestamp``, ``uploader``, ``duration``, ``formats`` and
        ``thumbnails``.

        Raises KeyError if the JSON has no ``headline`` entry.
        """
        # Authorization generation algorithm is reverse engineered from `signer` in
        # http://graphics8.nytimes.com/video/vhs/vhs-2.x.min.js
        path = '/svc/video/api/v3/video/' + video_id
        hm = hmac.new(self._SECRET, (path + ':vhs').encode(), hashlib.sha512).hexdigest()
        video_data = self._download_json(
            'http://www.nytimes.com' + path, video_id, 'Downloading video JSON', headers={
                'Authorization': 'NYTV ' + base64.b64encode(hm.encode()).decode(),
                # NOTE(review): header not present in the recovered listing;
                # assumed from the ':vhs' suffix used when signing -- confirm.
                'X-NYTV': 'vhs',
            }, fatal=False)  # NOTE(review): `fatal=False` inferred from the fallback below
        if not video_data:
            video_data = self._download_json(
                'http://www.nytimes.com/svc/video/api/v2/video/' + video_id,
                video_id, 'Downloading video JSON')

        title = video_data['headline']

        def get_file_size(file_size):
            # The size is handled either as a plain int or as a dict carrying
            # it under 'value'; any other shape yields None.
            if isinstance(file_size, int):
                return file_size
            elif isinstance(file_size, dict):
                return int(file_size.get('value', 0))
            else:
                return None

        urls = []  # rendition URLs already handled, to skip duplicates
        formats = []
        # `or []` also covers an explicit JSON null for 'renditions'.
        for video in video_data.get('renditions') or []:
            video_url = video.get('url')
            format_id = video.get('type')
            if not video_url or format_id == 'thumbs' or video_url in urls:
                continue
            urls.append(video_url)
            ext = mimetype2ext(video.get('mimetype')) or determine_ext(video_url)
            if ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    video_url, video_id, 'mp4', 'm3u8_native',
                    m3u8_id=format_id or 'hls', fatal=False))
            elif ext == 'mpd':
                continue
            #     formats.extend(self._extract_mpd_formats(
            #         video_url, video_id, format_id or 'dash', fatal=False))
            else:
                formats.append({
                    'url': video_url,
                    'format_id': format_id,
                    'vcodec': video.get('videoencoding') or video.get('video_codec'),
                    'width': int_or_none(video.get('width')),
                    'height': int_or_none(video.get('height')),
                    'filesize': get_file_size(video.get('file_size') or video.get('fileSize')),
                    'tbr': int_or_none(video.get('bitrate'), 1000),
                    'ext': ext,  # NOTE(review): entry inferred; not in the recovered listing
                })
        self._sort_formats(formats)

        thumbnails = []
        # `or []` also covers an explicit JSON null for 'images'.
        for image in video_data.get('images') or []:
            image_url = image.get('url')
            if not image_url:
                continue
            thumbnails.append({
                'url': 'http://www.nytimes.com/' + image_url,
                'width': int_or_none(image.get('width')),
                'height': int_or_none(image.get('height')),
            })

        publication_date = video_data.get('publication_date')
        # The last 8 characters are cut off before parsing.
        # NOTE(review): presumably a trailing part parse_iso8601 cannot handle -- confirm.
        timestamp = parse_iso8601(publication_date[:-8]) if publication_date else None

        return {
            'id': video_id,
            'title': title,
            'description': video_data.get('summary'),
            'timestamp': timestamp,
            'uploader': video_data.get('byline'),
            'duration': float_or_none(video_data.get('duration'), 1000),
            'formats': formats,
            'thumbnails': thumbnails,
        }
class NYTimesIE(NYTimesBaseIE):
    """Extractor for URLs that carry the numeric video ID themselves.

    ``_VALID_URL`` accepts nytimes.com ``/video/...`` pages and
    graphics8.nytimes.com ``iframe/embed.html?videoId=`` URLs; the ID is the
    ``id`` group.
    """

    _VALID_URL = r'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P<id>\d+)'

    # NOTE(review): the `_TESTS = [{ ... 'info_dict': { ... } }]` scaffolding
    # was not present in the recovered listing and is reconstructed around the
    # surviving entries. The listing's numbering skips one line after 'id' and
    # one after 'uploader' in the first case; those expectations (if any) are
    # left out rather than guessed -- restore them from upstream.
    _TESTS = [{
        'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263',
        'md5': 'd665342765db043f7e225cff19df0f2d',
        'info_dict': {
            'id': '100000002847155',
            'title': 'Verbatim: What Is a Photocopier?',
            'description': 'md5:93603dada88ddbda9395632fdc5da260',
            'timestamp': 1398631707,
            'upload_date': '20140427',
            'uploader': 'Brett Weiner',
        }
    }, {
        'url': 'http://www.nytimes.com/video/travel/100000003550828/36-hours-in-dubai.html',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Take the video ID from ``url`` and delegate to the shared ID-based extraction."""
        video_id = self._match_id(url)

        return self._extract_video_from_id(video_id)
class NYTimesArticleIE(NYTimesBaseIE):
    """Extractor for nytimes.com article pages that embed a video or a podcast.

    The page is downloaded and searched first for a ``data-videoid``
    attribute, then for an ``NYTD.FlexTypes.push(...)`` podcast payload.

    NOTE(review): this block was recovered from a listing in which tokens were
    split across lines, spaces were injected into string and regex literals,
    and several lines were dropped. The literals below have the injected
    spaces removed; the dropped scaffolding (`_TESTS = [{`, `'info_dict': {`,
    `'params': {`, `return {` and the `'url'`/`'title'` entries of the podcast
    result) was reconstructed from the surrounding code -- verify against
    upstream.
    """

    _VALID_URL = r'https?://(?:www\.)?nytimes\.com/(.(?<!video))*?/(?:[^/]+/)*(?P<id>[^.]+)(?:\.html)?'

    # NOTE(review): the listing's numbering skips single lines inside the
    # info_dicts (after 'id' in the first case, after 'title' and after the
    # last entry in the podcast cases); those expectations are left out rather
    # than guessed -- restore them from upstream.
    _TESTS = [{
        'url': 'http://www.nytimes.com/2015/04/14/business/owner-of-gravity-payments-a-credit-card-processor-is-setting-a-new-minimum-wage-70000-a-year.html?_r=0',
        'md5': 'e2076d58b4da18e6a001d53fd56db3c9',
        'info_dict': {
            'id': '100000003628438',
            'title': 'New Minimum Wage: $70,000 a Year',
            'description': 'Dan Price, C.E.O. of Gravity Payments, surprised his 120-person staff by announcing that he planned over the next three years to raise the salary of every employee to $70,000 a year.',
            'timestamp': 1429033037,
            'upload_date': '20150414',
            'uploader': 'Matthew Williams',
        }
    }, {
        'url': 'http://www.nytimes.com/2016/10/14/podcasts/revelations-from-the-final-weeks.html',
        'md5': 'e0d52040cafb07662acf3c9132db3575',
        'info_dict': {
            'id': '100000004709062',
            'title': 'The Run-Up: ‘He Was Like an Octopus’',
            'description': 'md5:fb5c6b93b12efc51649b4847fe066ee4',
            'series': 'The Run-Up',
            'episode': '‘He Was Like an Octopus’',
            'episode_number': 20,
        }
    }, {
        'url': 'http://www.nytimes.com/2016/10/16/books/review/inside-the-new-york-times-book-review-the-rise-of-hitler.html',
        'info_dict': {
            'id': '100000004709479',
            'title': 'The Rise of Hitler',
            'description': 'md5:bce877fd9e3444990cb141875fab0028',
            'creator': 'Pamela Paul',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://www.nytimes.com/news/minute/2014/03/17/times-minute-whats-next-in-crimea/?_php=true&_type=blogs&_php=true&_type=blogs&_r=1',
        'only_matching': True,
    }]

    def _extract_podcast_from_json(self, json, page_id, webpage):
        """Build an info dict from the podcast payload found in an article page.

        ``json`` is the raw JavaScript object text; it is parsed with
        ``js_to_json`` as the source transform. ``page_id`` is used for
        reporting and as the fallback ``id``; ``webpage`` supplies the
        meta-tag description when the track has none.

        Raises KeyError if the payload lacks ``data``, ``data.track``, or the
        track's ``title`` / ``source`` entries.
        """
        podcast_audio = self._parse_json(
            json, page_id, transform_source=js_to_json)

        audio_data = podcast_audio['data']
        track = audio_data['track']

        episode_title = track['title']
        video_url = track['source']

        description = track.get('description') or self._html_search_meta(
            ['og:description', 'twitter:description'], webpage)

        # Looked up once; `or {}` also covers an explicit null 'podcast' entry.
        podcast = audio_data.get('podcast') or {}
        podcast_title = podcast.get('title')
        title = ('%s: %s' % (podcast_title, episode_title)
                 if podcast_title else episode_title)

        episode = podcast.get('episode') or ''
        episode_number = int_or_none(self._search_regex(
            r'[Ee]pisode\s+(\d+)', episode, 'episode number', default=None))

        return {
            # The 'target' value with a leading 'FT' removed, else the page ID.
            'id': remove_start(podcast_audio.get('target'), 'FT') or page_id,
            'url': video_url,
            'title': title,
            'description': description,
            'creator': track.get('credit'),
            'series': podcast_title,
            'episode': episode_title,
            'episode_number': episode_number,
            'duration': int_or_none(track.get('duration')),
        }

    def _real_extract(self, url):
        """Extract the embedded video if the page has one, otherwise the podcast."""
        page_id = self._match_id(url)

        webpage = self._download_webpage(url, page_id)

        video_id = self._search_regex(
            r'data-videoid=["\'](\d+)', webpage, 'video id',
            default=None, fatal=False)
        if video_id is not None:
            return self._extract_video_from_id(video_id)

        # Two patterns are passed: a non-greedy one that requires the closing
        # </script right after the call, and a greedy one without it.
        podcast_data = self._search_regex(
            (r'NYTD\.FlexTypes\.push\s*\(\s*({.+?})\s*\)\s*;\s*</script',
             r'NYTD\.FlexTypes\.push\s*\(\s*({.+})\s*\)\s*;'),
            webpage, 'podcast data')
        return self._extract_podcast_from_json(podcast_data, page_id, webpage)