]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/washingtonpost.py
2 from __future__
import unicode_literals
6 from . common
import InfoExtractor
13 class WashingtonPostIE ( InfoExtractor
):
14 IE_NAME
= 'washingtonpost'
15 _VALID_URL
= r
'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/video/(?:[^/]+/)*)(?P<id>[\da-f] {8} -[\da-f] {4} -[\da-f] {4} -[\da-f] {4} -[\da-f] {12} )'
16 _EMBED_URL
= r
'https?://(?:www\.)?washingtonpost\.com/video/c/embed/[\da-f] {8} -[\da-f] {4} -[\da-f] {4} -[\da-f] {4} -[\da-f] {12} '
18 'url' : 'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d' ,
19 'md5' : '6f537e1334b714eb15f9563bd4b9cdfa' ,
21 'id' : '480ba4ee-1ec7-11e6-82c2-a7dcb313287d' ,
23 'title' : 'Egypt finds belongings, debris from plane crash' ,
24 'description' : 'md5:a17ceee432f215a5371388c1f680bd86' ,
25 'upload_date' : '20160520' ,
26 'uploader' : 'Reuters' ,
27 'timestamp' : 1463778452 ,
32 def _extract_urls ( cls
, webpage
):
34 r
'<iframe[^>]+\bsrc=["\' ]( %s) ' % cls._EMBED_URL, webpage)
36 def _real_extract(self, url):
37 video_id = self._match_id(url)
38 video_data = self._download_json(
39 ' http
:// www
. washingtonpost
. com
/ posttv
/ c
/ videojson
/ %s ?resType
= jsonp
' % video_id,
40 video_id, transform_source=strip_jsonp)[0][' contentConfig
']
41 title = video_data[' title
']
45 for s in video_data.get(' streams
', []):
47 if not s_url or s_url in urls:
50 video_type = s.get(' type ')
51 if video_type == ' smil
':
53 elif video_type in (' ts
', ' hls
') and (' _master
. m3u8
' in s_url or ' _mobile
. m3u8
' in s_url):
54 m3u8_formats = self._extract_m3u8_formats(
55 s_url, video_id, ' mp4
', ' m3u8_native
', m3u8_id=' hls
', fatal=False)
56 for m3u8_format in m3u8_formats:
57 width = m3u8_format.get(' width
')
60 vbr = self._search_regex(
61 r' %d _ %d _ ( \d
+) ' % (width, m3u8_format[' height
']), m3u8_format[' url
'], ' vbr
', default=None)
64 ' vbr
': int_or_none(vbr),
66 formats.extend(m3u8_formats)
68 width = int_or_none(s.get(' width
'))
69 vbr = int_or_none(s.get(' bitrate
'))
70 has_width = width != 0
73 ' %s-%d-%d ' % (video_type, width, vbr)
76 ' vbr
': vbr if has_width else None,
78 ' height
': int_or_none(s.get(' height
')),
79 ' acodec
': s.get(' audioCodec
'),
80 ' vcodec
': s.get(' videoCodec
') if has_width else ' none
',
81 ' filesize
': int_or_none(s.get(' fileSize
')),
84 ' protocol
': ' m3u8_native
' if video_type in (' ts
', ' hls
') else None,
86 source_media_url = video_data.get(' sourceMediaURL
')
89 ' format_id
': ' source_media
',
90 ' url
': source_media_url,
93 formats, (' width
', ' height
', ' vbr
', ' filesize
', ' tbr
', ' format_id
'))
98 ' description
': video_data.get(' blurb
'),
99 ' uploader
': video_data.get(' credits
', {}).get(' source
'),
101 ' duration
': int_or_none(video_data.get(' videoDuration
'), 100),
102 ' timestamp
': int_or_none(
103 video_data.get(' dateConfig
', {}).get(' dateFirstPublished
'), 1000),
107 class WashingtonPostArticleIE(InfoExtractor):
108 IE_NAME = ' washingtonpost
: article
'
109 _VALID_URL = r' https?
://( ?
: www\
.) ?washingtonpost\
. com
/( ?
:[ ^
/]+/)*( ?P
< id >[ ^
/ ?
#]+)'
111 'url' : 'http://www.washingtonpost.com/sf/national/2014/03/22/sinkhole-of-bureaucracy/' ,
113 'id' : 'sinkhole-of-bureaucracy' ,
114 'title' : 'Sinkhole of bureaucracy' ,
117 'md5' : 'b9be794ceb56c7267d410a13f99d801a' ,
119 'id' : 'fc433c38-b146-11e3-b8b3-44b1d1cd4c1f' ,
121 'title' : 'Breaking Points: The Paper Mine' ,
123 'description' : 'Overly complicated paper pushing is nothing new to government bureaucracy. But the way federal retirement applications are filed may be the most outdated. David Fahrenthold explains.' ,
124 'uploader' : 'The Washington Post' ,
125 'timestamp' : 1395527908 ,
126 'upload_date' : '20140322' ,
129 'md5' : '1fff6a689d8770966df78c8cb6c8c17c' ,
131 'id' : '41255e28-b14a-11e3-b8b3-44b1d1cd4c1f' ,
133 'title' : 'The town bureaucracy sustains' ,
134 'description' : 'Underneath the friendly town of Boyers is a sea of government paperwork. In a disused limestone mine, hundreds of locals now track, file and process retirement applications for the federal government. We set out to find out what it \' s like to do paperwork 230 feet underground.' ,
136 'timestamp' : 1395528005 ,
137 'upload_date' : '20140322' ,
138 'uploader' : 'The Washington Post' ,
142 'url' : 'http://www.washingtonpost.com/blogs/wonkblog/wp/2014/12/31/one-airline-figured-out-how-to-make-sure-its-airplanes-never-disappear/' ,
144 'id' : 'one-airline-figured-out-how-to-make-sure-its-airplanes-never-disappear' ,
145 'title' : 'One airline figured out how to make sure its airplanes never disappear' ,
148 'md5' : 'a7c1b5634ba5e57a6a82cdffa5b1e0d0' ,
150 'id' : '0e4bb54c-9065-11e4-a66f-0ca5037a597d' ,
152 'description' : 'Washington Post transportation reporter Ashley Halsey III explains why a plane \' s black box needs to be recovered from a crash site instead of having its information streamed in real time throughout the flight.' ,
153 'upload_date' : '20141230' ,
154 'uploader' : 'The Washington Post' ,
155 'timestamp' : 1419974765 ,
156 'title' : 'Why black boxes don’t transmit data in real time' ,
def suitable(cls, url):
    """Tell whether this extractor should handle ``url``.

    Any URL that ``WashingtonPostIE.suitable`` accepts is rejected here;
    every other URL is decided by the parent class's ``suitable``.
    """
    # Give WashingtonPostIE first refusal so its URLs are never claimed here.
    if WashingtonPostIE.suitable(url):
        return False
    return super(WashingtonPostArticleIE, cls).suitable(url)
165 def _real_extract ( self
, url
):
166 page_id
= self
._ match
_ id
( url
)
167 webpage
= self
._ download
_ webpage
( url
, page_id
)
169 title
= self
._ og
_ search
_ title
( webpage
)
171 uuids
= re
. findall ( r
'''(?x)
173 <div\s+class="posttv-video-embed[^>]*?data-uuid=|
175 )"([^"]+)"''' , webpage
)
176 entries
= [ self
. url_result ( 'washingtonpost: %s ' % uuid
, 'WashingtonPost' , uuid
) for uuid
in uuids
]