]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/washingtonpost.py
2 from __future__
import unicode_literals
6 from . common
import InfoExtractor
13 class WashingtonPostIE ( InfoExtractor
):
14 IE_NAME
= 'washingtonpost'
15 _VALID_URL
= r
'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/video/(?:[^/]+/)*)(?P<id>[\da-f] {8} -[\da-f] {4} -[\da-f] {4} -[\da-f] {4} -[\da-f] {12} )'
17 'url' : 'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d' ,
18 'md5' : '6f537e1334b714eb15f9563bd4b9cdfa' ,
20 'id' : '480ba4ee-1ec7-11e6-82c2-a7dcb313287d' ,
22 'title' : 'Egypt finds belongings, debris from plane crash' ,
23 'description' : 'md5:a17ceee432f215a5371388c1f680bd86' ,
24 'upload_date' : '20160520' ,
25 'uploader' : 'Reuters' ,
26 'timestamp' : 1463778452 ,
30 def _real_extract ( self
, url
):
31 video_id
= self
._ match
_ id
( url
)
32 video_data
= self
._ download
_ json
(
33 'http://www.washingtonpost.com/posttv/c/videojson/ %s ?resType=jsonp' % video_id
,
34 video_id
, transform_source
= strip_jsonp
)[ 0 ][ 'contentConfig' ]
35 title
= video_data
[ 'title' ]
39 for s
in video_data
. get ( 'streams' , []):
41 if not s_url
or s_url
in urls
:
44 video_type
= s
. get ( 'type' )
45 if video_type
== 'smil' :
47 elif video_type
in ( 'ts' , 'hls' ) and ( '_master.m3u8' in s_url
or '_mobile.m3u8' in s_url
):
48 m3u8_formats
= self
._ extract
_ m
3u8_ formats
(
49 s_url
, video_id
, 'mp4' , 'm3u8_native' , m3u8_id
= 'hls' , fatal
= False )
50 for m3u8_format
in m3u8_formats
:
51 width
= m3u8_format
. get ( 'width' )
54 vbr
= self
._ search
_ regex
(
55 r
' %d _ %d _(\d+)' % ( width
, m3u8_format
[ 'height' ]), m3u8_format
[ 'url' ], 'vbr' , default
= None )
58 'vbr' : int_or_none ( vbr
),
60 formats
. extend ( m3u8_formats
)
62 width
= int_or_none ( s
. get ( 'width' ))
63 vbr
= int_or_none ( s
. get ( 'bitrate' ))
64 has_width
= width
!= 0
67 ' %s-%d-%d ' % ( video_type
, width
, vbr
)
70 'vbr' : vbr
if has_width
else None ,
72 'height' : int_or_none ( s
. get ( 'height' )),
73 'acodec' : s
. get ( 'audioCodec' ),
74 'vcodec' : s
. get ( 'videoCodec' ) if has_width
else 'none' ,
75 'filesize' : int_or_none ( s
. get ( 'fileSize' )),
78 'protocol' : 'm3u8_native' if video_type
in ( 'ts' , 'hls' ) else None ,
80 source_media_url
= video_data
. get ( 'sourceMediaURL' )
83 'format_id' : 'source_media' ,
84 'url' : source_media_url
,
87 formats
, ( 'width' , 'height' , 'vbr' , 'filesize' , 'tbr' , 'format_id' ))
92 'description' : video_data
. get ( 'blurb' ),
93 'uploader' : video_data
. get ( 'credits' , {}). get ( 'source' ),
95 'duration' : int_or_none ( video_data
. get ( 'videoDuration' ), 100 ),
96 'timestamp' : int_or_none (
97 video_data
. get ( 'dateConfig' , {}). get ( 'dateFirstPublished' ), 1000 ),
101 class WashingtonPostArticleIE ( InfoExtractor
):
102 IE_NAME
= 'washingtonpost:article'
103 _VALID_URL
= r
'https?://(?:www\.)?washingtonpost\.com/(?:[^/]+/)*(?P<id>[^/?#]+)'
105 'url' : 'http://www.washingtonpost.com/sf/national/2014/03/22/sinkhole-of-bureaucracy/' ,
107 'id' : 'sinkhole-of-bureaucracy' ,
108 'title' : 'Sinkhole of bureaucracy' ,
111 'md5' : 'b9be794ceb56c7267d410a13f99d801a' ,
113 'id' : 'fc433c38-b146-11e3-b8b3-44b1d1cd4c1f' ,
115 'title' : 'Breaking Points: The Paper Mine' ,
117 'description' : 'Overly complicated paper pushing is nothing new to government bureaucracy. But the way federal retirement applications are filed may be the most outdated. David Fahrenthold explains.' ,
118 'uploader' : 'The Washington Post' ,
119 'timestamp' : 1395527908 ,
120 'upload_date' : '20140322' ,
123 'md5' : '1fff6a689d8770966df78c8cb6c8c17c' ,
125 'id' : '41255e28-b14a-11e3-b8b3-44b1d1cd4c1f' ,
127 'title' : 'The town bureaucracy sustains' ,
128 'description' : 'Underneath the friendly town of Boyers is a sea of government paperwork. In a disused limestone mine, hundreds of locals now track, file and process retirement applications for the federal government. We set out to find out what it \' s like to do paperwork 230 feet underground.' ,
130 'timestamp' : 1395528005 ,
131 'upload_date' : '20140322' ,
132 'uploader' : 'The Washington Post' ,
136 'url' : 'http://www.washingtonpost.com/blogs/wonkblog/wp/2014/12/31/one-airline-figured-out-how-to-make-sure-its-airplanes-never-disappear/' ,
138 'id' : 'one-airline-figured-out-how-to-make-sure-its-airplanes-never-disappear' ,
139 'title' : 'One airline figured out how to make sure its airplanes never disappear' ,
142 'md5' : 'a7c1b5634ba5e57a6a82cdffa5b1e0d0' ,
144 'id' : '0e4bb54c-9065-11e4-a66f-0ca5037a597d' ,
146 'description' : 'Washington Post transportation reporter Ashley Halsey III explains why a plane \' s black box needs to be recovered from a crash site instead of having its information streamed in real time throughout the flight.' ,
147 'upload_date' : '20141230' ,
148 'uploader' : 'The Washington Post' ,
149 'timestamp' : 1419974765 ,
150 'title' : 'Why black boxes don’t transmit data in real time' ,
156 def suitable ( cls
, url
):
157 return False if WashingtonPostIE
. suitable ( url
) else super ( WashingtonPostArticleIE
, cls
). suitable ( url
)
159 def _real_extract ( self
, url
):
160 page_id
= self
._ match
_ id
( url
)
161 webpage
= self
._ download
_ webpage
( url
, page_id
)
163 title
= self
._ og
_ search
_ title
( webpage
)
165 uuids
= re
. findall ( r
'''(?x)
167 <div\s+class="posttv-video-embed[^>]*?data-uuid=|
169 )"([^"]+)"''' , webpage
)
170 entries
= [ self
. url_result ( 'washingtonpost: %s ' % uuid
, 'WashingtonPost' , uuid
) for uuid
in uuids
]