]>
 
 
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/washingtonpost.py 
 
 
 
 
 
 
 
 
   2  from  __future__ 
import  unicode_literals
 
   6  from  . common 
import  InfoExtractor
 
  13  class  WashingtonPostIE ( InfoExtractor
):  
  14      IE_NAME 
=  'washingtonpost'  
  15      _VALID_URL 
=  r
'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/video/(?:[^/]+/)*)(?P<id>[\da-f] {8} -[\da-f] {4} -[\da-f] {4} -[\da-f] {4} -[\da-f] {12} )'  
  16      _EMBED_URL 
=  r
'https?://(?:www\.)?washingtonpost\.com/video/c/embed/[\da-f] {8} -[\da-f] {4} -[\da-f] {4} -[\da-f] {4} -[\da-f] {12} '  
  18          'url' :  'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d' ,  
  19          'md5' :  '6f537e1334b714eb15f9563bd4b9cdfa' ,  
  21              'id' :  '480ba4ee-1ec7-11e6-82c2-a7dcb313287d' ,  
  23              'title' :  'Egypt finds belongings, debris from plane crash' ,  
  24              'description' :  'md5:a17ceee432f215a5371388c1f680bd86' ,  
  25              'upload_date' :  '20160520' ,  
  26              'uploader' :  'Reuters' ,  
  27              'timestamp' :  1463778452 ,  
  32      def  _extract_urls ( cls
,  webpage
):  
  34              r
'<iframe[^>]+\bsrc=["\' ]( %s) ' % cls._EMBED_URL, webpage)  
  36      def _real_extract(self, url):  
  37          video_id = self._match_id(url)  
  38          video_data = self._download_json(  
  39              ' http
:// www
. washingtonpost
. com
/ posttv
/ c
/ videojson
/ %s ?resType
= jsonp
' % video_id,  
  40              video_id, transform_source=strip_jsonp)[0][' contentConfig
']  
  41          title = video_data[' title
']  
  45          for s in video_data.get(' streams
', []):  
  47              if not s_url or s_url in urls:  
  50              video_type = s.get(' type ')  
  51              if video_type == ' smil
':  
  53              elif video_type in (' ts
', ' hls
') and (' _master
. m3u8
' in s_url or ' _mobile
. m3u8
' in s_url):  
  54                  m3u8_formats = self._extract_m3u8_formats(  
  55                      s_url, video_id, ' mp4
', ' m3u8_native
', m3u8_id=' hls
', fatal=False)  
  56                  for m3u8_format in m3u8_formats:  
  57                      width = m3u8_format.get(' width
')  
  60                      vbr = self._search_regex(  
  61                          r' %d _ %d _ ( \d
+) ' % (width, m3u8_format[' height
']), m3u8_format[' url
'], ' vbr
', default=None)  
  64                              ' vbr
': int_or_none(vbr),  
  66                  formats.extend(m3u8_formats)  
  68                  width = int_or_none(s.get(' width
'))  
  69                  vbr = int_or_none(s.get(' bitrate
'))  
  70                  has_width = width != 0  
  73                          ' %s-%d-%d ' % (video_type, width, vbr)  
  76                      ' vbr
': vbr if has_width else None,  
  78                      ' height
': int_or_none(s.get(' height
')),  
  79                      ' acodec
': s.get(' audioCodec
'),  
  80                      ' vcodec
': s.get(' videoCodec
') if has_width else ' none
',  
  81                      ' filesize
': int_or_none(s.get(' fileSize
')),  
  84                      ' protocol
': ' m3u8_native
' if video_type in (' ts
', ' hls
') else None,  
  86          source_media_url = video_data.get(' sourceMediaURL
')  
  89                  ' format_id
': ' source_media
',  
  90                  ' url
': source_media_url,  
  93              formats, (' width
', ' height
', ' vbr
', ' filesize
', ' tbr
', ' format_id
'))  
  98              ' description
': video_data.get(' blurb
'),  
  99              ' uploader
': video_data.get(' credits
', {}).get(' source
'),  
 101              ' duration
': int_or_none(video_data.get(' videoDuration
'), 100),  
 102              ' timestamp
': int_or_none(  
 103                  video_data.get(' dateConfig
', {}).get(' dateFirstPublished
'), 1000),  
 107  class WashingtonPostArticleIE(InfoExtractor):  
 108      IE_NAME = ' washingtonpost
: article
'  
 109      _VALID_URL = r' https?
://( ?
: www\
.) ?washingtonpost\
. com
/( ?
:[ ^
/]+/)*( ?P
< id >[ ^
/ ?
#]+)'  
 111          'url' :  'http://www.washingtonpost.com/sf/national/2014/03/22/sinkhole-of-bureaucracy/' ,  
 113              'id' :  'sinkhole-of-bureaucracy' ,  
 114              'title' :  'Sinkhole of bureaucracy' ,  
 117              'md5' :  'b9be794ceb56c7267d410a13f99d801a' ,  
 119                  'id' :  'fc433c38-b146-11e3-b8b3-44b1d1cd4c1f' ,  
 121                  'title' :  'Breaking Points: The Paper Mine' ,  
 123                  'description' :  'Overly complicated paper pushing is nothing new to government bureaucracy. But the way federal retirement applications are filed may be the most outdated. David Fahrenthold explains.' ,  
 124                  'uploader' :  'The Washington Post' ,  
 125                  'timestamp' :  1395527908 ,  
 126                  'upload_date' :  '20140322' ,  
 129              'md5' :  '1fff6a689d8770966df78c8cb6c8c17c' ,  
 131                  'id' :  '41255e28-b14a-11e3-b8b3-44b1d1cd4c1f' ,  
 133                  'title' :  'The town bureaucracy sustains' ,  
 134                  'description' :  'Underneath the friendly town of Boyers is a sea of government paperwork. In a disused limestone mine, hundreds of locals now track, file and process retirement applications for the federal government. We set out to find out what it \' s like to do paperwork 230 feet underground.' ,  
 136                  'timestamp' :  1395528005 ,  
 137                  'upload_date' :  '20140322' ,  
 138                  'uploader' :  'The Washington Post' ,  
 142          'url' :  'http://www.washingtonpost.com/blogs/wonkblog/wp/2014/12/31/one-airline-figured-out-how-to-make-sure-its-airplanes-never-disappear/' ,  
 144              'id' :  'one-airline-figured-out-how-to-make-sure-its-airplanes-never-disappear' ,  
 145              'title' :  'One airline figured out how to make sure its airplanes never disappear' ,  
 148              'md5' :  'a7c1b5634ba5e57a6a82cdffa5b1e0d0' ,  
 150                  'id' :  '0e4bb54c-9065-11e4-a66f-0ca5037a597d' ,  
 152                  'description' :  'Washington Post transportation reporter Ashley Halsey III explains why a plane \' s black box needs to be recovered from a crash site instead of having its information streamed in real time throughout the flight.' ,  
 153                  'upload_date' :  '20141230' ,  
 154                  'uploader' :  'The Washington Post' ,  
 155                  'timestamp' :  1419974765 ,  
 156                  'title' :  'Why black boxes don’t transmit data in real time' ,  
 162      def  suitable ( cls
,  url
):  
 163          return False if  WashingtonPostIE
. suitable ( url
)  else  super ( WashingtonPostArticleIE
,  cls
). suitable ( url
)  
 165      def  _real_extract ( self
,  url
):  
 166          page_id 
=  self
._ match
_ id
( url
)  
 167          webpage 
=  self
._ download
_ webpage
( url
,  page_id
)  
 169          title 
=  self
._ og
_ search
_ title
( webpage
)  
 171          uuids 
=  re
. findall ( r
'''(?x)  
 173                  <div\s+class="posttv-video-embed[^>]*?data-uuid=|  
 175              )"([^"]+)"''' ,  webpage
)  
 176          entries 
= [ self
. url_result ( 'washingtonpost: %s '  %  uuid
,  'WashingtonPost' ,  uuid
)  for  uuid 
in  uuids
]