]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/washingtonpost.py 
   2  from  __future__ 
import  unicode_literals
   6  from  . common 
import  InfoExtractor
  13  class  WashingtonPostIE ( InfoExtractor
):   14      IE_NAME 
=  'washingtonpost'   15      _VALID_URL 
=  r
'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/video/(?:[^/]+/)*)(?P<id>[\da-f] {8} -[\da-f] {4} -[\da-f] {4} -[\da-f] {4} -[\da-f] {12} )'   16      _EMBED_URL 
=  r
'https?://(?:www\.)?washingtonpost\.com/video/c/embed/[\da-f] {8} -[\da-f] {4} -[\da-f] {4} -[\da-f] {4} -[\da-f] {12} '   18          'url' :  'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d' ,   19          'md5' :  '6f537e1334b714eb15f9563bd4b9cdfa' ,   21              'id' :  '480ba4ee-1ec7-11e6-82c2-a7dcb313287d' ,   23              'title' :  'Egypt finds belongings, debris from plane crash' ,   24              'description' :  'md5:a17ceee432f215a5371388c1f680bd86' ,   25              'upload_date' :  '20160520' ,   26              'uploader' :  'Reuters' ,   27              'timestamp' :  1463778452 ,   32      def  _extract_urls ( cls
,  webpage
):   34              r
'<iframe[^>]+\bsrc=["\' ]( %s) ' % cls._EMBED_URL, webpage)   36      def _real_extract(self, url):   37          video_id = self._match_id(url)   38          video_data = self._download_json(   39              ' http
:// www
. washingtonpost
. com
/ posttv
/ c
/ videojson
/ %s ?resType
= jsonp
' % video_id,   40              video_id, transform_source=strip_jsonp)[0][' contentConfig
']   41          title = video_data[' title
']   45          for s in video_data.get(' streams
', []):   47              if not s_url or s_url in urls:   50              video_type = s.get(' type ')   51              if video_type == ' smil
':   53              elif video_type in (' ts
', ' hls
') and (' _master
. m3u8
' in s_url or ' _mobile
. m3u8
' in s_url):   54                  m3u8_formats = self._extract_m3u8_formats(   55                      s_url, video_id, ' mp4
', ' m3u8_native
', m3u8_id=' hls
', fatal=False)   56                  for m3u8_format in m3u8_formats:   57                      width = m3u8_format.get(' width
')   60                      vbr = self._search_regex(   61                          r' %d _ %d _ ( \d
+) ' % (width, m3u8_format[' height
']), m3u8_format[' url
'], ' vbr
', default=None)   64                              ' vbr
': int_or_none(vbr),   66                  formats.extend(m3u8_formats)   68                  width = int_or_none(s.get(' width
'))   69                  vbr = int_or_none(s.get(' bitrate
'))   70                  has_width = width != 0   73                          ' %s-%d-%d ' % (video_type, width, vbr)   76                      ' vbr
': vbr if has_width else None,   78                      ' height
': int_or_none(s.get(' height
')),   79                      ' acodec
': s.get(' audioCodec
'),   80                      ' vcodec
': s.get(' videoCodec
') if has_width else ' none
',   81                      ' filesize
': int_or_none(s.get(' fileSize
')),   84                      ' protocol
': ' m3u8_native
' if video_type in (' ts
', ' hls
') else None,   86          source_media_url = video_data.get(' sourceMediaURL
')   89                  ' format_id
': ' source_media
',   90                  ' url
': source_media_url,   93              formats, (' width
', ' height
', ' vbr
', ' filesize
', ' tbr
', ' format_id
'))   98              ' description
': video_data.get(' blurb
'),   99              ' uploader
': video_data.get(' credits
', {}).get(' source
'),  101              ' duration
': int_or_none(video_data.get(' videoDuration
'), 100),  102              ' timestamp
': int_or_none(  103                  video_data.get(' dateConfig
', {}).get(' dateFirstPublished
'), 1000),  107  class WashingtonPostArticleIE(InfoExtractor):  108      IE_NAME = ' washingtonpost
: article
'  109      _VALID_URL = r' https?
://( ?
: www\
.) ?washingtonpost\
. com
/( ?
:[ ^
/]+/)*( ?P
< id >[ ^
/ ?
#]+)'  111          'url' :  'http://www.washingtonpost.com/sf/national/2014/03/22/sinkhole-of-bureaucracy/' ,  113              'id' :  'sinkhole-of-bureaucracy' ,  114              'title' :  'Sinkhole of bureaucracy' ,  117              'md5' :  'b9be794ceb56c7267d410a13f99d801a' ,  119                  'id' :  'fc433c38-b146-11e3-b8b3-44b1d1cd4c1f' ,  121                  'title' :  'Breaking Points: The Paper Mine' ,  123                  'description' :  'Overly complicated paper pushing is nothing new to government bureaucracy. But the way federal retirement applications are filed may be the most outdated. David Fahrenthold explains.' ,  124                  'uploader' :  'The Washington Post' ,  125                  'timestamp' :  1395527908 ,  126                  'upload_date' :  '20140322' ,  129              'md5' :  '1fff6a689d8770966df78c8cb6c8c17c' ,  131                  'id' :  '41255e28-b14a-11e3-b8b3-44b1d1cd4c1f' ,  133                  'title' :  'The town bureaucracy sustains' ,  134                  'description' :  'Underneath the friendly town of Boyers is a sea of government paperwork. In a disused limestone mine, hundreds of locals now track, file and process retirement applications for the federal government. We set out to find out what it \' s like to do paperwork 230 feet underground.' ,  136                  'timestamp' :  1395528005 ,  137                  'upload_date' :  '20140322' ,  138                  'uploader' :  'The Washington Post' ,  142          'url' :  'http://www.washingtonpost.com/blogs/wonkblog/wp/2014/12/31/one-airline-figured-out-how-to-make-sure-its-airplanes-never-disappear/' ,  144              'id' :  'one-airline-figured-out-how-to-make-sure-its-airplanes-never-disappear' ,  145              'title' :  'One airline figured out how to make sure its airplanes never disappear' ,  148              'md5' :  'a7c1b5634ba5e57a6a82cdffa5b1e0d0' ,  150                  'id' :  '0e4bb54c-9065-11e4-a66f-0ca5037a597d' ,  152                  'description' :  'Washington Post transportation reporter Ashley Halsey III explains why a plane \' s black box needs to be recovered from a crash site instead of having its information streamed in real time throughout the flight.' ,  153                  'upload_date' :  '20141230' ,  154                  'uploader' :  'The Washington Post' ,  155                  'timestamp' :  1419974765 ,  156                  'title' :  'Why black boxes don’t transmit data in real time' ,  162      def  suitable ( cls
,  url
):  163          return False if  WashingtonPostIE
. suitable ( url
)  else  super ( WashingtonPostArticleIE
,  cls
). suitable ( url
)  165      def  _real_extract ( self
,  url
):  166          page_id 
=  self
._ match
_ id
( url
)  167          webpage 
=  self
._ download
_ webpage
( url
,  page_id
)  169          title 
=  self
._ og
_ search
_ title
( webpage
)  171          uuids 
=  re
. findall ( r
'''(?x)  173                  <div\s+class="posttv-video-embed[^>]*?data-uuid=|  175              )"([^"]+)"''' ,  webpage
)  176          entries 
= [ self
. url_result ( 'washingtonpost: %s '  %  uuid
,  'WashingtonPost' ,  uuid
)  for  uuid 
in  uuids
]