]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/washingtonpost.py 
   2  from  __future__ 
import  unicode_literals
   6  from  . common 
import  InfoExtractor
  13  class  WashingtonPostIE ( InfoExtractor
):   14      IE_NAME 
=  'washingtonpost'   15      _VALID_URL 
=  r
'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/video/(?:[^/]+/)*)(?P<id>[\da-f] {8} -[\da-f] {4} -[\da-f] {4} -[\da-f] {4} -[\da-f] {12} )'   17          'url' :  'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d' ,   18          'md5' :  '6f537e1334b714eb15f9563bd4b9cdfa' ,   20              'id' :  '480ba4ee-1ec7-11e6-82c2-a7dcb313287d' ,   22              'title' :  'Egypt finds belongings, debris from plane crash' ,   23              'description' :  'md5:a17ceee432f215a5371388c1f680bd86' ,   24              'upload_date' :  '20160520' ,   25              'uploader' :  'Reuters' ,   26              'timestamp' :  1463778452 ,   30      def  _real_extract ( self
,  url
):   31          video_id 
=  self
._ match
_ id
( url
)   32          video_data 
=  self
._ download
_ json
(   33              'http://www.washingtonpost.com/posttv/c/videojson/ %s ?resType=jsonp'  %  video_id
,   34              video_id
,  transform_source
= strip_jsonp
)[ 0 ][ 'contentConfig' ]   35          title 
=  video_data
[ 'title' ]   39          for  s 
in  video_data
. get ( 'streams' , []):   41              if not  s_url 
or  s_url 
in  urls
:   44              video_type 
=  s
. get ( 'type' )   45              if  video_type 
==  'smil' :   47              elif  video_type 
in  ( 'ts' ,  'hls' )  and  ( '_master.m3u8'  in  s_url 
or  '_mobile.m3u8'  in  s_url
):   48                  m3u8_formats 
=  self
._ extract
_ m
3u8_ formats
(   49                      s_url
,  video_id
,  'mp4' ,  'm3u8_native' ,  m3u8_id
= 'hls' ,  fatal
= False )   50                  for  m3u8_format 
in  m3u8_formats
:   51                      width 
=  m3u8_format
. get ( 'width' )   54                      vbr 
=  self
._ search
_ regex
(   55                          r
' %d _ %d _(\d+)'  % ( width
,  m3u8_format
[ 'height' ]),  m3u8_format
[ 'url' ],  'vbr' ,  default
= None )   58                              'vbr' :  int_or_none ( vbr
),   60                  formats
. extend ( m3u8_formats
)   62                  width 
=  int_or_none ( s
. get ( 'width' ))   63                  vbr 
=  int_or_none ( s
. get ( 'bitrate' ))   64                  has_width 
=  width 
!=  0   67                          ' %s-%d-%d '  % ( video_type
,  width
,  vbr
)   70                      'vbr' :  vbr 
if  has_width 
else None ,   72                      'height' :  int_or_none ( s
. get ( 'height' )),   73                      'acodec' :  s
. get ( 'audioCodec' ),   74                      'vcodec' :  s
. get ( 'videoCodec' )  if  has_width 
else  'none' ,   75                      'filesize' :  int_or_none ( s
. get ( 'fileSize' )),   78                      'protocol' :  'm3u8_native'  if  video_type 
in  ( 'ts' ,  'hls' )  else None ,   80          source_media_url 
=  video_data
. get ( 'sourceMediaURL' )   83                  'format_id' :  'source_media' ,   84                  'url' :  source_media_url
,   87              formats
, ( 'width' ,  'height' ,  'vbr' ,  'filesize' ,  'tbr' ,  'format_id' ))   92              'description' :  video_data
. get ( 'blurb' ),   93              'uploader' :  video_data
. get ( 'credits' , {}). get ( 'source' ),   95              'duration' :  int_or_none ( video_data
. get ( 'videoDuration' ),  100 ),   96              'timestamp' :  int_or_none (   97                  video_data
. get ( 'dateConfig' , {}). get ( 'dateFirstPublished' ),  1000 ),  101  class  WashingtonPostArticleIE ( InfoExtractor
):  102      IE_NAME 
=  'washingtonpost:article'  103      _VALID_URL 
=  r
'https?://(?:www\.)?washingtonpost\.com/(?:[^/]+/)*(?P<id>[^/?#]+)'  105          'url' :  'http://www.washingtonpost.com/sf/national/2014/03/22/sinkhole-of-bureaucracy/' ,  107              'id' :  'sinkhole-of-bureaucracy' ,  108              'title' :  'Sinkhole of bureaucracy' ,  111              'md5' :  'b9be794ceb56c7267d410a13f99d801a' ,  113                  'id' :  'fc433c38-b146-11e3-b8b3-44b1d1cd4c1f' ,  115                  'title' :  'Breaking Points: The Paper Mine' ,  117                  'description' :  'Overly complicated paper pushing is nothing new to government bureaucracy. But the way federal retirement applications are filed may be the most outdated. David Fahrenthold explains.' ,  118                  'uploader' :  'The Washington Post' ,  119                  'timestamp' :  1395527908 ,  120                  'upload_date' :  '20140322' ,  123              'md5' :  '1fff6a689d8770966df78c8cb6c8c17c' ,  125                  'id' :  '41255e28-b14a-11e3-b8b3-44b1d1cd4c1f' ,  127                  'title' :  'The town bureaucracy sustains' ,  128                  'description' :  'Underneath the friendly town of Boyers is a sea of government paperwork. In a disused limestone mine, hundreds of locals now track, file and process retirement applications for the federal government. We set out to find out what it \' s like to do paperwork 230 feet underground.' ,  130                  'timestamp' :  1395528005 ,  131                  'upload_date' :  '20140322' ,  132                  'uploader' :  'The Washington Post' ,  136          'url' :  'http://www.washingtonpost.com/blogs/wonkblog/wp/2014/12/31/one-airline-figured-out-how-to-make-sure-its-airplanes-never-disappear/' ,  138              'id' :  'one-airline-figured-out-how-to-make-sure-its-airplanes-never-disappear' ,  139              'title' :  'One airline figured out how to make sure its airplanes never disappear' ,  142              'md5' :  'a7c1b5634ba5e57a6a82cdffa5b1e0d0' ,  144                  'id' :  '0e4bb54c-9065-11e4-a66f-0ca5037a597d' ,  146                  'description' :  'Washington Post transportation reporter Ashley Halsey III explains why a plane \' s black box needs to be recovered from a crash site instead of having its information streamed in real time throughout the flight.' ,  147                  'upload_date' :  '20141230' ,  148                  'uploader' :  'The Washington Post' ,  149                  'timestamp' :  1419974765 ,  150                  'title' :  'Why black boxes don’t transmit data in real time' ,  156      def  suitable ( cls
,  url
):  157          return False if  WashingtonPostIE
. suitable ( url
)  else  super ( WashingtonPostArticleIE
,  cls
). suitable ( url
)  159      def  _real_extract ( self
,  url
):  160          page_id 
=  self
._ match
_ id
( url
)  161          webpage 
=  self
._ download
_ webpage
( url
,  page_id
)  163          title 
=  self
._ og
_ search
_ title
( webpage
)  165          uuids 
=  re
. findall ( r
'''(?x)  167                  <div\s+class="posttv-video-embed[^>]*?data-uuid=|  169              )"([^"]+)"''' ,  webpage
)  170          entries 
= [ self
. url_result ( 'washingtonpost: %s '  %  uuid
,  'WashingtonPost' ,  uuid
)  for  uuid 
in  uuids
]