]>
 
 
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/washingtonpost.py 
 
 
 
 
 
 
 
 
   2  from  __future__ 
import  unicode_literals
 
   6  from  . common 
import  InfoExtractor
 
  13  class  WashingtonPostIE ( InfoExtractor
):  
  14      IE_NAME 
=  'washingtonpost'  
  15      _VALID_URL 
=  r
'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/video/(?:[^/]+/)*)(?P<id>[\da-f] {8} -[\da-f] {4} -[\da-f] {4} -[\da-f] {4} -[\da-f] {12} )'  
  17          'url' :  'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d' ,  
  18          'md5' :  '6f537e1334b714eb15f9563bd4b9cdfa' ,  
  20              'id' :  '480ba4ee-1ec7-11e6-82c2-a7dcb313287d' ,  
  22              'title' :  'Egypt finds belongings, debris from plane crash' ,  
  23              'description' :  'md5:a17ceee432f215a5371388c1f680bd86' ,  
  24              'upload_date' :  '20160520' ,  
  25              'uploader' :  'Reuters' ,  
  26              'timestamp' :  1463778452 ,  
  30      def  _real_extract ( self
,  url
):  
  31          video_id 
=  self
._ match
_ id
( url
)  
  32          video_data 
=  self
._ download
_ json
(  
  33              'http://www.washingtonpost.com/posttv/c/videojson/ %s ?resType=jsonp'  %  video_id
,  
  34              video_id
,  transform_source
= strip_jsonp
)[ 0 ][ 'contentConfig' ]  
  35          title 
=  video_data
[ 'title' ]  
  39          for  s 
in  video_data
. get ( 'streams' , []):  
  41              if not  s_url 
or  s_url 
in  urls
:  
  44              video_type 
=  s
. get ( 'type' )  
  45              if  video_type 
==  'smil' :  
  47              elif  video_type 
in  ( 'ts' ,  'hls' )  and  ( '_master.m3u8'  in  s_url 
or  '_mobile.m3u8'  in  s_url
):  
  48                  m3u8_formats 
=  self
._ extract
_ m
3u8_ formats
(  
  49                      s_url
,  video_id
,  'mp4' ,  'm3u8_native' ,  m3u8_id
= 'hls' ,  fatal
= False )  
  50                  for  m3u8_format 
in  m3u8_formats
:  
  51                      width 
=  m3u8_format
. get ( 'width' )  
  54                      vbr 
=  self
._ search
_ regex
(  
  55                          r
' %d _ %d _(\d+)'  % ( width
,  m3u8_format
[ 'height' ]),  m3u8_format
[ 'url' ],  'vbr' ,  default
= None )  
  58                              'vbr' :  int_or_none ( vbr
),  
  60                  formats
. extend ( m3u8_formats
)  
  62                  width 
=  int_or_none ( s
. get ( 'width' ))  
  63                  vbr 
=  int_or_none ( s
. get ( 'bitrate' ))  
  64                  has_width 
=  width 
!=  0  
  67                          ' %s-%d-%d '  % ( video_type
,  width
,  vbr
)  
  70                      'vbr' :  vbr 
if  has_width 
else None ,  
  72                      'height' :  int_or_none ( s
. get ( 'height' )),  
  73                      'acodec' :  s
. get ( 'audioCodec' ),  
  74                      'vcodec' :  s
. get ( 'videoCodec' )  if  has_width 
else  'none' ,  
  75                      'filesize' :  int_or_none ( s
. get ( 'fileSize' )),  
  78                      'protocol' :  'm3u8_native'  if  video_type 
in  ( 'ts' ,  'hls' )  else None ,  
  80          source_media_url 
=  video_data
. get ( 'sourceMediaURL' )  
  83                  'format_id' :  'source_media' ,  
  84                  'url' :  source_media_url
,  
  87              formats
, ( 'width' ,  'height' ,  'vbr' ,  'filesize' ,  'tbr' ,  'format_id' ))  
  92              'description' :  video_data
. get ( 'blurb' ),  
  93              'uploader' :  video_data
. get ( 'credits' , {}). get ( 'source' ),  
  95              'duration' :  int_or_none ( video_data
. get ( 'videoDuration' ),  100 ),  
  96              'timestamp' :  int_or_none (  
  97                  video_data
. get ( 'dateConfig' , {}). get ( 'dateFirstPublished' ),  1000 ),  
 101  class  WashingtonPostArticleIE ( InfoExtractor
):  
 102      IE_NAME 
=  'washingtonpost:article'  
 103      _VALID_URL 
=  r
'https?://(?:www\.)?washingtonpost\.com/(?:[^/]+/)*(?P<id>[^/?#]+)'  
 105          'url' :  'http://www.washingtonpost.com/sf/national/2014/03/22/sinkhole-of-bureaucracy/' ,  
 107              'id' :  'sinkhole-of-bureaucracy' ,  
 108              'title' :  'Sinkhole of bureaucracy' ,  
 111              'md5' :  'b9be794ceb56c7267d410a13f99d801a' ,  
 113                  'id' :  'fc433c38-b146-11e3-b8b3-44b1d1cd4c1f' ,  
 115                  'title' :  'Breaking Points: The Paper Mine' ,  
 117                  'description' :  'Overly complicated paper pushing is nothing new to government bureaucracy. But the way federal retirement applications are filed may be the most outdated. David Fahrenthold explains.' ,  
 118                  'uploader' :  'The Washington Post' ,  
 119                  'timestamp' :  1395527908 ,  
 120                  'upload_date' :  '20140322' ,  
 123              'md5' :  '1fff6a689d8770966df78c8cb6c8c17c' ,  
 125                  'id' :  '41255e28-b14a-11e3-b8b3-44b1d1cd4c1f' ,  
 127                  'title' :  'The town bureaucracy sustains' ,  
 128                  'description' :  'Underneath the friendly town of Boyers is a sea of government paperwork. In a disused limestone mine, hundreds of locals now track, file and process retirement applications for the federal government. We set out to find out what it \' s like to do paperwork 230 feet underground.' ,  
 130                  'timestamp' :  1395528005 ,  
 131                  'upload_date' :  '20140322' ,  
 132                  'uploader' :  'The Washington Post' ,  
 136          'url' :  'http://www.washingtonpost.com/blogs/wonkblog/wp/2014/12/31/one-airline-figured-out-how-to-make-sure-its-airplanes-never-disappear/' ,  
 138              'id' :  'one-airline-figured-out-how-to-make-sure-its-airplanes-never-disappear' ,  
 139              'title' :  'One airline figured out how to make sure its airplanes never disappear' ,  
 142              'md5' :  'a7c1b5634ba5e57a6a82cdffa5b1e0d0' ,  
 144                  'id' :  '0e4bb54c-9065-11e4-a66f-0ca5037a597d' ,  
 146                  'description' :  'Washington Post transportation reporter Ashley Halsey III explains why a plane \' s black box needs to be recovered from a crash site instead of having its information streamed in real time throughout the flight.' ,  
 147                  'upload_date' :  '20141230' ,  
 148                  'uploader' :  'The Washington Post' ,  
 149                  'timestamp' :  1419974765 ,  
 150                  'title' :  'Why black boxes don’t transmit data in real time' ,  
 156      def  suitable ( cls
,  url
):  
 157          return False if  WashingtonPostIE
. suitable ( url
)  else  super ( WashingtonPostArticleIE
,  cls
). suitable ( url
)  
 159      def  _real_extract ( self
,  url
):  
 160          page_id 
=  self
._ match
_ id
( url
)  
 161          webpage 
=  self
._ download
_ webpage
( url
,  page_id
)  
 163          title 
=  self
._ og
_ search
_ title
( webpage
)  
 165          uuids 
=  re
. findall ( r
'''(?x)  
 167                  <div\s+class="posttv-video-embed[^>]*?data-uuid=|  
 169              )"([^"]+)"''' ,  webpage
)  
 170          entries 
= [ self
. url_result ( 'washingtonpost: %s '  %  uuid
,  'WashingtonPost' ,  uuid
)  for  uuid 
in  uuids
]