]>
 
 
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/washingtonpost.py 
 
 
 
 
 
 
 
 
   2  from  __future__ 
import  unicode_literals
 
   6  from  . common 
import  InfoExtractor
 
  13  class  WashingtonPostIE ( InfoExtractor
):  
  14      _VALID_URL 
=  r
'https?://(?:www\.)?washingtonpost\.com/.*?/(?P<id>[^/]+)/(?:$|[?#])'  
  16          'url' :  'http://www.washingtonpost.com/sf/national/2014/03/22/sinkhole-of-bureaucracy/' ,  
  18              'id' :  'sinkhole-of-bureaucracy' ,  
  19              'title' :  'Sinkhole of bureaucracy' ,  
  22              'md5' :  '79132cc09ec5309fa590ae46e4cc31bc' ,  
  24                  'id' :  'fc433c38-b146-11e3-b8b3-44b1d1cd4c1f' ,  
  26                  'title' :  'Breaking Points: The Paper Mine' ,  
  28                  'description' :  'Overly complicated paper pushing is nothing new to government bureaucracy. But the way federal retirement applications are filed may be the most outdated. David Fahrenthold explains.' ,  
  29                  'uploader' :  'The Washington Post' ,  
  30                  'timestamp' :  1395527908 ,  
  31                  'upload_date' :  '20140322' ,  
  34              'md5' :  'e1d5734c06865cc504ad99dc2de0d443' ,  
  36                  'id' :  '41255e28-b14a-11e3-b8b3-44b1d1cd4c1f' ,  
  38                  'title' :  'The town bureaucracy sustains' ,  
  39                  'description' :  'Underneath the friendly town of Boyers is a sea of government paperwork. In a disused limestone mine, hundreds of locals now track, file and process retirement applications for the federal government. We set out to find out what it \' s like to do paperwork 230 feet underground.' ,  
  41                  'timestamp' :  1395528005 ,  
  42                  'upload_date' :  '20140322' ,  
  43                  'uploader' :  'The Washington Post' ,  
  47          'url' :  'http://www.washingtonpost.com/blogs/wonkblog/wp/2014/12/31/one-airline-figured-out-how-to-make-sure-its-airplanes-never-disappear/' ,  
  49              'id' :  'one-airline-figured-out-how-to-make-sure-its-airplanes-never-disappear' ,  
  50              'title' :  'One airline figured out how to make sure its airplanes never disappear' ,  
  53              'md5' :  'a7c1b5634ba5e57a6a82cdffa5b1e0d0' ,  
  55                  'id' :  '0e4bb54c-9065-11e4-a66f-0ca5037a597d' ,  
  57                  'description' :  'Washington Post transportation reporter Ashley Halsey III explains why a plane \' s black box needs to be recovered from a crash site instead of having its information streamed in real time throughout the flight.' ,  
  58                  'upload_date' :  '20141230' ,  
  59                  'uploader' :  'The Washington Post' ,  
  60                  'timestamp' :  1419974765 ,  
  61                  'title' :  'Why black boxes don’t transmit data in real time' ,  
  66      def  _real_extract ( self
,  url
):  
  67          page_id 
=  self
._ match
_ id
( url
)  
  68          webpage 
=  self
._ download
_ webpage
( url
,  page_id
)  
  70          title 
=  self
._ og
_ search
_ title
( webpage
)  
  72          uuids 
=  re
. findall ( r
'''(?x)  
  74                  <div\s+class="posttv-video-embed[^>]*?data-uuid=|  
  76              )"([^"]+)"''' ,  webpage
)  
  78          for  i
,  uuid 
in  enumerate ( uuids
,  start
= 1 ):  
  79              vinfo_all 
=  self
._ download
_ json
(  
  80                  'http://www.washingtonpost.com/posttv/c/videojson/ %s ?resType=jsonp'  %  uuid
,  
  82                  transform_source
= strip_jsonp
,  
  83                  note
= 'Downloading information of video  %d / %d '  % ( i
,  len ( uuids
))  
  85              vinfo 
=  vinfo_all
[ 0 ][ 'contentConfig' ]  
  86              uploader 
=  vinfo
. get ( 'credits' , {}). get ( 'source' )  
  87              timestamp 
=  int_or_none (  
  88                  vinfo
. get ( 'dateConfig' , {}). get ( 'dateFirstPublished' ),  1000 )  
  92                      ' %s-%s-%s '  % ( s
. get ( 'type' ),  s
. get ( 'width' ),  s
. get ( 'bitrate' ))  
  95                  'vbr' :  s
. get ( 'bitrate' )  if  s
. get ( 'width' ) !=  0  else None ,  
  96                  'width' :  s
. get ( 'width' ),  
  97                  'height' :  s
. get ( 'height' ),  
  98                  'acodec' :  s
. get ( 'audioCodec' ),  
  99                  'vcodec' :  s
. get ( 'videoCodec' )  if  s
. get ( 'width' ) !=  0  else  'none' ,  
 100                  'filesize' :  s
. get ( 'fileSize' ),  
 103                  'preference' : - 100  if  s
. get ( 'type' ) ==  'smil'  else None ,  
 107                  }. get ( s
. get ( 'type' )),  
 108              }  for  s 
in  vinfo
. get ( 'streams' , [])]  
 109              source_media_url 
=  vinfo
. get ( 'sourceMediaURL' )  
 112                      'format_id' :  'source_media' ,  
 113                      'url' :  source_media_url
,  
 115              self
._ sort
_ formats
( formats
)  
 118                  'title' :  vinfo
[ 'title' ],  
 119                  'description' :  vinfo
. get ( 'blurb' ),  
 120                  'uploader' :  uploader
,  
 122                  'duration' :  int_or_none ( vinfo
. get ( 'videoDuration' ),  100 ),  
 123                  'timestamp' :  timestamp
,