]>
 
 
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/pbs.py 
 
 
 
 
 
 
 
 
   2  from  __future__ 
import  unicode_literals
 
   6  from  . common 
import  InfoExtractor
 
  17  class  PBSIE ( InfoExtractor
):  
  18      _VALID_URL 
=  r
'''(?x)https?://  
  21             video\.pbs\.org/(?:viralplayer|video)/(?P<id>[0-9]+)/? |  
  22             # Article with embedded player (or direct video)  
  23             (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) |  
  25             video\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/  
  31              'url' :  'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/' ,  
  32              'md5' :  'ce1888486f0908d555a8093cac9a7362' ,  
  36                  'title' :  'Constitution USA with Peter Sagal - A More Perfect Union' ,  
  37                  'description' :  'md5:ba0c207295339c8d6eced00b7c363c6a' ,  
  41                  'skip_download' :  True ,   # requires ffmpeg  
  45              'url' :  'http://www.pbs.org/wgbh/pages/frontline/losing-iraq/' ,  
  46              'md5' :  '143c98aa54a346738a3d78f54c925321' ,  
  50                  'title' :  'FRONTLINE - Losing Iraq' ,  
  51                  'description' :  'md5:f5bfbefadf421e8bb8647602011caf8e' ,  
  55                  'skip_download' :  True ,   # requires ffmpeg  
  59              'url' :  'http://www.pbs.org/newshour/bb/education-jan-june12-cyberschools_02-23/' ,  
  60              'md5' :  'b19856d7f5351b17a5ab1dc6a64be633' ,  
  64                  'title' :  'PBS NewsHour - Cyber Schools Gain Popularity, but Quality Questions Persist' ,  
  65                  'description' :  'md5:5871c15cba347c1b3d28ac47a73c7c28' ,  
  70              'url' :  'http://www.pbs.org/wnet/gperf/dudamel-conducts-verdi-requiem-hollywood-bowl-full-episode/3374/' ,  
  71              'md5' :  'c62859342be2a0358d6c9eb306595978' ,  
  75                  'description' :  'md5:68d87ef760660eb564455eb30ca464fe' ,  
  76                  'title' :  'Great Performances - Dudamel Conducts Verdi Requiem at the Hollywood Bowl - Full' ,  
  78                  'thumbnail' :  're:^https?://.*\.jpg$' ,  
  81                  'skip_download' :  True ,   # requires ffmpeg  
  85              'url' :  'http://www.pbs.org/wgbh/nova/earth/killer-typhoon.html' ,  
  86              'md5' :  '908f3e5473a693b266b84e25e1cf9703' ,  
  89                  'display_id' :  'killer-typhoon' ,  
  91                  'description' :  'md5:c741d14e979fc53228c575894094f157' ,  
  92                  'title' :  'NOVA - Killer Typhoon' ,  
  94                  'thumbnail' :  're:^https?://.*\.jpg$' ,  
  95                  'upload_date' :  '20140122' ,  
  99                  'skip_download' :  True ,   # requires ffmpeg  
 103              'url' :  'http://www.pbs.org/wgbh/pages/frontline/united-states-of-secrets/' ,  
 105                  'id' :  'united-states-of-secrets' ,  
 110              'url' :  'http://www.pbs.org/wgbh/americanexperience/films/death/player/' ,  
 113                  'display_id' :  'player' ,  
 115                  'title' :  'American Experience - Death and the Civil War, Chapter 1' ,  
 116                  'description' :  'American Experience, TV’s most-watched history series, brings to life the compelling stories from our past that inform our understanding of the world today.' ,  
 118                  'thumbnail' :  're:^https?://.*\.jpg$' ,  
 121                  'skip_download' :  True ,   # requires ffmpeg  
 125              'url' :  'http://video.pbs.org/video/2365367186/' ,  
 128                  'display_id' :  '2365367186' ,  
 130                  'title' :  'To Catch A Comet - Full Episode' ,  
 131                  'description' :  'On November 12, 2014, billions of kilometers from Earth, spacecraft orbiter Rosetta and lander Philae did what no other had dared to attempt \u2014 land on the volatile surface of a comet as it zooms around the sun at 67,000 km/hr. The European Space Agency hopes this mission can help peer into our past and unlock secrets of our origins.' ,  
 133                  'thumbnail' :  're:^https?://.*\.jpg$' ,  
 136                  'skip_download' :  True ,   # requires ffmpeg  
 141              # Video embedded in iframe containing angle brackets as attribute's value (e.g.  
 142              # "<iframe style='position: absolute;<br />\ntop: 0; left: 0;' ...", see  
 143              # https://github.com/rg3/youtube-dl/issues/7059)  
 144              'url' :  'http://www.pbs.org/food/features/a-chefs-life-season-3-episode-5-prickly-business/' ,  
 147                  'display_id' :  'a-chefs-life-season-3-episode-5-prickly-business' ,  
 149                  'title' :  "A Chef's Life - Season 3, Ep. 5: Prickly Business" ,  
 150                  'description' :  'md5:61db2ddf27c9912f09c241014b118ed1' ,  
 152                  'thumbnail' :  're:^https?://.*\.jpg$' ,  
 155                  'skip_download' :  True ,   # requires ffmpeg  
 159              # Frontline video embedded via flp2012.js  
 160              'url' :  'http://www.pbs.org/wgbh/pages/frontline/the-atomic-artists' ,  
 163                  'display_id' :  'the-atomic-artists' ,  
 165                  'title' :  'FRONTLINE - The Atomic Artists' ,  
 166                  'description' :  'md5:f5bfbefadf421e8bb8647602011caf8e' ,  
 168                  'thumbnail' :  're:^https?://.*\.jpg$' ,  
 171                  'skip_download' :  True ,   # requires ffmpeg  
 176          101 :  'We \' re sorry, but this video is not yet available.' ,  
 177          403 :  'We \' re sorry, but this video is not available in your region due to right restrictions.' ,  
 178          404 :  'We are experiencing technical difficulties that are preventing us from playing the video at this time. Please check back again soon.' ,  
 179          410 :  'This video has expired and is no longer available for online streaming.' ,  
 182      def  _extract_webpage ( self
,  url
):  
 183          mobj 
=  re
. match ( self
._ VALID
_U RL
,  url
)  
 185          presumptive_id 
=  mobj
. group ( 'presumptive_id' )  
 186          display_id 
=  presumptive_id
 
 188              webpage 
=  self
._ download
_ webpage
( url
,  display_id
)  
 190              upload_date 
=  unified_strdate ( self
._ search
_ regex
(  
 191                  r
'<input type="hidden" id="air_date_[0-9]+" value="([^"]+)"' ,  
 192                  webpage
,  'upload date' ,  default
= None ))  
 194              # tabbed frontline videos  
 195              tabbed_videos 
=  re
. findall (  
 196                  r
'<div[^>]+class="videotab[^"]*"[^>]+vid="(\d+)"' ,  webpage
)  
 198                  return  tabbed_videos
,  presumptive_id
,  upload_date
 
 201                  r
"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'" ,   # frontline video embed  
 202                  r
'class="coveplayerid">([^<]+)<' ,                        # coveplayer  
 203                  r
'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>' ,   # jwplayer  
 206              media_id 
=  self
._ search
_ regex
(  
 207                  MEDIA_ID_REGEXES
,  webpage
,  'media ID' ,  fatal
= False ,  default
= None )  
 209                  return  media_id
,  presumptive_id
,  upload_date
 
 211              # Fronline video embedded via flp  
 212              video_id 
=  self
._ search
_ regex
(  
 213                  r
'videoid\s*:\s*"([\d+a-z]{7,})"' ,  webpage
,  'videoid' ,  default
= None )  
 215                  # pkg_id calculation is reverse engineered from  
 216                  # http://www.pbs.org/wgbh/pages/frontline/js/flp2012.js  
 217                  prg_id 
=  self
._ search
_ regex
(  
 218                      r
'videoid\s*:\s*"([\d+a-z]{7,})"' ,  webpage
,  'videoid' )[ 7 :]  
 220                      prg_id 
=  prg_id
. split ( 'q' )[ 1 ]  
 221                  prg_id 
=  int ( prg_id
,  16 )  
 222                  getdir 
=  self
._ download
_ json
(  
 223                      'http://www.pbs.org/wgbh/pages/frontline/.json/getdir/getdir %d .json'  %  prg_id
,  
 224                      presumptive_id
,  'Downloading getdir JSON' ,  
 225                      transform_source
= strip_jsonp
)  
 226                  return  getdir
[ 'mid' ],  presumptive_id
,  upload_date
 
 228              for  iframe 
in  re
. findall ( r
'(?s)<iframe(.+?)></iframe>' ,  webpage
):  
 229                  url 
=  self
._ search
_ regex
(  
 230                      r
'src=(["\' ])( ?P
< url
>.+ ?partnerplayer
.+ ?
) \
1 ', iframe,  
 231                      ' player URL
', default=None, group=' url
')  
 235              mobj = re.match(self._VALID_URL, url)  
 237          player_id = mobj.group(' player_id
')  
 239              display_id = player_id  
 241              player_page = self._download_webpage(  
 242                  url, display_id, note=' Downloading player page
',  
 243                  errnote=' Could 
not  download player page
')  
 244              video_id = self._search_regex(  
 245                  r' < div\s
+ id = "video_([0-9]+)" ', player_page, ' video ID
')  
 247              video_id = mobj.group(' id ')  
 248              display_id = video_id  
 250          return video_id, display_id, None  
 252      def _real_extract(self, url):  
 253          video_id, display_id, upload_date = self._extract_webpage(url)  
 255          if isinstance(video_id, list):  
 256              entries = [self.url_result(  
 257                  ' http
:// video
. pbs
. org
/ video
/ %s ' % vid_id, ' PBS
', vid_id)  
 258                  for vid_id in video_id]  
 259              return self.playlist_result(entries, display_id)  
 261          info = self._download_json(  
 262              ' http
:// video
. pbs
. org
/ videoInfo
/ %s ?format
= json
& type = partner
' % video_id,  
 266          for encoding_name in (' recommended_encoding
', ' alternate_encoding
'):  
 267              redirect = info.get(encoding_name)  
 270              redirect_url = redirect.get(' url
')  
 274              redirect_info = self._download_json(  
 275                  redirect_url + ' ?format
= json
', display_id,  
 276                  ' Downloading 
%s  video url info
' % encoding_name)  
 278              if redirect_info[' status
'] == ' error
':  
 279                  raise ExtractorError(  
 282                          self._ERRORS.get(redirect_info[' http_code
'], redirect_info[' message
'])),  
 285              format_url = redirect_info.get(' url
')  
 289              if determine_ext(format_url) == ' m3u8
':  
 290                  formats.extend(self._extract_m3u8_formats(  
 291                      format_url, display_id, ' mp4
', preference=1, m3u8_id=' hls
'))  
 295                      ' format_id
': redirect.get(' eeid
'),  
 297          self._sort_formats(formats)  
 299          rating_str = info.get(' rating
')  
 300          if rating_str is not None:  
 301              rating_str = rating_str.rpartition(' - ')[2]  
 302          age_limit = US_RATINGS.get(rating_str)  
 305          closed_captions_url = info.get(' closed_captions_url
')  
 306          if closed_captions_url:  
 309                  ' url
': closed_captions_url,  
 312          # info[' title
'] is often incomplete (e.g. ' Full Episode
', ' Episode 
5 ', etc)  
 313          # Try turning it to ' program 
-  title
' naming scheme if possible  
 314          alt_title = info.get(' program
', {}).get(' title
')  
 316              info[' title
'] = alt_title + '  -  ' + re.sub(r' ^
' + alt_title + ' [ \s\
-:]+ ', ' ', info[' title
'])  
 320              ' display_id
': display_id,  
 321              ' title
': info[' title
'],  
 322              ' description
': info[' program
'].get(' description
'),  
 323              ' thumbnail
': info.get(' image_url
'),  
 324              ' duration
': int_or_none(info.get(' duration
')),  
 325              ' age_limit
': age_limit,  
 326              ' upload_date
': upload_date,  
 328              ' subtitles
': subtitles,