]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/pbs.py 
8fb9b1849cfd96e5ce21ef2ffcffba200f8ba482
   2  from  __future__ 
import  unicode_literals
   6  from  . common 
import  InfoExtractor
  17  class  PBSIE ( InfoExtractor
):   18      _VALID_URL 
=  r
'''(?x)https?://   21             video\.pbs\.org/(?:viralplayer|video)/(?P<id>[0-9]+)/? |   22             # Article with embedded player (or direct video)   23             (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) |   25             video\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/   31              'url' :  'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/' ,   32              'md5' :  'ce1888486f0908d555a8093cac9a7362' ,   36                  'title' :  'Constitution USA with Peter Sagal - A More Perfect Union' ,   37                  'description' :  'md5:ba0c207295339c8d6eced00b7c363c6a' ,   41                  'skip_download' :  True ,   # requires ffmpeg   45              'url' :  'http://www.pbs.org/wgbh/pages/frontline/losing-iraq/' ,   46              'md5' :  '143c98aa54a346738a3d78f54c925321' ,   50                  'title' :  'FRONTLINE - Losing Iraq' ,   51                  'description' :  'md5:f5bfbefadf421e8bb8647602011caf8e' ,   55                  'skip_download' :  True ,   # requires ffmpeg   59              'url' :  'http://www.pbs.org/newshour/bb/education-jan-june12-cyberschools_02-23/' ,   60              'md5' :  'b19856d7f5351b17a5ab1dc6a64be633' ,   64                  'title' :  'PBS NewsHour - Cyber Schools Gain Popularity, but Quality Questions Persist' ,   65                  'description' :  'md5:5871c15cba347c1b3d28ac47a73c7c28' ,   70              'url' :  'http://www.pbs.org/wnet/gperf/dudamel-conducts-verdi-requiem-hollywood-bowl-full-episode/3374/' ,   71              'md5' :  'c62859342be2a0358d6c9eb306595978' ,   75                  'description' :  'md5:68d87ef760660eb564455eb30ca464fe' ,   76                  'title' :  'Great Performances - Dudamel Conducts Verdi Requiem at the Hollywood Bowl - Full' ,   78                  'thumbnail' :  're:^https?://.*\.jpg$' ,   81                  'skip_download' :  True ,   # requires ffmpeg   85              'url' :  'http://www.pbs.org/wgbh/nova/earth/killer-typhoon.html' ,   86              'md5' :  '908f3e5473a693b266b84e25e1cf9703' ,   89                  'display_id' :  'killer-typhoon' ,   91                  'description' :  'md5:c741d14e979fc53228c575894094f157' ,   92                  'title' :  'NOVA - Killer Typhoon' ,   94                  'thumbnail' :  're:^https?://.*\.jpg$' ,   95                  'upload_date' :  '20140122' ,   99                  'skip_download' :  True ,   # requires ffmpeg  103              'url' :  'http://www.pbs.org/wgbh/pages/frontline/united-states-of-secrets/' ,  105                  'id' :  'united-states-of-secrets' ,  110              'url' :  'http://www.pbs.org/wgbh/americanexperience/films/death/player/' ,  113                  'display_id' :  'player' ,  115                  'title' :  'American Experience - Death and the Civil War, Chapter 1' ,  116                  'description' :  'American Experience, TV’s most-watched history series, brings to life the compelling stories from our past that inform our understanding of the world today.' ,  118                  'thumbnail' :  're:^https?://.*\.jpg$' ,  121                  'skip_download' :  True ,   # requires ffmpeg  125              'url' :  'http://video.pbs.org/video/2365367186/' ,  128                  'display_id' :  '2365367186' ,  130                  'title' :  'To Catch A Comet - Full Episode' ,  131                  'description' :  'On November 12, 2014, billions of kilometers from Earth, spacecraft orbiter Rosetta and lander Philae did what no other had dared to attempt \u2014 land on the volatile surface of a comet as it zooms around the sun at 67,000 km/hr. The European Space Agency hopes this mission can help peer into our past and unlock secrets of our origins.' ,  133                  'thumbnail' :  're:^https?://.*\.jpg$' ,  136                  'skip_download' :  True ,   # requires ffmpeg  141              # Video embedded in iframe containing angle brackets as attribute's value (e.g.  142              # "<iframe style='position: absolute;<br />\ntop: 0; left: 0;' ...", see  143              # https://github.com/rg3/youtube-dl/issues/7059)  144              'url' :  'http://www.pbs.org/food/features/a-chefs-life-season-3-episode-5-prickly-business/' ,  147                  'display_id' :  'a-chefs-life-season-3-episode-5-prickly-business' ,  149                  'title' :  "A Chef's Life - Season 3, Ep. 5: Prickly Business" ,  150                  'description' :  'md5:61db2ddf27c9912f09c241014b118ed1' ,  152                  'thumbnail' :  're:^https?://.*\.jpg$' ,  155                  'skip_download' :  True ,   # requires ffmpeg  159              # Frontline video embedded via flp2012.js  160              'url' :  'http://www.pbs.org/wgbh/pages/frontline/the-atomic-artists' ,  163                  'display_id' :  'the-atomic-artists' ,  165                  'title' :  'FRONTLINE - The Atomic Artists' ,  166                  'description' :  'md5:f5bfbefadf421e8bb8647602011caf8e' ,  168                  'thumbnail' :  're:^https?://.*\.jpg$' ,  171                  'skip_download' :  True ,   # requires ffmpeg  176          101 :  'We \' re sorry, but this video is not yet available.' ,  177          403 :  'We \' re sorry, but this video is not available in your region due to right restrictions.' ,  178          404 :  'We are experiencing technical difficulties that are preventing us from playing the video at this time. Please check back again soon.' ,  179          410 :  'This video has expired and is no longer available for online streaming.' ,  182      def  _extract_webpage ( self
,  url
):  183          mobj 
=  re
. match ( self
._ VALID
_U RL
,  url
)  185          presumptive_id 
=  mobj
. group ( 'presumptive_id' )  186          display_id 
=  presumptive_id
 188              webpage 
=  self
._ download
_ webpage
( url
,  display_id
)  190              upload_date 
=  unified_strdate ( self
._ search
_ regex
(  191                  r
'<input type="hidden" id="air_date_[0-9]+" value="([^"]+)"' ,  192                  webpage
,  'upload date' ,  default
= None ))  194              # tabbed frontline videos  195              tabbed_videos 
=  re
. findall (  196                  r
'<div[^>]+class="videotab[^"]*"[^>]+vid="(\d+)"' ,  webpage
)  198                  return  tabbed_videos
,  presumptive_id
,  upload_date
 201                  r
"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'" ,   # frontline video embed  202                  r
'class="coveplayerid">([^<]+)<' ,                        # coveplayer  203                  r
'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>' ,   # jwplayer  206              media_id 
=  self
._ search
_ regex
(  207                  MEDIA_ID_REGEXES
,  webpage
,  'media ID' ,  fatal
= False ,  default
= None )  209                  return  media_id
,  presumptive_id
,  upload_date
 211              # Fronline video embedded via flp  212              video_id 
=  self
._ search
_ regex
(  213                  r
'videoid\s*:\s*"([\d+a-z]{7,})"' ,  webpage
,  'videoid' ,  default
= None )  215                  # pkg_id calculation is reverse engineered from  216                  # http://www.pbs.org/wgbh/pages/frontline/js/flp2012.js  217                  prg_id 
=  self
._ search
_ regex
(  218                      r
'videoid\s*:\s*"([\d+a-z]{7,})"' ,  webpage
,  'videoid' )[ 7 :]  220                      prg_id 
=  prg_id
. split ( 'q' )[ 1 ]  221                  prg_id 
=  int ( prg_id
,  16 )  222                  getdir 
=  self
._ download
_ json
(  223                      'http://www.pbs.org/wgbh/pages/frontline/.json/getdir/getdir %d .json'  %  prg_id
,  224                      presumptive_id
,  'Downloading getdir JSON' ,  225                      transform_source
= strip_jsonp
)  226                  return  getdir
[ 'mid' ],  presumptive_id
,  upload_date
 228              for  iframe 
in  re
. findall ( r
'(?s)<iframe(.+?)></iframe>' ,  webpage
):  229                  url 
=  self
._ search
_ regex
(  230                      r
'src=(["\' ])( ?P
< url
>.+ ?partnerplayer
.+ ?
) \
1 ', iframe,  231                      ' player URL
', default=None, group=' url
')  235              mobj = re.match(self._VALID_URL, url)  237          player_id = mobj.group(' player_id
')  239              display_id = player_id  241              player_page = self._download_webpage(  242                  url, display_id, note=' Downloading player page
',  243                  errnote=' Could 
not  download player page
')  244              video_id = self._search_regex(  245                  r' < div\s
+ id = "video_([0-9]+)" ', player_page, ' video ID
')  247              video_id = mobj.group(' id ')  248              display_id = video_id  250          return video_id, display_id, None  252      def _real_extract(self, url):  253          video_id, display_id, upload_date = self._extract_webpage(url)  255          if isinstance(video_id, list):  256              entries = [self.url_result(  257                  ' http
:// video
. pbs
. org
/ video
/ %s ' % vid_id, ' PBS
', vid_id)  258                  for vid_id in video_id]  259              return self.playlist_result(entries, display_id)  261          info = self._download_json(  262              ' http
:// video
. pbs
. org
/ videoInfo
/ %s ?format
= json
& type = partner
' % video_id,  266          for encoding_name in (' recommended_encoding
', ' alternate_encoding
'):  267              redirect = info.get(encoding_name)  270              redirect_url = redirect.get(' url
')  274              redirect_info = self._download_json(  275                  redirect_url + ' ?format
= json
', display_id,  276                  ' Downloading 
%s  video url info
' % encoding_name)  278              if redirect_info[' status
'] == ' error
':  279                  raise ExtractorError(  282                          self._ERRORS.get(redirect_info[' http_code
'], redirect_info[' message
'])),  285              format_url = redirect_info.get(' url
')  289              if determine_ext(format_url) == ' m3u8
':  290                  formats.extend(self._extract_m3u8_formats(  291                      format_url, display_id, ' mp4
', preference=1, m3u8_id=' hls
'))  295                      ' format_id
': redirect.get(' eeid
'),  297          self._sort_formats(formats)  299          rating_str = info.get(' rating
')  300          if rating_str is not None:  301              rating_str = rating_str.rpartition(' - ')[2]  302          age_limit = US_RATINGS.get(rating_str)  305          closed_captions_url = info.get(' closed_captions_url
')  306          if closed_captions_url:  309                  ' url
': closed_captions_url,  312          # info[' title
'] is often incomplete (e.g. ' Full Episode
', ' Episode 
5 ', etc)  313          # Try turning it to ' program 
-  title
' naming scheme if possible  314          alt_title = info.get(' program
', {}).get(' title
')  316              info[' title
'] = alt_title + '  -  ' + re.sub(r' ^
' + alt_title + ' [ \s\
-:]+ ', ' ', info[' title
'])  320              ' display_id
': display_id,  321              ' title
': info[' title
'],  322              ' description
': info[' program
'].get(' description
'),  323              ' thumbnail
': info.get(' image_url
'),  324              ' duration
': int_or_none(info.get(' duration
')),  325              ' age_limit
': age_limit,  326              ' upload_date
': upload_date,  328              ' subtitles
': subtitles,