]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/tagesschau.py 
8670cee28d381de6011e3187db3024bcc40519de
   2  from  __future__ 
import  unicode_literals
   6  from  . common 
import  InfoExtractor
  15  class  TagesschauPlayerIE ( InfoExtractor
):   16      IE_NAME 
=  'tagesschau:player'   17      _VALID_URL 
=  r
'https?://(?:www\.)?tagesschau\.de/multimedia/(?P<kind>audio|video)/(?P=kind)-(?P<id>\d+)~player(?:_[^/?#&]+)?\.html'   20          'url' :  'http://www.tagesschau.de/multimedia/video/video-179517~player.html' ,   21          'md5' :  '8d09548d5c15debad38bee3a4d15ca21' ,   25              'title' :  'Marie Kristin Boese, ARD Berlin, über den zukünftigen Kurs der AfD' ,   26              'thumbnail' :  're:^https?:.*\.jpg$' ,   27              'formats' :  'mincount:6' ,   30          'url' :  'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html' ,   31          'md5' :  '76e6eec6ebd40740671cf0a2c88617e5' ,   35              'title' :  'Trabi - Bye, bye Rennpappe' ,   36              'thumbnail' :  're:^https?:.*\.jpg$' ,   37              'formats' :  'mincount:2' ,   40          'url' :  'http://www.tagesschau.de/multimedia/audio/audio-29417~player_autoplay-true.html' ,   41          'only_matching' :  True ,   46          's' : { 'width' :  320 ,  'height' :  180 ,  'quality' :  1 },   47          'm' : { 'width' :  512 ,  'height' :  288 ,  'quality' :  2 },   48          'l' : { 'width' :  960 ,  'height' :  540 ,  'quality' :  3 },   49          'xl' : { 'width' :  1280 ,  'height' :  720 ,  'quality' :  4 },   50          'xxl' : { 'quality' :  5 },   53      def  _extract_via_api ( self
,  kind
,  video_id
):   54          info 
=  self
._ download
_ json
(   55              'https://www.tagesschau.de/api/multimedia/ {0} / {0} - {1} .json' . format ( kind
,  video_id
),   57          title 
=  info
[ 'headline' ]   59          for  media 
in  info
[ 'mediadata' ]:   60              for  format_id
,  format_url 
in  media
. items ():   61                  if  determine_ext ( format_url
) ==  'm3u8' :   62                      formats
. extend ( self
._ extract
_ m
3u8_ formats
(   63                          format_url
,  video_id
,  'mp4' ,   64                          entry_protocol
= 'm3u8_native' ,  m3u8_id
= 'hls' ))   68                          'format_id' :  format_id
,   69                          'vcodec' :  'none'  if  kind 
==  'audio'  else None ,   71          self
._ sort
_ formats
( formats
)   72          timestamp 
=  parse_iso8601 ( info
. get ( 'date' ))   76              'timestamp' :  timestamp
,   80      def  _real_extract ( self
,  url
):   81          mobj 
=  re
. match ( self
._ VALID
_U RL
,  url
)   82          video_id 
=  mobj
. group ( 'id' )   84          # kind = mobj.group('kind').lower()   86          #     return self._extract_via_api(kind, video_id)   88          # JSON api does not provide some audio formats (e.g. ogg) thus   89          # extractiong audio via webpage   91          webpage 
=  self
._ download
_ webpage
( url
,  video_id
)   93          title 
=  self
._ og
_ search
_ title
( webpage
). strip ()   96          for  media_json 
in  re
. findall ( r
'({src\s*:\s*["\' ] http
[ ^
}]+ type \s
*:[ ^
}]+}) ', webpage):   97              media = self._parse_json(js_to_json(media_json), video_id, fatal=False)  100              src = media.get(' src
')  103              quality = media.get(' quality
')  104              kind = media.get(' type ', ' ').split(' / ')[0]  105              ext = determine_ext(src)  108                  ' format_id
': ' %s _ %s ' % (quality, ext) if quality else ext,  110                  ' vcodec
': ' none
' if kind == ' audio
' else None,  112              f.update(self._FORMATS.get(quality, {}))  115          self._sort_formats(formats)  117          thumbnail = self._og_search_thumbnail(webpage)  122              ' thumbnail
': thumbnail,  127  class TagesschauIE(InfoExtractor):  128      _VALID_URL = r' https?
://( ?
: www\
.) ?tagesschau\
. de
/( ?P
< path
>[ ^
/]+/( ?
:[ ^
/]+/)* ?
( ?P
< id >[ ^
/ #?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html'  131          'url' :  'http://www.tagesschau.de/multimedia/video/video-102143.html' ,  132          'md5' :  'f7c27a0eff3bfe8c7727e65f8fe1b1e6' ,  134              'id' :  'video-102143' ,  136              'title' :  'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt' ,  137              'description' :  '18.07.2015 20:10 Uhr' ,  138              'thumbnail' :  're:^https?:.*\.jpg$' ,  141          'url' :  'http://www.tagesschau.de/multimedia/sendung/ts-5727.html' ,  142          'md5' :  '3c54c1f6243d279b706bde660ceec633' ,  146              'title' :  'Sendung: tagesschau  \t 04.12.2014 20:00 Uhr' ,  147              'description' :  'md5:695c01bfd98b7e313c501386327aea59' ,  148              'thumbnail' :  're:^https?:.*\.jpg$' ,  152          'url' :  'http://www.tagesschau.de/multimedia/audio/audio-29417.html' ,  153          'md5' :  '76e6eec6ebd40740671cf0a2c88617e5' ,  157              'title' :  'Trabi - Bye, bye Rennpappe' ,  158              'description' :  'md5:8687dda862cbbe2cfb2df09b56341317' ,  159              'thumbnail' :  're:^https?:.*\.jpg$' ,  163          'url' :  'http://www.tagesschau.de/inland/bnd-303.html' ,  164          'md5' :  'e0916c623e85fc1d2b26b78f299d3958' ,  168              'title' :  'Viele Baustellen für neuen BND-Chef' ,  169              'description' :  'md5:1e69a54be3e1255b2b07cdbce5bcd8b4' ,  170              'thumbnail' :  're:^https?:.*\.jpg$' ,  173          'url' :  'http://www.tagesschau.de/inland/afd-parteitag-135.html' ,  175              'id' :  'afd-parteitag-135' ,  176              'title' :  'Möchtegern-Underdog mit Machtanspruch' ,  180          'url' :  'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html' ,  181          'only_matching' :  True ,  183          'url' :  'http://www.tagesschau.de/multimedia/sendung/tt-3827.html' ,  184          'only_matching' :  True ,  186          'url' :  'http://www.tagesschau.de/multimedia/sendung/nm-3475.html' ,  187          'only_matching' :  True ,  189          'url' :  'http://www.tagesschau.de/multimedia/sendung/weltspiegel-3167.html' ,  190          'only_matching' :  True ,  192          'url' :  'http://www.tagesschau.de/multimedia/tsvorzwanzig-959.html' ,  193          'only_matching' :  True ,  195          'url' :  'http://www.tagesschau.de/multimedia/sendung/bab/bab-3299~_bab-sendung-209.html' ,  196          'only_matching' :  True ,  198          'url' :  'http://www.tagesschau.de/multimedia/video/video-102303~_bab-sendung-211.html' ,  199          'only_matching' :  True ,  201          'url' :  'http://www.tagesschau.de/100sekunden/index.html' ,  202          'only_matching' :  True ,  204          # playlist article with collapsing sections  205          'url' :  'http://www.tagesschau.de/wirtschaft/faq-freihandelszone-eu-usa-101.html' ,  206          'only_matching' :  True ,  210      def  suitable ( cls
,  url
):  211          return False if  TagesschauPlayerIE
. suitable ( url
)  else  super ( TagesschauIE
,  cls
). suitable ( url
)  213      def  _extract_formats ( self
,  download_text
,  media_kind
):  215              r
'<div class="button" title="(?P<title>[^"]*)"><a href="(?P<url>[^"]+)">(?P<name>.+?)</a></div>' ,  219              link_url 
=  l
. group ( 'url' )  222              format_id 
=  self
._ search
_ regex
(  223                  r
'.*/[^/.]+\.([^/]+)\.[^/.]+$' ,  link_url
,  'format ID' ,  224                  default
= determine_ext ( link_url
))  226                  'format_id' :  format_id
,  227                  'url' :  l
. group ( 'url' ),  228                  'format_name' :  l
. group ( 'name' ),  230              title 
=  l
. group ( 'title' )  232                  if  media_kind
. lower () ==  'video' :  235                              Video:\s*(?P<vcodec>[a-zA-Z0-9/._-]+)\s*&\#10;  236                              (?P<width>[0-9]+)x(?P<height>[0-9]+)px&\#10;  237                              (?P<vbr>[0-9]+)kbps&\#10;  238                              Audio:\s*(?P<abr>[0-9]+)kbps,\s*(?P<audio_desc>[A-Za-z\.0-9]+)&\#10;  239                              Größe:\s*(?P<filesize_approx>[0-9.,]+\s+[a-zA-Z]*B)''' ,  243                              'format_note' :  m
. group ( 'audio_desc' ),  244                              'vcodec' :  m
. group ( 'vcodec' ),  245                              'width' :  int ( m
. group ( 'width' )),  246                              'height' :  int ( m
. group ( 'height' )),  247                              'abr' :  int ( m
. group ( 'abr' )),  248                              'vbr' :  int ( m
. group ( 'vbr' )),  249                              'filesize_approx' :  parse_filesize ( m
. group ( 'filesize_approx' )),  253                          r
'(?P<format>.+?)-Format\s*:\s*(?P<abr>\d+)kbps\s*,\s*(?P<note>.+)' ,  257                              'format_note' :  ' %s ,  %s '  % ( m
. group ( 'format' ),  m
. group ( 'note' )),  259                              'abr' :  int ( m
. group ( 'abr' )),  261              formats
. append ( format
)  262          self
._ sort
_ formats
( formats
)  265      def  _real_extract ( self
,  url
):  266          mobj 
=  re
. match ( self
._ VALID
_U RL
,  url
)  267          video_id 
=  mobj
. group ( 'id' )  or  mobj
. group ( 'path' )  268          display_id 
=  video_id
. lstrip ( '-' )  270          webpage 
=  self
._ download
_ webpage
( url
,  display_id
)  272          title 
=  self
._ html
_ search
_ regex
(  273              r
'<span[^>]*class="headline"[^>]*>(.+?)</span>' ,  274              webpage
,  'title' ,  default
= None )  or  self
._ og
_ search
_ title
( webpage
)  276          DOWNLOAD_REGEX 
=  r
'(?s)<p>Wir bieten dieses (?P<kind>Video|Audio) in folgenden Formaten zum Download an:</p>\s*<div class="controls">(?P<links>.*?)</div>\s*<p>'  278          webpage_type 
=  self
._ og
_ search
_ property
( 'type' ,  webpage
,  default
= None )  279          if  webpage_type 
==  'website' :   # Article  281              for  num
, ( entry_title
,  media_kind
,  download_text
)  in  enumerate ( re
. findall (  282                      r
'(?s)<p[^>]+class="infotext"[^>]*>\s*(?:<a[^>]+>)?\s*<strong>(.+?)</strong>.*?</p>.*? %s '  %  DOWNLOAD_REGEX
,  285                      'id' :  ' %s-%d '  % ( display_id
,  num
),  286                      'title' :  ' %s '  %  entry_title
,  287                      'formats' :  self
._ extract
_ formats
( download_text
,  media_kind
),  290                  return  self
. playlist_result ( entries
,  display_id
,  title
)  291              formats 
=  entries
[ 0 ][ 'formats' ]  292          else :   # Assume single video  293              download_text 
=  self
._ search
_ regex
(  294                  DOWNLOAD_REGEX
,  webpage
,  'download links' ,  group
= 'links' )  295              media_kind 
=  self
._ search
_ regex
(  296                  DOWNLOAD_REGEX
,  webpage
,  'media kind' ,  default
= 'Video' ,  group
= 'kind' )  297              formats 
=  self
._ extract
_ formats
( download_text
,  media_kind
)  298          thumbnail 
=  self
._ og
_ search
_ thumbnail
( webpage
)  299          description 
=  self
._ html
_ search
_ regex
(  300              r
'(?s)<p class="teasertext">(.*?)</p>' ,  301              webpage
,  'description' ,  default
= None )  303          self
._ sort
_ formats
( formats
)  308              'thumbnail' :  thumbnail
,  310              'description' :  description
,