]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/ard.py 
   2  from  __future__ 
import  unicode_literals
   6  from  . common 
import  InfoExtractor
   7  from  . generic 
import  GenericIE
  11      get_element_by_attribute
,   18  from  .. compat 
import  compat_etree_fromstring
  21  class  ARDMediathekIE ( InfoExtractor
):   22      IE_NAME 
=  'ARD:mediathek'   23      _VALID_URL 
=  r
'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'   26          'url' :  'http://www.ardmediathek.de/tv/Dokumentation-und-Reportage/Ich-liebe-das-Leben-trotzdem/rbb-Fernsehen/Video?documentId=29582122&bcastId=3822114' ,   30              'title' :  'Ich liebe das Leben trotzdem' ,   31              'description' :  'md5:45e4c225c72b27993314b31a84a5261c' ,   36              'skip_download' :  True ,   39          'url' :  'http://www.ardmediathek.de/tv/Tatort/Tatort-Scheinwelten-H%C3%B6rfassung-Video/Das-Erste/Video?documentId=29522730&bcastId=602916' ,   40          'md5' :  'f4d98b10759ac06c0072bbcd1f0b9e3e' ,   44              'title' :  'Tatort: Scheinwelten - Hörfassung (Video tgl. ab 20 Uhr)' ,   45              'description' :  'md5:196392e79876d0ac94c94e8cdb2875f1' ,   50          'url' :  'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086' ,   51          'md5' :  '219d94d8980b4f538c7fcb0865eb7f2c' ,   55              'title' :  'Tod eines Fußballers' ,   56              'description' :  'md5:f6e39f3461f0e1f54bfa48c8875c86ef' ,   60          'url' :  'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht' ,   61          'only_matching' :  True ,   64      def  _extract_media_info ( self
,  media_info_url
,  webpage
,  video_id
):   65          media_info 
=  self
._ download
_ json
(   66              media_info_url
,  video_id
,  'Downloading media JSON' )   68          formats 
=  self
._ extract
_ formats
( media_info
,  video_id
)   71              if  '"fsk"'  in  webpage
:   73                      'This video is only available after 20:00' ,  expected
= True )   74              elif  media_info
. get ( '_geoblocked' ):   75                  raise  ExtractorError ( 'This video is not available due to geo restriction' ,  expected
= True )   77          self
._ sort
_ formats
( formats
)   79          duration 
=  int_or_none ( media_info
. get ( '_duration' ))   80          thumbnail 
=  media_info
. get ( '_previewImage' )   83          subtitle_url 
=  media_info
. get ( '_subtitleUrl' )   93              'thumbnail' :  thumbnail
,   95              'subtitles' :  subtitles
,   98      def  _extract_formats ( self
,  media_info
,  video_id
):   99          type_ 
=  media_info
. get ( '_type' )  100          media_array 
=  media_info
. get ( '_mediaArray' , [])  102          for  num
,  media 
in  enumerate ( media_array
):  103              for  stream 
in  media
. get ( '_mediaStreamArray' , []):  104                  stream_urls 
=  stream
. get ( '_stream' )  107                  if not  isinstance ( stream_urls
,  list ):  108                      stream_urls 
= [ stream_urls
]  109                  quality 
=  stream
. get ( '_quality' )  110                  server 
=  stream
. get ( '_server' )  111                  for  stream_url 
in  stream_urls
:  112                      ext 
=  determine_ext ( stream_url
)  113                      if  quality 
!=  'auto'  and  ext 
in  ( 'f4m' ,  'm3u8' ):  116                          formats
. extend ( self
._ extract
_ f
4 m
_ formats
(  117                              stream_url 
+  '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124' ,  118                              video_id
,  preference
=- 1 ,  f4m_id
= 'hds' ,  fatal
= False ))  120                          formats
. extend ( self
._ extract
_ m
3u8_ formats
(  121                              stream_url
,  video_id
,  'mp4' ,  preference
= 1 ,  m3u8_id
= 'hls' ,  fatal
= False ))  123                          if  server 
and  server
. startswith ( 'rtmp' ):  126                                  'play_path' :  stream_url
,  127                                  'format_id' :  'a %s-r tmp- %s '  % ( num
,  quality
),  129                          elif  stream_url
. startswith ( 'http' ):  132                                  'format_id' :  'a %s-%s-%s '  % ( num
,  ext
,  quality
)  136                          m 
=  re
. search ( r
'_(?P<width>\d+)x(?P<height>\d+)\.mp4$' ,  stream_url
)  139                                  'width' :  int ( m
. group ( 'width' )),  140                                  'height' :  int ( m
. group ( 'height' )),  147      def  _real_extract ( self
,  url
):  148          # determine video id from url  149          m 
=  re
. match ( self
._ VALID
_U RL
,  url
)  151          numid 
=  re
. search ( r
'documentId=([0-9]+)' ,  url
)  153              video_id 
=  numid
. group ( 1 )  155              video_id 
=  m
. group ( 'video_id' )  157          webpage 
=  self
._ download
_ webpage
( url
,  video_id
)  159          if  '>Der gewünschte Beitrag ist nicht mehr verfügbar.<'  in  webpage
:  160              raise  ExtractorError ( 'Video  %s  is no longer available'  %  video_id
,  expected
= True )  162          if  'Diese Sendung ist für Jugendliche unter 12 Jahren nicht geeignet. Der Clip ist deshalb nur von 20 bis 6 Uhr verfügbar.'  in  webpage
:  163              raise  ExtractorError ( 'This program is only suitable for those aged 12 and older. Video  %s  is therefore only available between 20 pm and 6 am.'  %  video_id
,  expected
= True )  165          if  re
. search ( r
'[\?&]rss($|[=&])' ,  url
):  166              doc 
=  compat_etree_fromstring ( webpage
. encode ( 'utf-8' ))  168                  return  GenericIE () ._ extract
_ rss
( url
,  video_id
,  doc
)  170          title 
=  self
._ html
_ search
_ regex
(  171              [ r
'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>' ,  172               r
'<meta name="dcterms.title" content="(.*?)"/>' ,  173               r
'<h4 class="headline">(.*?)</h4>' ],  175          description 
=  self
._ html
_ search
_ meta
(  176              'dcterms.abstract' ,  webpage
,  'description' ,  default
= None )  177          if  description 
is None :  178              description 
=  self
._ html
_ search
_ meta
(  179                  'description' ,  webpage
,  'meta description' )  181          # Thumbnail is sometimes not present.  182          # It is in the mobile version, but that seems to use a different URL  183          # structure altogether.  184          thumbnail 
=  self
._ og
_ search
_ thumbnail
( webpage
,  default
= None )  186          media_streams 
=  re
. findall ( r
'''(?x)  187              mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s*  188              "([^"]+)"''' ,  webpage
)  191              QUALITIES 
=  qualities ([ 'lo' ,  'hi' ,  'hq' ])  193              for  furl 
in  set ( media_streams
):  194                  if  furl
. endswith ( '.f4m' ):  197                      fid_m 
=  re
. match ( r
'.*\.([^.]+)\.[^.]+$' ,  furl
)  198                      fid 
=  fid_m
. group ( 1 )  if  fid_m 
else None  200                      'quality' :  QUALITIES ( fid
),  204              self
._ sort
_ formats
( formats
)  208          else :   # request JSON file  209              info 
=  self
._ extract
_ media
_ info
(  210                  'http://www.ardmediathek.de/play/media/ %s '  %  video_id
,  webpage
,  video_id
)  215              'description' :  description
,  216              'thumbnail' :  thumbnail
,  222  class  ARDIE ( InfoExtractor
):  223      _VALID_URL 
=  '(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html'  225          'url' :  'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html' ,  226          'md5' :  'd216c3a86493f9322545e045ddc3eb35' ,  228              'display_id' :  'die-story-im-ersten-mission-unter-falscher-flagge' ,  232              'title' :  'Die Story im Ersten: Mission unter falscher Flagge' ,  233              'upload_date' :  '20140804' ,  234              'thumbnail' :  're:^https?://.*\.jpg$' ,  238      def  _real_extract ( self
,  url
):  239          mobj 
=  re
. match ( self
._ VALID
_U RL
,  url
)  240          display_id 
=  mobj
. group ( 'display_id' )  242          player_url 
=  mobj
. group ( 'mainurl' ) +  '~playerXml.xml'  243          doc 
=  self
._ download
_ xml
( player_url
,  display_id
)  244          video_node 
=  doc
. find ( './video' )  245          upload_date 
=  unified_strdate ( xpath_text (  246              video_node
,  './broadcastDate' ))  247          thumbnail 
=  xpath_text ( video_node
,  './/teaserImage//variant/url' )  250          for  a 
in  video_node
. findall ( './/asset' ):  252                  'format_id' :  a
. attrib
[ 'type' ],  253                  'width' :  int_or_none ( a
. find ( './frameWidth' ). text
),  254                  'height' :  int_or_none ( a
. find ( './frameHeight' ). text
),  255                  'vbr' :  int_or_none ( a
. find ( './bitrateVideo' ). text
),  256                  'abr' :  int_or_none ( a
. find ( './bitrateAudio' ). text
),  257                  'vcodec' :  a
. find ( './codecVideo' ). text
,  258                  'tbr' :  int_or_none ( a
. find ( './totalBitrate' ). text
),  260              if  a
. find ( './serverPrefix' ). text
:  261                  f
[ 'url' ] =  a
. find ( './serverPrefix' ). text
 262                  f
[ 'playpath' ] =  a
. find ( './fileName' ). text
 264                  f
[ 'url' ] =  a
. find ( './fileName' ). text
 266          self
._ sort
_ formats
( formats
)  269              'id' :  mobj
. group ( 'id' ),  271              'display_id' :  display_id
,  272              'title' :  video_node
. find ( './title' ). text
,  273              'duration' :  parse_duration ( video_node
. find ( './duration' ). text
),  274              'upload_date' :  upload_date
,  275              'thumbnail' :  thumbnail
,  279  class  SportschauIE ( ARDMediathekIE
):  280      IE_NAME 
=  'Sportschau'  281      _VALID_URL 
=  r
'(?P<baseurl>https?://(?:www\.)?sportschau\.de/(?:[^/]+/)+video(?P<id>[^/#?]+))\.html'  283          'url' :  'http://www.sportschau.de/tourdefrance/videoseppeltkokainhatnichtsmitklassischemdopingzutun100.html' ,  285              'id' :  'seppeltkokainhatnichtsmitklassischemdopingzutun100' ,  287              'title' :  'Seppelt: "Kokain hat nichts mit klassischem Doping zu tun"' ,  288              'thumbnail' :  're:^https?://.*\.jpg$' ,  289              'description' :  'Der ARD-Doping Experte Hajo Seppelt gibt seine Einschätzung zum ersten Dopingfall der diesjährigen Tour de France um den Italiener Luca Paolini ab.' ,  293              'skip_download' :  True ,  297      def  _real_extract ( self
,  url
):  298          mobj 
=  re
. match ( self
._ VALID
_U RL
,  url
)  299          video_id 
=  mobj
. group ( 'id' )  300          base_url 
=  mobj
. group ( 'baseurl' )  302          webpage 
=  self
._ download
_ webpage
( url
,  video_id
)  303          title 
=  get_element_by_attribute ( 'class' ,  'headline' ,  webpage
)  304          description 
=  self
._ html
_ search
_ meta
( 'description' ,  webpage
,  'description' )  306          info 
=  self
._ extract
_ media
_ info
(  307              base_url 
+  '-mc_defaultQuality-h.json' ,  webpage
,  video_id
)  311              'description' :  description
,