]>
 
 
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/ard.py 
 
 
 
 
 
 
 
 
   2  from  __future__ 
import  unicode_literals
 
   6  from  . common 
import  InfoExtractor
 
   7  from  . generic 
import  GenericIE
 
  11      get_element_by_attribute
,  
  18  from  .. compat 
import  compat_etree_fromstring
 
  21  class  ARDMediathekIE ( InfoExtractor
):  
  22      IE_NAME 
=  'ARD:mediathek'  
  23      _VALID_URL 
=  r
'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'  
  26          'url' :  'http://www.ardmediathek.de/tv/Dokumentation-und-Reportage/Ich-liebe-das-Leben-trotzdem/rbb-Fernsehen/Video?documentId=29582122&bcastId=3822114' ,  
  30              'title' :  'Ich liebe das Leben trotzdem' ,  
  31              'description' :  'md5:45e4c225c72b27993314b31a84a5261c' ,  
  36              'skip_download' :  True ,  
  39          'url' :  'http://www.ardmediathek.de/tv/Tatort/Tatort-Scheinwelten-H%C3%B6rfassung-Video/Das-Erste/Video?documentId=29522730&bcastId=602916' ,  
  40          'md5' :  'f4d98b10759ac06c0072bbcd1f0b9e3e' ,  
  44              'title' :  'Tatort: Scheinwelten - Hörfassung (Video tgl. ab 20 Uhr)' ,  
  45              'description' :  'md5:196392e79876d0ac94c94e8cdb2875f1' ,  
  50          'url' :  'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086' ,  
  51          'md5' :  '219d94d8980b4f538c7fcb0865eb7f2c' ,  
  55              'title' :  'Tod eines Fußballers' ,  
  56              'description' :  'md5:f6e39f3461f0e1f54bfa48c8875c86ef' ,  
  60          'url' :  'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht' ,  
  61          'only_matching' :  True ,  
  64      def  _extract_media_info ( self
,  media_info_url
,  webpage
,  video_id
):  
  65          media_info 
=  self
._ download
_ json
(  
  66              media_info_url
,  video_id
,  'Downloading media JSON' )  
  68          formats 
=  self
._ extract
_ formats
( media_info
,  video_id
)  
  71              if  '"fsk"'  in  webpage
:  
  73                      'This video is only available after 20:00' ,  expected
= True )  
  74              elif  media_info
. get ( '_geoblocked' ):  
  75                  raise  ExtractorError ( 'This video is not available due to geo restriction' ,  expected
= True )  
  77          self
._ sort
_ formats
( formats
)  
  79          duration 
=  int_or_none ( media_info
. get ( '_duration' ))  
  80          thumbnail 
=  media_info
. get ( '_previewImage' )  
  83          subtitle_url 
=  media_info
. get ( '_subtitleUrl' )  
  93              'thumbnail' :  thumbnail
,  
  95              'subtitles' :  subtitles
,  
  98      def  _extract_formats ( self
,  media_info
,  video_id
):  
  99          type_ 
=  media_info
. get ( '_type' )  
 100          media_array 
=  media_info
. get ( '_mediaArray' , [])  
 102          for  num
,  media 
in  enumerate ( media_array
):  
 103              for  stream 
in  media
. get ( '_mediaStreamArray' , []):  
 104                  stream_urls 
=  stream
. get ( '_stream' )  
 107                  if not  isinstance ( stream_urls
,  list ):  
 108                      stream_urls 
= [ stream_urls
]  
 109                  quality 
=  stream
. get ( '_quality' )  
 110                  server 
=  stream
. get ( '_server' )  
 111                  for  stream_url 
in  stream_urls
:  
 112                      ext 
=  determine_ext ( stream_url
)  
 113                      if  quality 
!=  'auto'  and  ext 
in  ( 'f4m' ,  'm3u8' ):  
 116                          formats
. extend ( self
._ extract
_ f
4 m
_ formats
(  
 117                              stream_url 
+  '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124' ,  
 118                              video_id
,  preference
=- 1 ,  f4m_id
= 'hds' ,  fatal
= False ))  
 120                          formats
. extend ( self
._ extract
_ m
3u8_ formats
(  
 121                              stream_url
,  video_id
,  'mp4' ,  preference
= 1 ,  m3u8_id
= 'hls' ,  fatal
= False ))  
 123                          if  server 
and  server
. startswith ( 'rtmp' ):  
 126                                  'play_path' :  stream_url
,  
 127                                  'format_id' :  'a %s-r tmp- %s '  % ( num
,  quality
),  
 129                          elif  stream_url
. startswith ( 'http' ):  
 132                                  'format_id' :  'a %s-%s-%s '  % ( num
,  ext
,  quality
)  
 136                          m 
=  re
. search ( r
'_(?P<width>\d+)x(?P<height>\d+)\.mp4$' ,  stream_url
)  
 139                                  'width' :  int ( m
. group ( 'width' )),  
 140                                  'height' :  int ( m
. group ( 'height' )),  
 147      def  _real_extract ( self
,  url
):  
 148          # determine video id from url  
 149          m 
=  re
. match ( self
._ VALID
_U RL
,  url
)  
 151          numid 
=  re
. search ( r
'documentId=([0-9]+)' ,  url
)  
 153              video_id 
=  numid
. group ( 1 )  
 155              video_id 
=  m
. group ( 'video_id' )  
 157          webpage 
=  self
._ download
_ webpage
( url
,  video_id
)  
 159          if  '>Der gewünschte Beitrag ist nicht mehr verfügbar.<'  in  webpage
:  
 160              raise  ExtractorError ( 'Video  %s  is no longer available'  %  video_id
,  expected
= True )  
 162          if  'Diese Sendung ist für Jugendliche unter 12 Jahren nicht geeignet. Der Clip ist deshalb nur von 20 bis 6 Uhr verfügbar.'  in  webpage
:  
 163              raise  ExtractorError ( 'This program is only suitable for those aged 12 and older. Video  %s  is therefore only available between 20 pm and 6 am.'  %  video_id
,  expected
= True )  
 165          if  re
. search ( r
'[\?&]rss($|[=&])' ,  url
):  
 166              doc 
=  compat_etree_fromstring ( webpage
. encode ( 'utf-8' ))  
 168                  return  GenericIE () ._ extract
_ rss
( url
,  video_id
,  doc
)  
 170          title 
=  self
._ html
_ search
_ regex
(  
 171              [ r
'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>' ,  
 172               r
'<meta name="dcterms.title" content="(.*?)"/>' ,  
 173               r
'<h4 class="headline">(.*?)</h4>' ],  
 175          description 
=  self
._ html
_ search
_ meta
(  
 176              'dcterms.abstract' ,  webpage
,  'description' ,  default
= None )  
 177          if  description 
is None :  
 178              description 
=  self
._ html
_ search
_ meta
(  
 179                  'description' ,  webpage
,  'meta description' )  
 181          # Thumbnail is sometimes not present.  
 182          # It is in the mobile version, but that seems to use a different URL  
 183          # structure altogether.  
 184          thumbnail 
=  self
._ og
_ search
_ thumbnail
( webpage
,  default
= None )  
 186          media_streams 
=  re
. findall ( r
'''(?x)  
 187              mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s*  
 188              "([^"]+)"''' ,  webpage
)  
 191              QUALITIES 
=  qualities ([ 'lo' ,  'hi' ,  'hq' ])  
 193              for  furl 
in  set ( media_streams
):  
 194                  if  furl
. endswith ( '.f4m' ):  
 197                      fid_m 
=  re
. match ( r
'.*\.([^.]+)\.[^.]+$' ,  furl
)  
 198                      fid 
=  fid_m
. group ( 1 )  if  fid_m 
else None  
 200                      'quality' :  QUALITIES ( fid
),  
 204              self
._ sort
_ formats
( formats
)  
 208          else :   # request JSON file  
 209              info 
=  self
._ extract
_ media
_ info
(  
 210                  'http://www.ardmediathek.de/play/media/ %s '  %  video_id
,  webpage
,  video_id
)  
 215              'description' :  description
,  
 216              'thumbnail' :  thumbnail
,  
 222  class  ARDIE ( InfoExtractor
):  
 223      _VALID_URL 
=  '(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html'  
 225          'url' :  'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html' ,  
 226          'md5' :  'd216c3a86493f9322545e045ddc3eb35' ,  
 228              'display_id' :  'die-story-im-ersten-mission-unter-falscher-flagge' ,  
 232              'title' :  'Die Story im Ersten: Mission unter falscher Flagge' ,  
 233              'upload_date' :  '20140804' ,  
 234              'thumbnail' :  're:^https?://.*\.jpg$' ,  
 238      def  _real_extract ( self
,  url
):  
 239          mobj 
=  re
. match ( self
._ VALID
_U RL
,  url
)  
 240          display_id 
=  mobj
. group ( 'display_id' )  
 242          player_url 
=  mobj
. group ( 'mainurl' ) +  '~playerXml.xml'  
 243          doc 
=  self
._ download
_ xml
( player_url
,  display_id
)  
 244          video_node 
=  doc
. find ( './video' )  
 245          upload_date 
=  unified_strdate ( xpath_text (  
 246              video_node
,  './broadcastDate' ))  
 247          thumbnail 
=  xpath_text ( video_node
,  './/teaserImage//variant/url' )  
 250          for  a 
in  video_node
. findall ( './/asset' ):  
 252                  'format_id' :  a
. attrib
[ 'type' ],  
 253                  'width' :  int_or_none ( a
. find ( './frameWidth' ). text
),  
 254                  'height' :  int_or_none ( a
. find ( './frameHeight' ). text
),  
 255                  'vbr' :  int_or_none ( a
. find ( './bitrateVideo' ). text
),  
 256                  'abr' :  int_or_none ( a
. find ( './bitrateAudio' ). text
),  
 257                  'vcodec' :  a
. find ( './codecVideo' ). text
,  
 258                  'tbr' :  int_or_none ( a
. find ( './totalBitrate' ). text
),  
 260              if  a
. find ( './serverPrefix' ). text
:  
 261                  f
[ 'url' ] =  a
. find ( './serverPrefix' ). text
 
 262                  f
[ 'playpath' ] =  a
. find ( './fileName' ). text
 
 264                  f
[ 'url' ] =  a
. find ( './fileName' ). text
 
 266          self
._ sort
_ formats
( formats
)  
 269              'id' :  mobj
. group ( 'id' ),  
 271              'display_id' :  display_id
,  
 272              'title' :  video_node
. find ( './title' ). text
,  
 273              'duration' :  parse_duration ( video_node
. find ( './duration' ). text
),  
 274              'upload_date' :  upload_date
,  
 275              'thumbnail' :  thumbnail
,  
 279  class  SportschauIE ( ARDMediathekIE
):  
 280      IE_NAME 
=  'Sportschau'  
 281      _VALID_URL 
=  r
'(?P<baseurl>https?://(?:www\.)?sportschau\.de/(?:[^/]+/)+video(?P<id>[^/#?]+))\.html'  
 283          'url' :  'http://www.sportschau.de/tourdefrance/videoseppeltkokainhatnichtsmitklassischemdopingzutun100.html' ,  
 285              'id' :  'seppeltkokainhatnichtsmitklassischemdopingzutun100' ,  
 287              'title' :  'Seppelt: "Kokain hat nichts mit klassischem Doping zu tun"' ,  
 288              'thumbnail' :  're:^https?://.*\.jpg$' ,  
 289              'description' :  'Der ARD-Doping Experte Hajo Seppelt gibt seine Einschätzung zum ersten Dopingfall der diesjährigen Tour de France um den Italiener Luca Paolini ab.' ,  
 293              'skip_download' :  True ,  
 297      def  _real_extract ( self
,  url
):  
 298          mobj 
=  re
. match ( self
._ VALID
_U RL
,  url
)  
 299          video_id 
=  mobj
. group ( 'id' )  
 300          base_url 
=  mobj
. group ( 'baseurl' )  
 302          webpage 
=  self
._ download
_ webpage
( url
,  video_id
)  
 303          title 
=  get_element_by_attribute ( 'class' ,  'headline' ,  webpage
)  
 304          description 
=  self
._ html
_ search
_ meta
( 'description' ,  webpage
,  'description' )  
 306          info 
=  self
._ extract
_ media
_ info
(  
 307              base_url 
+  '-mc_defaultQuality-h.json' ,  webpage
,  video_id
)  
 311              'description' :  description
,