]>
 
 
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/ard.py 
 
 
 
 
 
 
 
 
   2  from  __future__ 
import  unicode_literals
 
   6  from  . common 
import  InfoExtractor
 
   7  from  . generic 
import  GenericIE
 
  11      get_element_by_attribute
,  
  18  from  .. compat 
import  compat_etree_fromstring
 
  21  class  ARDMediathekIE ( InfoExtractor
):  
  22      IE_NAME 
=  'ARD:mediathek'  
  23      _VALID_URL 
=  r
'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'  
  26          'url' :  'http://www.ardmediathek.de/tv/Dokumentation-und-Reportage/Ich-liebe-das-Leben-trotzdem/rbb-Fernsehen/Video?documentId=29582122&bcastId=3822114' ,  
  30              'title' :  'Ich liebe das Leben trotzdem' ,  
  31              'description' :  'md5:45e4c225c72b27993314b31a84a5261c' ,  
  36              'skip_download' :  True ,  
  39          'url' :  'http://www.ardmediathek.de/tv/Tatort/Tatort-Scheinwelten-H%C3%B6rfassung-Video/Das-Erste/Video?documentId=29522730&bcastId=602916' ,  
  40          'md5' :  'f4d98b10759ac06c0072bbcd1f0b9e3e' ,  
  44              'title' :  'Tatort: Scheinwelten - Hörfassung (Video tgl. ab 20 Uhr)' ,  
  45              'description' :  'md5:196392e79876d0ac94c94e8cdb2875f1' ,  
  50          'url' :  'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086' ,  
  51          'md5' :  '219d94d8980b4f538c7fcb0865eb7f2c' ,  
  55              'title' :  'Tod eines Fußballers' ,  
  56              'description' :  'md5:f6e39f3461f0e1f54bfa48c8875c86ef' ,  
  60          'url' :  'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht' ,  
  61          'only_matching' :  True ,  
  64      def  _extract_media_info ( self
,  media_info_url
,  webpage
,  video_id
):  
  65          media_info 
=  self
._ download
_ json
(  
  66              media_info_url
,  video_id
,  'Downloading media JSON' )  
  68          formats 
=  self
._ extract
_ formats
( media_info
,  video_id
)  
  71              if  '"fsk"'  in  webpage
:  
  73                      'This video is only available after 20:00' ,  expected
= True )  
  74              elif  media_info
. get ( '_geoblocked' ):  
  75                  raise  ExtractorError ( 'This video is not available due to geo restriction' ,  expected
= True )  
  77          self
._ sort
_ formats
( formats
)  
  79          duration 
=  int_or_none ( media_info
. get ( '_duration' ))  
  80          thumbnail 
=  media_info
. get ( '_previewImage' )  
  83          subtitle_url 
=  media_info
. get ( '_subtitleUrl' )  
  93              'thumbnail' :  thumbnail
,  
  95              'subtitles' :  subtitles
,  
  98      def  _extract_formats ( self
,  media_info
,  video_id
):  
  99          type_ 
=  media_info
. get ( '_type' )  
 100          media_array 
=  media_info
. get ( '_mediaArray' , [])  
 102          for  num
,  media 
in  enumerate ( media_array
):  
 103              for  stream 
in  media
. get ( '_mediaStreamArray' , []):  
 104                  stream_urls 
=  stream
. get ( '_stream' )  
 107                  if not  isinstance ( stream_urls
,  list ):  
 108                      stream_urls 
= [ stream_urls
]  
 109                  quality 
=  stream
. get ( '_quality' )  
 110                  server 
=  stream
. get ( '_server' )  
 111                  for  stream_url 
in  stream_urls
:  
 112                      ext 
=  determine_ext ( stream_url
)  
 114                          formats
. extend ( self
._ extract
_ f
4 m
_ formats
(  
 115                              stream_url 
+  '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124' ,  
 116                              video_id
,  preference
=- 1 ,  f4m_id
= 'hds' ))  
 118                          formats
. extend ( self
._ extract
_ m
3u8_ formats
(  
 119                              stream_url
,  video_id
,  'mp4' ,  preference
= 1 ,  m3u8_id
= 'hls' ))  
 121                          if  server 
and  server
. startswith ( 'rtmp' ):  
 124                                  'play_path' :  stream_url
,  
 125                                  'format_id' :  'a %s-r tmp- %s '  % ( num
,  quality
),  
 127                          elif  stream_url
. startswith ( 'http' ):  
 130                                  'format_id' :  'a %s-%s-%s '  % ( num
,  ext
,  quality
)  
 134                          m 
=  re
. search ( r
'_(?P<width>\d+)x(?P<height>\d+)\.mp4$' ,  stream_url
)  
 137                                  'width' :  int ( m
. group ( 'width' )),  
 138                                  'height' :  int ( m
. group ( 'height' )),  
 145      def  _real_extract ( self
,  url
):  
 146          # determine video id from url  
 147          m 
=  re
. match ( self
._ VALID
_U RL
,  url
)  
 149          numid 
=  re
. search ( r
'documentId=([0-9]+)' ,  url
)  
 151              video_id 
=  numid
. group ( 1 )  
 153              video_id 
=  m
. group ( 'video_id' )  
 155          webpage 
=  self
._ download
_ webpage
( url
,  video_id
)  
 157          if  '>Der gewünschte Beitrag ist nicht mehr verfügbar.<'  in  webpage
:  
 158              raise  ExtractorError ( 'Video  %s  is no longer available'  %  video_id
,  expected
= True )  
 160          if  'Diese Sendung ist für Jugendliche unter 12 Jahren nicht geeignet. Der Clip ist deshalb nur von 20 bis 6 Uhr verfügbar.'  in  webpage
:  
 161              raise  ExtractorError ( 'This program is only suitable for those aged 12 and older. Video  %s  is therefore only available between 20 pm and 6 am.'  %  video_id
,  expected
= True )  
 163          if  re
. search ( r
'[\?&]rss($|[=&])' ,  url
):  
 164              doc 
=  compat_etree_fromstring ( webpage
. encode ( 'utf-8' ))  
 166                  return  GenericIE () ._ extract
_ rss
( url
,  video_id
,  doc
)  
 168          title 
=  self
._ html
_ search
_ regex
(  
 169              [ r
'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>' ,  
 170               r
'<meta name="dcterms.title" content="(.*?)"/>' ,  
 171               r
'<h4 class="headline">(.*?)</h4>' ],  
 173          description 
=  self
._ html
_ search
_ meta
(  
 174              'dcterms.abstract' ,  webpage
,  'description' ,  default
= None )  
 175          if  description 
is None :  
 176              description 
=  self
._ html
_ search
_ meta
(  
 177                  'description' ,  webpage
,  'meta description' )  
 179          # Thumbnail is sometimes not present.  
 180          # It is in the mobile version, but that seems to use a different URL  
 181          # structure altogether.  
 182          thumbnail 
=  self
._ og
_ search
_ thumbnail
( webpage
,  default
= None )  
 184          media_streams 
=  re
. findall ( r
'''(?x)  
 185              mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s*  
 186              "([^"]+)"''' ,  webpage
)  
 189              QUALITIES 
=  qualities ([ 'lo' ,  'hi' ,  'hq' ])  
 191              for  furl 
in  set ( media_streams
):  
 192                  if  furl
. endswith ( '.f4m' ):  
 195                      fid_m 
=  re
. match ( r
'.*\.([^.]+)\.[^.]+$' ,  furl
)  
 196                      fid 
=  fid_m
. group ( 1 )  if  fid_m 
else None  
 198                      'quality' :  QUALITIES ( fid
),  
 202              self
._ sort
_ formats
( formats
)  
 206          else :   # request JSON file  
 207              info 
=  self
._ extract
_ media
_ info
(  
 208                  'http://www.ardmediathek.de/play/media/ %s '  %  video_id
,  webpage
,  video_id
)  
 213              'description' :  description
,  
 214              'thumbnail' :  thumbnail
,  
 220  class  ARDIE ( InfoExtractor
):  
 221      _VALID_URL 
=  '(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html'  
 223          'url' :  'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html' ,  
 224          'md5' :  'd216c3a86493f9322545e045ddc3eb35' ,  
 226              'display_id' :  'die-story-im-ersten-mission-unter-falscher-flagge' ,  
 230              'title' :  'Die Story im Ersten: Mission unter falscher Flagge' ,  
 231              'upload_date' :  '20140804' ,  
 232              'thumbnail' :  're:^https?://.*\.jpg$' ,  
 236      def  _real_extract ( self
,  url
):  
 237          mobj 
=  re
. match ( self
._ VALID
_U RL
,  url
)  
 238          display_id 
=  mobj
. group ( 'display_id' )  
 240          player_url 
=  mobj
. group ( 'mainurl' ) +  '~playerXml.xml'  
 241          doc 
=  self
._ download
_ xml
( player_url
,  display_id
)  
 242          video_node 
=  doc
. find ( './video' )  
 243          upload_date 
=  unified_strdate ( xpath_text (  
 244              video_node
,  './broadcastDate' ))  
 245          thumbnail 
=  xpath_text ( video_node
,  './/teaserImage//variant/url' )  
 248          for  a 
in  video_node
. findall ( './/asset' ):  
 250                  'format_id' :  a
. attrib
[ 'type' ],  
 251                  'width' :  int_or_none ( a
. find ( './frameWidth' ). text
),  
 252                  'height' :  int_or_none ( a
. find ( './frameHeight' ). text
),  
 253                  'vbr' :  int_or_none ( a
. find ( './bitrateVideo' ). text
),  
 254                  'abr' :  int_or_none ( a
. find ( './bitrateAudio' ). text
),  
 255                  'vcodec' :  a
. find ( './codecVideo' ). text
,  
 256                  'tbr' :  int_or_none ( a
. find ( './totalBitrate' ). text
),  
 258              if  a
. find ( './serverPrefix' ). text
:  
 259                  f
[ 'url' ] =  a
. find ( './serverPrefix' ). text
 
 260                  f
[ 'playpath' ] =  a
. find ( './fileName' ). text
 
 262                  f
[ 'url' ] =  a
. find ( './fileName' ). text
 
 264          self
._ sort
_ formats
( formats
)  
 267              'id' :  mobj
. group ( 'id' ),  
 269              'display_id' :  display_id
,  
 270              'title' :  video_node
. find ( './title' ). text
,  
 271              'duration' :  parse_duration ( video_node
. find ( './duration' ). text
),  
 272              'upload_date' :  upload_date
,  
 273              'thumbnail' :  thumbnail
,  
 277  class  SportschauIE ( ARDMediathekIE
):  
 278      IE_NAME 
=  'Sportschau'  
 279      _VALID_URL 
=  r
'(?P<baseurl>https?://(?:www\.)?sportschau\.de/(?:[^/]+/)+video(?P<id>[^/#?]+))\.html'  
 281          'url' :  'http://www.sportschau.de/tourdefrance/videoseppeltkokainhatnichtsmitklassischemdopingzutun100.html' ,  
 283              'id' :  'seppeltkokainhatnichtsmitklassischemdopingzutun100' ,  
 285              'title' :  'Seppelt: "Kokain hat nichts mit klassischem Doping zu tun"' ,  
 286              'thumbnail' :  're:^https?://.*\.jpg$' ,  
 287              'description' :  'Der ARD-Doping Experte Hajo Seppelt gibt seine Einschätzung zum ersten Dopingfall der diesjährigen Tour de France um den Italiener Luca Paolini ab.' ,  
 291              'skip_download' :  True ,  
 295      def  _real_extract ( self
,  url
):  
 296          mobj 
=  re
. match ( self
._ VALID
_U RL
,  url
)  
 297          video_id 
=  mobj
. group ( 'id' )  
 298          base_url 
=  mobj
. group ( 'baseurl' )  
 300          webpage 
=  self
._ download
_ webpage
( url
,  video_id
)  
 301          title 
=  get_element_by_attribute ( 'class' ,  'headline' ,  webpage
)  
 302          description 
=  self
._ html
_ search
_ meta
( 'description' ,  webpage
,  'description' )  
 304          info 
=  self
._ extract
_ media
_ info
(  
 305              base_url 
+  '-mc_defaultQuality-h.json' ,  webpage
,  video_id
)  
 309              'description' :  description
,