]>
 
 
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/ard.py 
 
 
 
 
 
 
 
 
6a35ea463edcafe3b9d7db4c53b9bf0c53198fd0
   2  from  __future__ 
import  unicode_literals
 
   6  from  . common 
import  InfoExtractor
 
   7  from  . generic 
import  GenericIE
 
  20  class  ARDMediathekIE ( InfoExtractor
):  
  21      IE_NAME 
=  'ARD:mediathek'  
  22      _VALID_URL 
=  r
'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'  
  25          'url' :  'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht' ,  
  26          'only_matching' :  True ,  
  28          'url' :  'http://www.ardmediathek.de/tv/Tatort/Das-Wunder-von-Wolbeck-Video-tgl-ab-20/Das-Erste/Video?documentId=22490580&bcastId=602916' ,  
  32              'title' :  'Das Wunder von Wolbeck (Video tgl. ab 20 Uhr)' ,  
  33              'description' :  'Auf einem restaurierten Hof bei Wolbeck wird der Heilpraktiker Raffael Lembeck eines morgens von seiner Frau Stella tot aufgefunden. Das Opfer war offensichtlich in seiner Praxis zu Fall gekommen und ist dann verblutet, erklärt Prof. Boerne am Tatort.' ,  
  35          'skip' :  'Blocked outside of Germany' ,  
  38      def  _real_extract ( self
,  url
):  
  39          # determine video id from url  
  40          m 
=  re
. match ( self
._ VALID
_U RL
,  url
)  
  42          numid 
=  re
. search ( r
'documentId=([0-9]+)' ,  url
)  
  44              video_id 
=  numid
. group ( 1 )  
  46              video_id 
=  m
. group ( 'video_id' )  
  48          webpage 
=  self
._ download
_ webpage
( url
,  video_id
)  
  50          if  '>Der gewünschte Beitrag ist nicht mehr verfügbar.<'  in  webpage
:  
  51              raise  ExtractorError ( 'Video  %s  is no longer available'  %  video_id
,  expected
= True )  
  53          if  'Diese Sendung ist für Jugendliche unter 12 Jahren nicht geeignet. Der Clip ist deshalb nur von 20 bis 6 Uhr verfügbar.'  in  webpage
:  
  54              raise  ExtractorError ( 'This program is only suitable for those aged 12 and older. Video  %s  is therefore only available between 20 pm and 6 am.'  %  video_id
,  expected
= True )  
  56          if  re
. search ( r
'[\?&]rss($|[=&])' ,  url
):  
  57              doc 
=  parse_xml ( webpage
)  
  59                  return  GenericIE () ._ extract
_ rss
( url
,  video_id
,  doc
)  
  61          title 
=  self
._ html
_ search
_ regex
(  
  62              [ r
'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>' ,  
  63               r
'<meta name="dcterms.title" content="(.*?)"/>' ,  
  64               r
'<h4 class="headline">(.*?)</h4>' ],  
  66          description 
=  self
._ html
_ search
_ meta
(  
  67              'dcterms.abstract' ,  webpage
,  'description' ,  default
= None )  
  68          if  description 
is None :  
  69              description 
=  self
._ html
_ search
_ meta
(  
  70                  'description' ,  webpage
,  'meta description' )  
  72          # Thumbnail is sometimes not present.  
  73          # It is in the mobile version, but that seems to use a different URL  
  74          # structure altogether.  
  75          thumbnail 
=  self
._ og
_ search
_ thumbnail
( webpage
,  default
= None )  
  77          media_streams 
=  re
. findall ( r
'''(?x)  
  78              mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s*  
  79              "([^"]+)"''' ,  webpage
)  
  82              QUALITIES 
=  qualities ([ 'lo' ,  'hi' ,  'hq' ])  
  84              for  furl 
in  set ( media_streams
):  
  85                  if  furl
. endswith ( '.f4m' ):  
  88                      fid_m 
=  re
. match ( r
'.*\.([^.]+)\.[^.]+$' ,  furl
)  
  89                      fid 
=  fid_m
. group ( 1 )  if  fid_m 
else None  
  91                      'quality' :  QUALITIES ( fid
),  
  95          else :   # request JSON file  
  96              media_info 
=  self
._ download
_ json
(  
  97                  'http://www.ardmediathek.de/play/media/ %s '  %  video_id
,  video_id
)  
  98              # The second element of the _mediaArray contains the standard http urls  
  99              streams 
=  media_info
[ '_mediaArray' ][ 1 ][ '_mediaStreamArray' ]  
 101                  if  '"fsk"'  in  webpage
:  
 102                      raise  ExtractorError ( 'This video is only available after 20:00' )  
 106                  if  type ( s
[ '_stream' ]) ==  list :  
 107                      for  index
,  url 
in  enumerate ( s
[ '_stream' ][::- 1 ]):  
 108                          quality 
=  s
[ '_quality' ] +  index
 
 112                              'format_id' :  ' %s-%s '  % ( determine_ext ( url
),  quality
)  
 117                      'quality' :  s
[ '_quality' ],  
 121                  format
[ 'format_id' ] =  ' %s-%s '  % (  
 122                      determine_ext ( format
[ 'url' ]),  format
[ 'quality' ])  
 124                  formats
. append ( format
)  
 126          self
._ sort
_ formats
( formats
)  
 131              'description' :  description
,  
 133              'thumbnail' :  thumbnail
,  
 137  class  ARDIE ( InfoExtractor
):  
 138      _VALID_URL 
=  '(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html'  
 140          'url' :  'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html' ,  
 141          'md5' :  'd216c3a86493f9322545e045ddc3eb35' ,  
 143              'display_id' :  'die-story-im-ersten-mission-unter-falscher-flagge' ,  
 147              'title' :  'Die Story im Ersten: Mission unter falscher Flagge' ,  
 148              'upload_date' :  '20140804' ,  
 149              'thumbnail' :  're:^https?://.*\.jpg$' ,  
 153      def  _real_extract ( self
,  url
):  
 154          mobj 
=  re
. match ( self
._ VALID
_U RL
,  url
)  
 155          display_id 
=  mobj
. group ( 'display_id' )  
 157          player_url 
=  mobj
. group ( 'mainurl' ) +  '~playerXml.xml'  
 158          doc 
=  self
._ download
_ xml
( player_url
,  display_id
)  
 159          video_node 
=  doc
. find ( './video' )  
 160          upload_date 
=  unified_strdate ( xpath_text (  
 161              video_node
,  './broadcastDate' ))  
 162          thumbnail 
=  xpath_text ( video_node
,  './/teaserImage//variant/url' )  
 165          for  a 
in  video_node
. findall ( './/asset' ):  
 167                  'format_id' :  a
. attrib
[ 'type' ],  
 168                  'width' :  int_or_none ( a
. find ( './frameWidth' ). text
),  
 169                  'height' :  int_or_none ( a
. find ( './frameHeight' ). text
),  
 170                  'vbr' :  int_or_none ( a
. find ( './bitrateVideo' ). text
),  
 171                  'abr' :  int_or_none ( a
. find ( './bitrateAudio' ). text
),  
 172                  'vcodec' :  a
. find ( './codecVideo' ). text
,  
 173                  'tbr' :  int_or_none ( a
. find ( './totalBitrate' ). text
),  
 175              if  a
. find ( './serverPrefix' ). text
:  
 176                  f
[ 'url' ] =  a
. find ( './serverPrefix' ). text
 
 177                  f
[ 'playpath' ] =  a
. find ( './fileName' ). text
 
 179                  f
[ 'url' ] =  a
. find ( './fileName' ). text
 
 181          self
._ sort
_ formats
( formats
)  
 184              'id' :  mobj
. group ( 'id' ),  
 186              'display_id' :  display_id
,  
 187              'title' :  video_node
. find ( './title' ). text
,  
 188              'duration' :  parse_duration ( video_node
. find ( './duration' ). text
),  
 189              'upload_date' :  upload_date
,  
 190              'thumbnail' :  thumbnail
,