]>
 
 
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/ard.py 
 
 
 
 
 
 
 
 
   2  from  __future__ 
import  unicode_literals
 
   6  from  . common 
import  InfoExtractor
 
   7  from  . generic 
import  GenericIE
 
  20  class  ARDMediathekIE ( InfoExtractor
):  
  21      IE_NAME 
=  'ARD:mediathek'  
  22      _VALID_URL 
=  r
'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'  
  25          'url' :  'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht' ,  
  26          'only_matching' :  True ,  
  28          'url' :  'http://www.ardmediathek.de/tv/Tatort/Das-Wunder-von-Wolbeck-Video-tgl-ab-20/Das-Erste/Video?documentId=22490580&bcastId=602916' ,  
  32              'title' :  'Das Wunder von Wolbeck (Video tgl. ab 20 Uhr)' ,  
  33              'description' :  'Auf einem restaurierten Hof bei Wolbeck wird der Heilpraktiker Raffael Lembeck eines morgens von seiner Frau Stella tot aufgefunden. Das Opfer war offensichtlich in seiner Praxis zu Fall gekommen und ist dann verblutet, erklärt Prof. Boerne am Tatort.' ,  
  35          'skip' :  'Blocked outside of Germany' ,  
  38      def  _real_extract ( self
,  url
):  
  39          # determine video id from url  
  40          m 
=  re
. match ( self
._ VALID
_U RL
,  url
)  
  42          numid 
=  re
. search ( r
'documentId=([0-9]+)' ,  url
)  
  44              video_id 
=  numid
. group ( 1 )  
  46              video_id 
=  m
. group ( 'video_id' )  
  48          webpage 
=  self
._ download
_ webpage
( url
,  video_id
)  
  50          if  '>Der gewünschte Beitrag ist nicht mehr verfügbar.<'  in  webpage
:  
  51              raise  ExtractorError ( 'Video  %s  is no longer available'  %  video_id
,  expected
= True )  
  53          if  re
. search ( r
'[\?&]rss($|[=&])' ,  url
):  
  54              doc 
=  parse_xml ( webpage
)  
  56                  return  GenericIE () ._ extract
_ rss
( url
,  video_id
,  doc
)  
  58          title 
=  self
._ html
_ search
_ regex
(  
  59              [ r
'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>' ,  
  60               r
'<meta name="dcterms.title" content="(.*?)"/>' ,  
  61               r
'<h4 class="headline">(.*?)</h4>' ],  
  63          description 
=  self
._ html
_ search
_ meta
(  
  64              'dcterms.abstract' ,  webpage
,  'description' ,  default
= None )  
  65          if  description 
is None :  
  66              description 
=  self
._ html
_ search
_ meta
(  
  67                  'description' ,  webpage
,  'meta description' )  
  69          # Thumbnail is sometimes not present.  
  70          # It is in the mobile version, but that seems to use a different URL  
  71          # structure altogether.  
  72          thumbnail 
=  self
._ og
_ search
_ thumbnail
( webpage
,  default
= None )  
  74          media_streams 
=  re
. findall ( r
'''(?x)  
  75              mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s*  
  76              "([^"]+)"''' ,  webpage
)  
  79              QUALITIES 
=  qualities ([ 'lo' ,  'hi' ,  'hq' ])  
  81              for  furl 
in  set ( media_streams
):  
  82                  if  furl
. endswith ( '.f4m' ):  
  85                      fid_m 
=  re
. match ( r
'.*\.([^.]+)\.[^.]+$' ,  furl
)  
  86                      fid 
=  fid_m
. group ( 1 )  if  fid_m 
else None  
  88                      'quality' :  QUALITIES ( fid
),  
  92          else :   # request JSON file  
  93              media_info 
=  self
._ download
_ json
(  
  94                  'http://www.ardmediathek.de/play/media/ %s '  %  video_id
,  video_id
)  
  95              # The second element of the _mediaArray contains the standard http urls  
  96              streams 
=  media_info
[ '_mediaArray' ][ 1 ][ '_mediaStreamArray' ]  
  98                  if  '"fsk"'  in  webpage
:  
  99                      raise  ExtractorError ( 'This video is only available after 20:00' )  
 103                  if  type ( s
[ '_stream' ]) ==  list :  
 104                      for  index
,  url 
in  enumerate ( s
[ '_stream' ][::- 1 ]):  
 105                          quality 
=  s
[ '_quality' ] +  index
 
 109                              'format_id' :  ' %s-%s '  % ( determine_ext ( url
),  quality
)  
 114                      'quality' :  s
[ '_quality' ],  
 118                  format
[ 'format_id' ] =  ' %s-%s '  % (  
 119                      determine_ext ( format
[ 'url' ]),  format
[ 'quality' ])  
 121                  formats
. append ( format
)  
 123          self
._ sort
_ formats
( formats
)  
 128              'description' :  description
,  
 130              'thumbnail' :  thumbnail
,  
 134  class  ARDIE ( InfoExtractor
):  
 135      _VALID_URL 
=  '(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html'  
 137          'url' :  'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html' ,  
 138          'md5' :  'd216c3a86493f9322545e045ddc3eb35' ,  
 140              'display_id' :  'die-story-im-ersten-mission-unter-falscher-flagge' ,  
 144              'title' :  'Die Story im Ersten: Mission unter falscher Flagge' ,  
 145              'upload_date' :  '20140804' ,  
 146              'thumbnail' :  're:^https?://.*\.jpg$' ,  
 150      def  _real_extract ( self
,  url
):  
 151          mobj 
=  re
. match ( self
._ VALID
_U RL
,  url
)  
 152          display_id 
=  mobj
. group ( 'display_id' )  
 154          player_url 
=  mobj
. group ( 'mainurl' ) +  '~playerXml.xml'  
 155          doc 
=  self
._ download
_ xml
( player_url
,  display_id
)  
 156          video_node 
=  doc
. find ( './video' )  
 157          upload_date 
=  unified_strdate ( xpath_text (  
 158              video_node
,  './broadcastDate' ))  
 159          thumbnail 
=  xpath_text ( video_node
,  './/teaserImage//variant/url' )  
 162          for  a 
in  video_node
. findall ( './/asset' ):  
 164                  'format_id' :  a
. attrib
[ 'type' ],  
 165                  'width' :  int_or_none ( a
. find ( './frameWidth' ). text
),  
 166                  'height' :  int_or_none ( a
. find ( './frameHeight' ). text
),  
 167                  'vbr' :  int_or_none ( a
. find ( './bitrateVideo' ). text
),  
 168                  'abr' :  int_or_none ( a
. find ( './bitrateAudio' ). text
),  
 169                  'vcodec' :  a
. find ( './codecVideo' ). text
,  
 170                  'tbr' :  int_or_none ( a
. find ( './totalBitrate' ). text
),  
 172              if  a
. find ( './serverPrefix' ). text
:  
 173                  f
[ 'url' ] =  a
. find ( './serverPrefix' ). text
 
 174                  f
[ 'playpath' ] =  a
. find ( './fileName' ). text
 
 176                  f
[ 'url' ] =  a
. find ( './fileName' ). text
 
 178          self
._ sort
_ formats
( formats
)  
 181              'id' :  mobj
. group ( 'id' ),  
 183              'display_id' :  display_id
,  
 184              'title' :  video_node
. find ( './title' ). text
,  
 185              'duration' :  parse_duration ( video_node
. find ( './duration' ). text
),  
 186              'upload_date' :  upload_date
,  
 187              'thumbnail' :  thumbnail
,