]>
 
 
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/mtv.py 
 
 
 
 
 
 
 
 
   2  from  __future__ 
import  unicode_literals
 
   6  from  . common 
import  InfoExtractor
 
  29  def  _media_xml_tag ( tag
):  
  30      return  '{http://search.yahoo.com/mrss/} %s '  %  tag
 
  33  class  MTVServicesInfoExtractor ( InfoExtractor
):  
  34      _MOBILE_TEMPLATE 
=  None  
  38      def  _id_from_uri ( uri
):  
  39          return  uri
. split ( ':' )[- 1 ]  
  42      def  _remove_template_parameter ( url
):  
  43          # Remove the templates, like &device={device}  
  44          return  re
. sub ( r
'&[^=]*?={.*?}(?=(&|$))' ,  '' ,  url
)  
  46      def  _get_feed_url ( self
,  uri
):  
  49      def  _get_thumbnail_url ( self
,  uri
,  itemdoc
):  
  50          search_path 
=  ' %s / %s '  % ( _media_xml_tag ( 'group' ),  _media_xml_tag ( 'thumbnail' ))  
  51          thumb_node 
=  itemdoc
. find ( search_path
)  
  52          if  thumb_node 
is None :  
  54          return  thumb_node
. get ( 'url' )  or  thumb_node
. text 
or None  
  56      def  _extract_mobile_video_formats ( self
,  mtvn_id
):  
  57          webpage_url 
=  self
._ MOBILE
_ TEMPLATE 
%  mtvn_id
 
  58          req 
=  sanitized_Request ( webpage_url
)  
  59          # Otherwise we get a webpage that would execute some javascript  
  60          req
. add_header ( 'User-Agent' ,  'curl/7' )  
  61          webpage 
=  self
._ download
_ webpage
( req
,  mtvn_id
,  
  62                                           'Downloading mobile page' )  
  63          metrics_url 
=  unescapeHTML ( self
._ search
_ regex
( r
'<a href="(http://metrics.+?)"' ,  webpage
,  'url' ))  
  64          req 
=  HEADRequest ( metrics_url
)  
  65          response 
=  self
._ request
_ webpage
( req
,  mtvn_id
,  'Resolving url' )  
  66          url 
=  response
. geturl ()  
  67          # Transform the url to get the best quality:  
  68          url 
=  re
. sub ( r
'.+pxE=mp4' ,  'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=0+_pxK=18639+_pxE=mp4' ,  url
,  1 )  
  69          return  [{ 'url' :  url
,  'ext' :  'mp4' }]  
  71      def  _extract_video_formats ( self
,  mdoc
,  mtvn_id
,  video_id
):  
  72          if  re
. match ( r
'.*/(error_country_block\.swf|geoblock\.mp4|copyright_error\.flv(?:\?geo\b.+?)?)$' ,  mdoc
. find ( './/src' ). text
)  is not None :  
  73              if  mtvn_id 
is not None and  self
._ MOBILE
_ TEMPLATE 
is not None :  
  74                  self
. to_screen ( 'The normal version is not available from your '  
  75                                 'country, trying with the mobile version' )  
  76                  return  self
._ extract
_ mobile
_ video
_ formats
( mtvn_id
)  
  77              raise  ExtractorError ( 'This video is not available from your country.' ,  
  81          for  rendition 
in  mdoc
. findall ( './/rendition' ):  
  82              if  rendition
. get ( 'method' ) ==  'hls' :  
  83                  hls_url 
=  rendition
. find ( './src' ). text
 
  84                  formats
. extend ( self
._ extract
_ m
3u8_ formats
(  
  85                      hls_url
,  video_id
,  ext
= 'mp4' ,  entry_protocol
= 'm3u8_native' ,  
  86                      m3u8_id
= 'hls' ,  fatal
= False ))  
  90                      _
,  _
,  ext 
=  rendition
. attrib
[ 'type' ]. partition ( '/' )  
  91                      rtmp_video_url 
=  rendition
. find ( './src' ). text
 
  92                      if  'error_not_available.swf'  in  rtmp_video_url
:  
  94                              ' %s  said: video is not available'  %  self
. IE_NAME
,  
  96                      if  rtmp_video_url
. endswith ( 'siteunavail.png' ):  
  99                          'ext' :  'flv'  if  rtmp_video_url
. startswith ( 'rtmp' )  else  ext
,  
 100                          'url' :  rtmp_video_url
,  
 101                          'format_id' :  '-' . join ( filter ( None , [  
 102                              'rtmp'  if  rtmp_video_url
. startswith ( 'rtmp' )  else None ,  
 103                              rendition
. get ( 'bitrate' )])),  
 104                          'width' :  int ( rendition
. get ( 'width' )),  
 105                          'height' :  int ( rendition
. get ( 'height' )),  
 107                  except  ( KeyError ,  TypeError ):  
 108                      raise  ExtractorError ( 'Invalid rendition field.' )  
 110              self
._ sort
_ formats
( formats
)  
 113      def  _extract_subtitles ( self
,  mdoc
,  mtvn_id
):  
 115          for  transcript 
in  mdoc
. findall ( './/transcript' ):  
 116              if  transcript
. get ( 'kind' ) !=  'captions' :  
 118              lang 
=  transcript
. get ( 'srclang' )  
 119              for  typographic 
in  transcript
. findall ( './typographic' ):  
 120                  sub_src 
=  typographic
. get ( 'src' )  
 123                  ext 
=  typographic
. get ( 'format' )  
 126                  subtitles
. setdefault ( lang
, []). append ({  
 127                      'url' :  compat_str ( sub_src
),  
 132      def  _get_video_info ( self
,  itemdoc
,  use_hls
= True ):  
 133          uri 
=  itemdoc
. find ( 'guid' ). text
 
 134          video_id 
=  self
._ id
_ from
_u ri
( uri
)  
 135          self
. report_extraction ( video_id
)  
 136          content_el 
=  itemdoc
. find ( ' %s / %s '  % ( _media_xml_tag ( 'group' ),  _media_xml_tag ( 'content' )))  
 137          mediagen_url 
=  self
._ remove
_ template
_ parameter
( content_el
. attrib
[ 'url' ])  
 138          mediagen_url 
=  mediagen_url
. replace ( 'device= {device} ' ,  '' )  
 139          if  'acceptMethods'  not in  mediagen_url
:  
 140              mediagen_url 
+=  '&'  if  '?'  in  mediagen_url 
else  '?'  
 141              mediagen_url 
+=  'acceptMethods='  
 142              mediagen_url 
+=  'hls'  if  use_hls 
else  'fms'  
 144          mediagen_doc 
=  self
._ download
_ xml
(  
 145              mediagen_url
,  video_id
,  'Downloading video urls' ,  fatal
= False )  
 147          if  mediagen_doc 
is False :  
 150          item 
=  mediagen_doc
. find ( './video/item' )  
 151          if  item 
is not None and  item
. get ( 'type' ) ==  'text' :  
 152              message 
=  ' %s  returned error: '  %  self
. IE_NAME
 
 153              if  item
. get ( 'code' )  is not None :  
 154                  message 
+=  ' %s  - '  %  item
. get ( 'code' )  
 156              raise  ExtractorError ( message
,  expected
= True )  
 158          description 
=  strip_or_none ( xpath_text ( itemdoc
,  'description' ))  
 160          timestamp 
=  timeconvert ( xpath_text ( itemdoc
,  'pubDate' ))  
 164              title_el 
=  find_xpath_attr (  
 165                  itemdoc
,  './/{http://search.yahoo.com/mrss/}category' ,  
 166                  'scheme' ,  'urn:mtvn:video_title' )  
 168              title_el 
=  itemdoc
. find ( compat_xpath ( './/{http://search.yahoo.com/mrss/}title' ))  
 170              title_el 
=  itemdoc
. find ( compat_xpath ( './/title' ))  
 171              if  title_el
. text 
is None :  
 174          title 
=  title_el
. text
 
 176              raise  ExtractorError ( 'Could not find video title' )  
 177          title 
=  title
. strip ()  
 179          # This a short id that's used in the webpage urls  
 181          mtvn_id_node 
=  find_xpath_attr ( itemdoc
,  './/{http://search.yahoo.com/mrss/}category' ,  
 182                                         'scheme' ,  'urn:mtvn:id' )  
 183          if  mtvn_id_node 
is not None :  
 184              mtvn_id 
=  mtvn_id_node
. text
 
 186          formats 
=  self
._ extract
_ video
_ formats
( mediagen_doc
,  mtvn_id
,  video_id
)  
 188          # Some parts of complete video may be missing (e.g. missing Act 3 in  
 189          # http://www.southpark.de/alle-episoden/s14e01-sexual-healing)  
 193          self
._ sort
_ formats
( formats
)  
 198              'subtitles' :  self
._ extract
_ subtitles
( mediagen_doc
,  mtvn_id
),  
 200              'thumbnail' :  self
._ get
_ thumbnail
_u rl
( uri
,  itemdoc
),  
 201              'description' :  description
,  
 202              'duration' :  float_or_none ( content_el
. attrib
. get ( 'duration' )),  
 203              'timestamp' :  timestamp
,  
 206      def  _get_feed_query ( self
,  uri
):  
 209              data
[ 'lang' ] =  self
._L ANG
 
 212      def  _get_videos_info ( self
,  uri
,  use_hls
= True ):  
 213          video_id 
=  self
._ id
_ from
_u ri
( uri
)  
 214          feed_url 
=  self
._ get
_ feed
_u rl
( uri
)  
 215          info_url 
=  update_url_query ( feed_url
,  self
._ get
_ feed
_ query
( uri
))  
 216          return  self
._ get
_ videos
_ info
_ from
_u rl
( info_url
,  video_id
,  use_hls
)  
 218      def  _get_videos_info_from_url ( self
,  url
,  video_id
,  use_hls
= True ):  
 219          idoc 
=  self
._ download
_ xml
(  
 221              'Downloading info' ,  transform_source
= fix_xml_ampersands
)  
 223          title 
=  xpath_text ( idoc
,  './channel/title' )  
 224          description 
=  xpath_text ( idoc
,  './channel/description' )  
 227          for  item 
in  idoc
. findall ( './/item' ):  
 228              info 
=  self
._ get
_ video
_ info
( item
,  use_hls
)  
 232          return  self
. playlist_result (  
 233              entries
,  playlist_title
= title
,  playlist_description
= description
)  
 235      def  _extract_triforce_mgid ( self
,  webpage
,  data_zone
= None ,  video_id
= None ):  
 236          triforce_feed 
=  self
._ parse
_ json
( self
._ search
_ regex
(  
 237              r
'triforceManifestFeed\s*=\s*({.+?})\s*;\s*\n' ,  webpage
,  
 238              'triforce feed' ,  default
= '{}' ),  video_id
,  fatal
= False )  
 240          data_zone 
=  self
._ search
_ regex
(  
 241              r
'data-zone=(["\' ])( ?P
< zone
>.+ ?_lc_promo
.* ?
) \
1 ', webpage,  
 242              ' data zone
', default=data_zone, group=' zone
')  
 245              triforce_feed, lambda x: x[' manifest
'][' zones
'][data_zone][' feed
'],  
 250          feed = self._download_json(feed_url, video_id, fatal=False)  
 254          return try_get(feed, lambda x: x[' result
'][' data
'][' id '], compat_str)  
 256      def _extract_mgid(self, webpage):  
 258              # the url can be http://media.mtvnservices.com/fb/ {mgid} .swf  
 259              # or http://media.mtvnservices.com/ {mgid}  
 260              og_url = self._og_search_video_url(webpage)  
 261              mgid = url_basename(og_url)  
 262              if mgid.endswith(' . swf
'):  
 264          except RegexNotFoundError:  
 267          if mgid is None or ' : ' not in mgid:  
 268              mgid = self._search_regex(  
 269                  [r' data
- mgid
= "(.*?)" ', r' swfobject\
. embedSWF\
( ".*?(mgid:.*?)" '],  
 270                  webpage, ' mgid
', default=None)  
 273              sm4_embed = self._html_search_meta(  
 274                  ' sm4
: video
: embed
', webpage, ' sm4 embed
', default=' ')  
 275              mgid = self._search_regex(  
 276                  r' embed
/( mgid
:.+ ?
)[ " \' &?/]', sm4_embed, 'mgid', default=None)  
 279              mgid = self._extract_triforce_mgid(webpage)  
 283      def _real_extract(self, url):  
 284          title = url_basename(url)  
 285          webpage = self._download_webpage(url, title)  
 286          mgid = self._extract_mgid(webpage)  
 287          videos_info = self._get_videos_info(mgid)  
 291  class MTVServicesEmbeddedIE(MTVServicesInfoExtractor):  
 292      IE_NAME = 'mtvservices:embedded'  
 293      _VALID_URL = r'https?://media\.mtvnservices\.com/embed/(?P<mgid>.+?)( \? |/|$)'  
 296          # From http://www.thewrap.com/peter-dinklage-sums-up-game-of-thrones-in-45-seconds-video/  
 297          'url': 'http://media.mtvnservices.com/embed/mgid:uma:video:mtv.com:1043906/cp~vid%3D1043906%26uri%3Dmgid%3Auma%3Avideo%3Amtv.com%3A1043906',  
 298          'md5': 'cb349b21a7897164cede95bd7bf3fbb9',  
 302              'title': 'Peter Dinklage Sums Up  \' Game Of Thrones \'  In 45 Seconds',  
 303              'description': '" Sexy sexy sexy
,  stabby stabby stabby
,  beautiful language
, " says Peter Dinklage as he tries summarizing " Game of Thrones
" in under a minute.',  
 304              'timestamp': 1400126400,  
 305              'upload_date': '20140515',  
 310      def _extract_url(webpage):  
 312              r'<iframe[^>]+?src=([" \' ])( ?P
< url
>( ?
: https?
:) ?
// media
. mtvnservices
. com
/ embed
/.+ ?
) \
1 ', webpage)  
 314              return mobj.group(' url
')  
 316      def _get_feed_url(self, uri):  
 317          video_id = self._id_from_uri(uri)  
 318          config = self._download_json(  
 319              ' http
:// media
. mtvnservices
. com
/ pmt
/ e1
/ access
/ index
. html?uri
= %s& configtype
= edge
' % uri, video_id)  
 320          return self._remove_template_parameter(config[' feedWithQueryParams
'])  
 322      def _real_extract(self, url):  
 323          mobj = re.match(self._VALID_URL, url)  
 324          mgid = mobj.group(' mgid
')  
 325          return self._get_videos_info(mgid)  
 328  class MTVIE(MTVServicesInfoExtractor):  
 330      _VALID_URL = r' https?
://( ?
: www\
.) ?mtv\
. com
/( ?
: video
- clips|
( ?
: full
-) ?episodes
)/( ?P
< id >[ ^
/ ?
#.]+)'  
 331      _FEED_URL 
=  'http://www.mtv.com/feeds/mrss/'  
 334          'url' :  'http://www.mtv.com/video-clips/vl8qof/unlocking-the-truth-trailer' ,  
 335          'md5' :  '1edbcdf1e7628e414a8c5dcebca3d32b' ,  
 337              'id' :  '5e14040d-18a4-47c4-a582-43ff602de88e' ,  
 339              'title' :  'Unlocking The Truth|July 18, 2016|1|101|Trailer' ,  
 340              'description' :  '"Unlocking the Truth" premieres August 17th at 11/10c.' ,  
 341              'timestamp' :  1468846800 ,  
 342              'upload_date' :  '20160718' ,  
 345          'url' :  'http://www.mtv.com/full-episodes/94tujl/unlocking-the-truth-gates-of-hell-season-1-ep-101' ,  
 346          'only_matching' :  True ,  
 348          'url' :  'http://www.mtv.com/episodes/g8xu7q/teen-mom-2-breaking-the-wall-season-7-ep-713' ,  
 349          'only_matching' :  True ,  
 353  class  MTVJapanIE ( MTVServicesInfoExtractor
):  
 355      _VALID_URL 
=  r
'https?://(?:www\.)?mtvjapan\.com/videos/(?P<id>[0-9a-z]+)'  
 358          'url' :  'http://www.mtvjapan.com/videos/prayht/fresh-info-cadillac-escalade' ,  
 360              'id' :  'bc01da03-6fe5-4284-8880-f291f4e368f5' ,  
 362              'title' :  '【Fresh Info】Cadillac ESCALADE Sport Edition' ,  
 365              'skip_download' :  True ,  
 368      _GEO_COUNTRIES 
= [ 'JP' ]  
 369      _FEED_URL 
=  'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed'  
 371      def  _get_feed_query ( self
,  uri
):  
 373              'arcEp' :  'mtvjapan.com' ,  
 378  class  MTVVideoIE ( MTVServicesInfoExtractor
):  
 379      IE_NAME 
=  'mtv:video'  
 380      _VALID_URL 
=  r
'''(?x)^https?://  
 381          (?:(?:www\.)?mtv\.com/videos/.+?/(?P<videoid>[0-9]+)/[^/]+$|  
 382             m\.mtv\.com/videos/video\.rbml\?.*?id=(?P<mgid>[^&]+))'''  
 384      _FEED_URL 
=  'http://www.mtv.com/player/embed/AS3/rss/'  
 388              'url' :  'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml' ,  
 389              'md5' :  '850f3f143316b1e71fa56a4edfd6e0f8' ,  
 393                  'title' :  'Taylor Swift - "Ours (VH1 Storytellers)"' ,  
 394                  'description' :  'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.' ,  
 395                  'timestamp' :  1352610000 ,  
 396                  'upload_date' :  '20121111' ,  
 401      def  _get_thumbnail_url ( self
,  uri
,  itemdoc
):  
 402          return  'http://mtv.mtvnimages.com/uri/'  +  uri
 
 404      def  _real_extract ( self
,  url
):  
 405          mobj 
=  re
. match ( self
._ VALID
_U RL
,  url
)  
 406          video_id 
=  mobj
. group ( 'videoid' )  
 407          uri 
=  mobj
. groupdict (). get ( 'mgid' )  
 409              webpage 
=  self
._ download
_ webpage
( url
,  video_id
)  
 411              # Some videos come from Vevo.com  
 413                  r
'(?s)isVevoVideo = true;.*?vevoVideoId = "(.*?)";' ,  webpage
)  
 415                  vevo_id 
=  m_vevo
. group ( 1 )  
 416                  self
. to_screen ( 'Vevo video detected:  %s '  %  vevo_id
)  
 417                  return  self
. url_result ( 'vevo: %s '  %  vevo_id
,  ie
= 'Vevo' )  
 419              uri 
=  self
._ html
_ search
_ regex
( r
'/uri/(.*?)\?' ,  webpage
,  'uri' )  
 420          return  self
._ get
_ videos
_ info
( uri
)  
 423  class  MTVDEIE ( MTVServicesInfoExtractor
):  
 425      _VALID_URL 
=  r
'https?://(?:www\.)?mtv\.de/(?:musik/videoclips|folgen|news)/(?P<id>[0-9a-z]+)'  
 427          'url' :  'http://www.mtv.de/musik/videoclips/2gpnv7/Traum' ,  
 429              'id' :  'd5d472bc-f5b7-11e5-bffd-a4badb20dab5' ,  
 432              'description' :  'Traum' ,  
 436              'skip_download' :  True ,  
 438          'skip' :  'Blocked at Travis CI' ,  
 440          # mediagen URL without query (e.g. http://videos.mtvnn.com/mediagen/e865da714c166d18d6f80893195fcb97)  
 441          'url' :  'http://www.mtv.de/folgen/6b1ylu/teen-mom-2-enthuellungen-S5-F1' ,  
 443              'id' :  '1e5a878b-31c5-11e7-a442-0e40cf2fc285' ,  
 445              'title' :  'Teen Mom 2' ,  
 446              'description' :  'md5:dc65e357ef7e1085ed53e9e9d83146a7' ,  
 450              'skip_download' :  True ,  
 452          'skip' :  'Blocked at Travis CI' ,  
 454          'url' :  'http://www.mtv.de/news/glolix/77491-mtv-movies-spotlight--pixels--teil-3' ,  
 456              'id' :  'local_playlist-4e760566473c4c8c5344' ,  
 458              'title' :  'Article_mtv-movies-spotlight-pixels-teil-3_short-clips_part1' ,  
 459              'description' :  'MTV Movies Supercut' ,  
 463              'skip_download' :  True ,  
 465          'skip' :  'Das Video kann zur Zeit nicht abgespielt werden.' ,  
 467      _GEO_COUNTRIES 
= [ 'DE' ]  
 468      _FEED_URL 
=  'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed'  
 470      def  _get_feed_query ( self
,  uri
):