]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/mtv.py
2 from __future__
import unicode_literals
6 from . common
import InfoExtractor
29 def _media_xml_tag ( tag
):
30 return '{http://search.yahoo.com/mrss/} %s ' % tag
33 class MTVServicesInfoExtractor ( InfoExtractor
):
34 _MOBILE_TEMPLATE
= None
38 def _id_from_uri ( uri
):
39 return uri
. split ( ':' )[- 1 ]
42 def _remove_template_parameter ( url
):
43 # Remove the templates, like &device={device}
44 return re
. sub ( r
'&[^=]*?={.*?}(?=(&|$))' , '' , url
)
46 def _get_feed_url ( self
, uri
):
49 def _get_thumbnail_url ( self
, uri
, itemdoc
):
50 search_path
= ' %s / %s ' % ( _media_xml_tag ( 'group' ), _media_xml_tag ( 'thumbnail' ))
51 thumb_node
= itemdoc
. find ( search_path
)
52 if thumb_node
is None :
54 return thumb_node
. get ( 'url' ) or thumb_node
. text
or None
def _extract_mobile_video_formats(self, mtvn_id):
    """Return a single-entry format list resolved from the mobile page.

    The mobile page for *mtvn_id* is built from ``self._MOBILE_TEMPLATE``
    and downloaded, the first ``http://metrics...`` link in it is followed
    with a HEAD request, and the resolved URL is rewritten to point at a
    fixed host/path prefix.

    Returns a list containing one dict with the keys ``url`` and ``ext``.
    """
    webpage_url = self._MOBILE_TEMPLATE % mtvn_id
    req = sanitized_Request(webpage_url)
    # Otherwise we get a webpage that would execute some javascript
    req.add_header('User-Agent', 'curl/7')
    webpage = self._download_webpage(
        req, mtvn_id, 'Downloading mobile page')
    metrics_url = unescapeHTML(self._search_regex(
        r'<a href="(http://metrics.+?)"', webpage, 'url'))
    # Only the final location after redirects is needed, so a HEAD request
    # is issued instead of fetching the body.
    req = HEADRequest(metrics_url)
    response = self._request_webpage(req, mtvn_id, 'Resolving url')
    url = response.geturl()
    # Transform the url to get the best quality:
    # everything up to and including the first "pxE=mp4" is replaced; the
    # count is passed by keyword because the positional form is deprecated
    # in recent Python versions.
    url = re.sub(
        r'.+pxE=mp4',
        'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=0+_pxK=18639+_pxE=mp4',
        url, count=1)
    return [{'url': url, 'ext': 'mp4'}]
71 def _extract_video_formats ( self
, mdoc
, mtvn_id
, video_id
):
72 if re
. match ( r
'.*/(error_country_block\.swf|geoblock\.mp4|copyright_error\.flv(?:\?geo\b.+?)?)$' , mdoc
. find ( './/src' ). text
) is not None :
73 if mtvn_id
is not None and self
._ MOBILE
_ TEMPLATE
is not None :
74 self
. to_screen ( 'The normal version is not available from your '
75 'country, trying with the mobile version' )
76 return self
._ extract
_ mobile
_ video
_ formats
( mtvn_id
)
77 raise ExtractorError ( 'This video is not available from your country.' ,
81 for rendition
in mdoc
. findall ( './/rendition' ):
82 if rendition
. get ( 'method' ) == 'hls' :
83 hls_url
= rendition
. find ( './src' ). text
84 formats
. extend ( self
._ extract
_ m
3u8_ formats
(
85 hls_url
, video_id
, ext
= 'mp4' , entry_protocol
= 'm3u8_native' ,
86 m3u8_id
= 'hls' , fatal
= False ))
90 _
, _
, ext
= rendition
. attrib
[ 'type' ]. partition ( '/' )
91 rtmp_video_url
= rendition
. find ( './src' ). text
92 if 'error_not_available.swf' in rtmp_video_url
:
94 ' %s said: video is not available' % self
. IE_NAME
,
96 if rtmp_video_url
. endswith ( 'siteunavail.png' ):
99 'ext' : 'flv' if rtmp_video_url
. startswith ( 'rtmp' ) else ext
,
100 'url' : rtmp_video_url
,
101 'format_id' : '-' . join ( filter ( None , [
102 'rtmp' if rtmp_video_url
. startswith ( 'rtmp' ) else None ,
103 rendition
. get ( 'bitrate' )])),
104 'width' : int ( rendition
. get ( 'width' )),
105 'height' : int ( rendition
. get ( 'height' )),
107 except ( KeyError , TypeError ):
108 raise ExtractorError ( 'Invalid rendition field.' )
110 self
._ sort
_ formats
( formats
)
113 def _extract_subtitles ( self
, mdoc
, mtvn_id
):
115 for transcript
in mdoc
. findall ( './/transcript' ):
116 if transcript
. get ( 'kind' ) != 'captions' :
118 lang
= transcript
. get ( 'srclang' )
119 for typographic
in transcript
. findall ( './typographic' ):
120 sub_src
= typographic
. get ( 'src' )
123 ext
= typographic
. get ( 'format' )
126 subtitles
. setdefault ( lang
, []). append ({
127 'url' : compat_str ( sub_src
),
132 def _get_video_info ( self
, itemdoc
, use_hls
= True ):
133 uri
= itemdoc
. find ( 'guid' ). text
134 video_id
= self
._ id
_ from
_u ri
( uri
)
135 self
. report_extraction ( video_id
)
136 content_el
= itemdoc
. find ( ' %s / %s ' % ( _media_xml_tag ( 'group' ), _media_xml_tag ( 'content' )))
137 mediagen_url
= self
._ remove
_ template
_ parameter
( content_el
. attrib
[ 'url' ])
138 mediagen_url
= mediagen_url
. replace ( 'device= {device} ' , '' )
139 if 'acceptMethods' not in mediagen_url
:
140 mediagen_url
+= '&' if '?' in mediagen_url
else '?'
141 mediagen_url
+= 'acceptMethods='
142 mediagen_url
+= 'hls' if use_hls
else 'fms'
144 mediagen_doc
= self
._ download
_ xml
(
145 mediagen_url
, video_id
, 'Downloading video urls' , fatal
= False )
147 if mediagen_doc
is False :
150 item
= mediagen_doc
. find ( './video/item' )
151 if item
is not None and item
. get ( 'type' ) == 'text' :
152 message
= ' %s returned error: ' % self
. IE_NAME
153 if item
. get ( 'code' ) is not None :
154 message
+= ' %s - ' % item
. get ( 'code' )
156 raise ExtractorError ( message
, expected
= True )
158 description
= strip_or_none ( xpath_text ( itemdoc
, 'description' ))
160 timestamp
= timeconvert ( xpath_text ( itemdoc
, 'pubDate' ))
164 title_el
= find_xpath_attr (
165 itemdoc
, './/{http://search.yahoo.com/mrss/}category' ,
166 'scheme' , 'urn:mtvn:video_title' )
168 title_el
= itemdoc
. find ( compat_xpath ( './/{http://search.yahoo.com/mrss/}title' ))
170 title_el
= itemdoc
. find ( compat_xpath ( './/title' ))
171 if title_el
. text
is None :
174 title
= title_el
. text
176 raise ExtractorError ( 'Could not find video title' )
177 title
= title
. strip ()
179 # This is a short id that's used in the webpage urls
181 mtvn_id_node
= find_xpath_attr ( itemdoc
, './/{http://search.yahoo.com/mrss/}category' ,
182 'scheme' , 'urn:mtvn:id' )
183 if mtvn_id_node
is not None :
184 mtvn_id
= mtvn_id_node
. text
186 formats
= self
._ extract
_ video
_ formats
( mediagen_doc
, mtvn_id
, video_id
)
188 # Some parts of complete video may be missing (e.g. missing Act 3 in
189 # http://www.southpark.de/alle-episoden/s14e01-sexual-healing)
193 self
._ sort
_ formats
( formats
)
198 'subtitles' : self
._ extract
_ subtitles
( mediagen_doc
, mtvn_id
),
200 'thumbnail' : self
._ get
_ thumbnail
_u rl
( uri
, itemdoc
),
201 'description' : description
,
202 'duration' : float_or_none ( content_el
. attrib
. get ( 'duration' )),
203 'timestamp' : timestamp
,
206 def _get_feed_query ( self
, uri
):
209 data
[ 'lang' ] = self
._L ANG
def _get_videos_info(self, uri, use_hls=True):
    """Build the feed URL for *uri* and return the info extracted from it.

    The video id comes from ``self._id_from_uri``, the base feed URL from
    ``self._get_feed_url`` and the query parameters from
    ``self._get_feed_query``. The combined URL is handed to
    ``self._get_videos_info_from_url`` together with *use_hls*, and that
    call's result is returned unchanged.
    """
    video_id = self._id_from_uri(uri)
    feed_url = self._get_feed_url(uri)
    info_url = update_url_query(feed_url, self._get_feed_query(uri))
    return self._get_videos_info_from_url(info_url, video_id, use_hls)
218 def _get_videos_info_from_url ( self
, url
, video_id
, use_hls
= True ):
219 idoc
= self
._ download
_ xml
(
221 'Downloading info' , transform_source
= fix_xml_ampersands
)
223 title
= xpath_text ( idoc
, './channel/title' )
224 description
= xpath_text ( idoc
, './channel/description' )
227 for item
in idoc
. findall ( './/item' ):
228 info
= self
._ get
_ video
_ info
( item
, use_hls
)
232 return self
. playlist_result (
233 entries
, playlist_title
= title
, playlist_description
= description
)
235 def _extract_triforce_mgid ( self
, webpage
, data_zone
= None , video_id
= None ):
236 triforce_feed
= self
._ parse
_ json
( self
._ search
_ regex
(
237 r
'triforceManifestFeed\s*=\s*({.+?})\s*;\s*\n' , webpage
,
238 'triforce feed' , default
= '{}' ), video_id
, fatal
= False )
240 data_zone
= self
._ search
_ regex
(
241 r
'data-zone=(["\' ])( ?P
< zone
>.+ ?_lc_promo
.* ?
) \
1 ', webpage,
242 ' data zone
', default=data_zone, group=' zone
')
245 triforce_feed, lambda x: x[' manifest
'][' zones
'][data_zone][' feed
'],
250 feed = self._download_json(feed_url, video_id, fatal=False)
254 return try_get(feed, lambda x: x[' result
'][' data
'][' id '], compat_str)
256 def _extract_mgid(self, webpage):
258 # the url can be http://media.mtvnservices.com/fb/ {mgid} .swf
259 # or http://media.mtvnservices.com/ {mgid}
260 og_url = self._og_search_video_url(webpage)
261 mgid = url_basename(og_url)
262 if mgid.endswith(' . swf
'):
264 except RegexNotFoundError:
267 if mgid is None or ' : ' not in mgid:
268 mgid = self._search_regex(
269 [r' data
- mgid
= "(.*?)" ', r' swfobject\
. embedSWF\
( ".*?(mgid:.*?)" '],
270 webpage, ' mgid
', default=None)
273 sm4_embed = self._html_search_meta(
274 ' sm4
: video
: embed
', webpage, ' sm4 embed
', default=' ')
275 mgid = self._search_regex(
276 r' embed
/( mgid
:.+ ?
)[ " \' &?/]', sm4_embed, 'mgid', default=None)
279 mgid = self._extract_triforce_mgid(webpage)
283 def _real_extract(self, url):
284 title = url_basename(url)
285 webpage = self._download_webpage(url, title)
286 mgid = self._extract_mgid(webpage)
287 videos_info = self._get_videos_info(mgid)
291 class MTVServicesEmbeddedIE(MTVServicesInfoExtractor):
292 IE_NAME = 'mtvservices:embedded'
293 _VALID_URL = r'https?://media\.mtvnservices\.com/embed/(?P<mgid>.+?)( \? |/|$)'
296 # From http://www.thewrap.com/peter-dinklage-sums-up-game-of-thrones-in-45-seconds-video/
297 'url': 'http://media.mtvnservices.com/embed/mgid:uma:video:mtv.com:1043906/cp~vid%3D1043906%26uri%3Dmgid%3Auma%3Avideo%3Amtv.com%3A1043906',
298 'md5': 'cb349b21a7897164cede95bd7bf3fbb9',
302 'title': 'Peter Dinklage Sums Up \' Game Of Thrones \' In 45 Seconds',
303 'description': '" Sexy sexy sexy
, stabby stabby stabby
, beautiful language
, " says Peter Dinklage as he tries summarizing " Game of Thrones
" in under a minute.',
304 'timestamp': 1400126400,
305 'upload_date': '20140515',
310 def _extract_url(webpage):
312 r'<iframe[^>]+?src=([" \' ])( ?P
< url
>( ?
: https?
:) ?
// media
. mtvnservices
. com
/ embed
/.+ ?
) \
1 ', webpage)
314 return mobj.group(' url
')
def _get_feed_url(self, uri):
    """Return the feed URL for *uri* as reported by the access config.

    Downloads the JSON config for *uri* and returns its
    ``feedWithQueryParams`` value after passing it through
    ``self._remove_template_parameter``.

    Raises KeyError if the config has no ``feedWithQueryParams`` entry.
    """
    video_id = self._id_from_uri(uri)
    config = self._download_json(
        'http://media.mtvnservices.com/pmt/e1/access/index.html?uri=%s&configtype=edge' % uri,
        video_id)
    return self._remove_template_parameter(config['feedWithQueryParams'])
def _real_extract(self, url):
    """Extract the ``mgid`` group from *url* and return its videos info.

    *url* is matched against ``self._VALID_URL``; the code does not guard
    against a failed match, so *url* must already be known to match.
    """
    mobj = re.match(self._VALID_URL, url)
    mgid = mobj.group('mgid')
    return self._get_videos_info(mgid)
328 class MTVIE(MTVServicesInfoExtractor):
330 _VALID_URL = r' https?
://( ?
: www\
.) ?mtv\
. com
/( ?
: video
- clips|
( ?
: full
-) ?episodes
)/( ?P
< id >[ ^
/ ?
#.]+)'
331 _FEED_URL
= 'http://www.mtv.com/feeds/mrss/'
334 'url' : 'http://www.mtv.com/video-clips/vl8qof/unlocking-the-truth-trailer' ,
335 'md5' : '1edbcdf1e7628e414a8c5dcebca3d32b' ,
337 'id' : '5e14040d-18a4-47c4-a582-43ff602de88e' ,
339 'title' : 'Unlocking The Truth|July 18, 2016|1|101|Trailer' ,
340 'description' : '"Unlocking the Truth" premieres August 17th at 11/10c.' ,
341 'timestamp' : 1468846800 ,
342 'upload_date' : '20160718' ,
345 'url' : 'http://www.mtv.com/full-episodes/94tujl/unlocking-the-truth-gates-of-hell-season-1-ep-101' ,
346 'only_matching' : True ,
348 'url' : 'http://www.mtv.com/episodes/g8xu7q/teen-mom-2-breaking-the-wall-season-7-ep-713' ,
349 'only_matching' : True ,
353 class MTVJapanIE ( MTVServicesInfoExtractor
):
355 _VALID_URL
= r
'https?://(?:www\.)?mtvjapan\.com/videos/(?P<id>[0-9a-z]+)'
358 'url' : 'http://www.mtvjapan.com/videos/prayht/fresh-info-cadillac-escalade' ,
360 'id' : 'bc01da03-6fe5-4284-8880-f291f4e368f5' ,
362 'title' : '【Fresh Info】Cadillac ESCALADE Sport Edition' ,
365 'skip_download' : True ,
368 _GEO_COUNTRIES
= [ 'JP' ]
369 _FEED_URL
= 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed'
371 def _get_feed_query ( self
, uri
):
373 'arcEp' : 'mtvjapan.com' ,
378 class MTVVideoIE ( MTVServicesInfoExtractor
):
379 IE_NAME
= 'mtv:video'
380 _VALID_URL
= r
'''(?x)^https?://
381 (?:(?:www\.)?mtv\.com/videos/.+?/(?P<videoid>[0-9]+)/[^/]+$|
382 m\.mtv\.com/videos/video\.rbml\?.*?id=(?P<mgid>[^&]+))'''
384 _FEED_URL
= 'http://www.mtv.com/player/embed/AS3/rss/'
388 'url' : 'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml' ,
389 'md5' : '850f3f143316b1e71fa56a4edfd6e0f8' ,
393 'title' : 'Taylor Swift - "Ours (VH1 Storytellers)"' ,
394 'description' : 'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.' ,
395 'timestamp' : 1352610000 ,
396 'upload_date' : '20121111' ,
401 def _get_thumbnail_url ( self
, uri
, itemdoc
):
402 return 'http://mtv.mtvnimages.com/uri/' + uri
404 def _real_extract ( self
, url
):
405 mobj
= re
. match ( self
._ VALID
_U RL
, url
)
406 video_id
= mobj
. group ( 'videoid' )
407 uri
= mobj
. groupdict (). get ( 'mgid' )
409 webpage
= self
._ download
_ webpage
( url
, video_id
)
411 # Some videos come from Vevo.com
413 r
'(?s)isVevoVideo = true;.*?vevoVideoId = "(.*?)";' , webpage
)
415 vevo_id
= m_vevo
. group ( 1 )
416 self
. to_screen ( 'Vevo video detected: %s ' % vevo_id
)
417 return self
. url_result ( 'vevo: %s ' % vevo_id
, ie
= 'Vevo' )
419 uri
= self
._ html
_ search
_ regex
( r
'/uri/(.*?)\?' , webpage
, 'uri' )
420 return self
._ get
_ videos
_ info
( uri
)
423 class MTVDEIE ( MTVServicesInfoExtractor
):
425 _VALID_URL
= r
'https?://(?:www\.)?mtv\.de/(?:musik/videoclips|folgen|news)/(?P<id>[0-9a-z]+)'
427 'url' : 'http://www.mtv.de/musik/videoclips/2gpnv7/Traum' ,
429 'id' : 'd5d472bc-f5b7-11e5-bffd-a4badb20dab5' ,
432 'description' : 'Traum' ,
436 'skip_download' : True ,
438 'skip' : 'Blocked at Travis CI' ,
440 # mediagen URL without query (e.g. http://videos.mtvnn.com/mediagen/e865da714c166d18d6f80893195fcb97)
441 'url' : 'http://www.mtv.de/folgen/6b1ylu/teen-mom-2-enthuellungen-S5-F1' ,
443 'id' : '1e5a878b-31c5-11e7-a442-0e40cf2fc285' ,
445 'title' : 'Teen Mom 2' ,
446 'description' : 'md5:dc65e357ef7e1085ed53e9e9d83146a7' ,
450 'skip_download' : True ,
452 'skip' : 'Blocked at Travis CI' ,
454 'url' : 'http://www.mtv.de/news/glolix/77491-mtv-movies-spotlight--pixels--teil-3' ,
456 'id' : 'local_playlist-4e760566473c4c8c5344' ,
458 'title' : 'Article_mtv-movies-spotlight-pixels-teil-3_short-clips_part1' ,
459 'description' : 'MTV Movies Supercut' ,
463 'skip_download' : True ,
465 'skip' : 'Das Video kann zur Zeit nicht abgespielt werden.' ,
467 _GEO_COUNTRIES
= [ 'DE' ]
468 _FEED_URL
= 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed'
470 def _get_feed_query ( self
, uri
):