]>
Raphaƫl G. Git Repositories - youtubedl/blob - youtube_dl/extractor/metacafe.py
1 from __future__
import unicode_literals
5 from . common
import InfoExtractor
18 class MetacafeIE ( InfoExtractor
):
19 _VALID_URL
= r
'http://(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
20 _DISCLAIMER
= 'http://www.metacafe.com/family_filter/'
21 _FILTER_POST
= 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
26 'add_ie' : [ 'Youtube' ],
27 'url' : 'http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/' ,
31 'upload_date' : '20090102' ,
32 'title' : 'The Electric Company | "Short I" | PBS KIDS GO!' ,
33 'description' : 'md5:2439a8ef6d5a70e380c22f5ad323e5a8' ,
38 # Normal metacafe video
40 'url' : 'http://www.metacafe.com/watch/11121940/news_stuff_you_wont_do_with_your_playstation_4/' ,
41 'md5' : '6e0bca200eaad2552e6915ed6fd4d9ad' ,
45 'title' : 'News: Stuff You Won \' t Do with Your PlayStation 4' ,
47 'description' : 'Sony released a massive FAQ on the PlayStation Blog detailing the PS4 \' s capabilities and limitations.' ,
52 'url' : 'http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/the_andromeda_strain_1971_stop_the_bomb_part_3/' ,
54 'id' : 'an-dVVXnuY7Jh77J' ,
56 'title' : 'The Andromeda Strain (1971): Stop the Bomb Part 3' ,
57 'uploader' : 'anyclip' ,
58 'description' : 'md5:38c711dd98f5bb87acf973d573442e67' ,
61 # age-restricted video
63 'url' : 'http://www.metacafe.com/watch/5186653/bbc_internal_christmas_tape_79_uncensored_outtakes_etc/' ,
64 'md5' : '98dde7c1a35d02178e8ab7560fe8bd09' ,
68 'title' : 'BBC INTERNAL Christmas Tape \' 79 - UNCENSORED Outtakes, Etc.' ,
69 'uploader' : 'Dwayne Pipe' ,
70 'description' : 'md5:950bf4c581e2c059911fa3ffbe377e4b' ,
76 'url' : 'http://www.metacafe.com/watch/cb-8VD4r_Zws8VP/open_this_is_face_the_nation_february_9/' ,
80 'title' : 'Open: This is Face the Nation, February 9' ,
81 'description' : 'md5:8a9ceec26d1f7ed6eab610834cc1a476' ,
86 'skip_download' : True ,
89 # Movieclips.com video
91 'url' : 'http://www.metacafe.com/watch/mv-Wy7ZU/my_week_with_marilyn_do_you_love_me/' ,
95 'title' : 'My Week with Marilyn - Do You Love Me?' ,
96 'description' : 'From the movie My Week with Marilyn - Colin (Eddie Redmayne) professes his love to Marilyn (Michelle Williams) and gets her to promise to return to set and finish the movie.' ,
97 'uploader' : 'movie_trailers' ,
101 'skip_download' : 'requires rtmpdump' ,
106 def report_disclaimer ( self
):
107 self
. to_screen ( 'Retrieving disclaimer' )
109 def _real_initialize ( self
):
110 # Retrieve disclaimer
111 self
. report_disclaimer ()
112 self
._ download
_ webpage
( self
._ DISCLAIMER
, None , False , 'Unable to retrieve disclaimer' )
117 'submit' : "Continue - I'm over 18" ,
119 request
= compat_urllib_request
. Request ( self
._ FILTER
_ POST
, compat_urllib_parse
. urlencode ( disclaimer_form
))
120 request
. add_header ( 'Content-Type' , 'application/x-www-form-urlencoded' )
121 self
. report_age_confirmation ()
122 self
._ download
_ webpage
( request
, None , False , 'Unable to confirm age' )
124 def _real_extract ( self
, url
):
125 # Extract id and simplified title from URL
126 mobj
= re
. match ( self
._ VALID
_U RL
, url
)
128 raise ExtractorError ( 'Invalid URL: %s ' % url
)
130 video_id
= mobj
. group ( 1 )
132 # the video may come from an external site
133 m_external
= re
. match ( '^(\w {2} )-(.*)$' , video_id
)
134 if m_external
is not None :
135 prefix
, ext_id
= m_external
. groups ()
136 # Check if video comes from YouTube
138 return self
. url_result ( 'http://www.youtube.com/watch?v= %s ' % ext_id
, 'Youtube' )
139 # CBS videos use theplatform.com
141 return self
. url_result ( 'theplatform: %s ' % ext_id
, 'ThePlatform' )
143 # Retrieve video webpage to extract further information
144 req
= compat_urllib_request
. Request ( 'http://www.metacafe.com/watch/ %s /' % video_id
)
146 # AnyClip videos require the flashversion cookie so that we get the link
148 mobj_an
= re
. match ( r
'^an-(.*?)$' , video_id
)
150 req
. headers
[ 'Cookie' ] = 'flashVersion=0;'
151 webpage
= self
._ download
_ webpage
( req
, video_id
)
153 # Extract URL, uploader and title from webpage
154 self
. report_extraction ( video_id
)
156 mobj
= re
. search ( r
'(?m)&mediaURL=([^&]+)' , webpage
)
158 mediaURL
= compat_urllib_parse
. unquote ( mobj
. group ( 1 ))
159 video_ext
= mediaURL
[- 3 :]
161 # Extract gdaKey if available
162 mobj
= re
. search ( r
'(?m)&gdaKey=(.*?)&' , webpage
)
166 gdaKey
= mobj
. group ( 1 )
167 video_url
= ' %s ?__gda__= %s ' % ( mediaURL
, gdaKey
)
168 if video_url
is None :
169 mobj
= re
. search ( r
'<video src="([^"]+)"' , webpage
)
171 video_url
= mobj
. group ( 1 )
173 if video_url
is None :
174 flashvars
= self
._ search
_ regex
(
175 r
' name="flashvars" value="(.*?)"' , webpage
, 'flashvars' ,
178 vardict
= compat_parse_qs ( flashvars
)
179 if 'mediaData' not in vardict
:
180 raise ExtractorError ( 'Unable to extract media URL' )
182 r
'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"' , vardict
[ 'mediaData' ][ 0 ])
184 raise ExtractorError ( 'Unable to extract media URL' )
185 mediaURL
= mobj
. group ( 'mediaURL' ). replace ( ' \\ /' , '/' )
186 video_url
= ' %s ?__gda__= %s ' % ( mediaURL
, mobj
. group ( 'key' ))
187 video_ext
= determine_ext ( video_url
)
188 if video_url
is None :
189 player_url
= self
._ search
_ regex
(
190 r
"swfobject\.embedSWF\('([^']+)'" ,
191 webpage
, 'config URL' , default
= None )
193 config_url
= self
._ search
_ regex
(
194 r
'config=(.+)$' , player_url
, 'config URL' )
195 config_doc
= self
._ download
_ xml
(
196 config_url
, video_id
,
197 note
= 'Downloading video config' )
198 smil_url
= config_doc
. find ( './/properties' ). attrib
[ 'smil_file' ]
199 smil_doc
= self
._ download
_ xml
(
201 note
= 'Downloading SMIL document' )
202 base_url
= smil_doc
. find ( './head/meta' ). attrib
[ 'base' ]
204 for vn
in smil_doc
. findall ( './/video' ):
205 br
= int ( vn
. attrib
[ 'system-bitrate' ])
206 play_path
= vn
. attrib
[ 'src' ]
208 'format_id' : 'smil- %d ' % br
,
210 'play_path' : play_path
,
212 'player_url' : player_url
,
213 'ext' : play_path
. partition ( ':' )[ 0 ],
216 if video_url
is None :
217 raise ExtractorError ( 'Unsupported video type' )
219 video_title
= self
._ html
_ search
_ regex
(
220 r
'(?im)<title>(.*) - Video</title>' , webpage
, 'title' )
221 description
= self
._ og
_ search
_ description
( webpage
)
222 thumbnail
= self
._ og
_ search
_ thumbnail
( webpage
)
223 video_uploader
= self
._ html
_ search
_ regex
(
224 r
'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);' ,
225 webpage
, 'uploader nickname' , fatal
= False )
226 duration
= int_or_none (
227 self
._ html
_ search
_ meta
( 'video:duration' , webpage
))
231 if re
. search ( r
'"contentRating":"restricted"' , webpage
)
234 if isinstance ( video_url
, list ):
242 self
._ sort
_ formats
( formats
)
245 'description' : description
,
246 'uploader' : video_uploader
,
247 'title' : video_title
,
248 'thumbnail' : thumbnail
,
249 'age_limit' : age_limit
,
251 'duration' : duration
,