]>
Raphaƫl G. Git Repositories - youtubedl/blob - youtube_dl/extractor/metacafe.py
1 from __future__
import unicode_literals
5 from . common
import InfoExtractor
16 class MetacafeIE ( InfoExtractor
):
17 _VALID_URL
= r
'http://(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
18 _DISCLAIMER
= 'http://www.metacafe.com/family_filter/'
19 _FILTER_POST
= 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
24 'add_ie' : [ 'Youtube' ],
25 'url' : 'http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/' ,
29 'upload_date' : '20090102' ,
30 'title' : 'The Electric Company | "Short I" | PBS KIDS GO!' ,
31 'description' : 'md5:2439a8ef6d5a70e380c22f5ad323e5a8' ,
36 # Normal metacafe video
38 'url' : 'http://www.metacafe.com/watch/11121940/news_stuff_you_wont_do_with_your_playstation_4/' ,
39 'md5' : '6e0bca200eaad2552e6915ed6fd4d9ad' ,
43 'title' : 'News: Stuff You Won \' t Do with Your PlayStation 4' ,
45 'description' : 'Sony released a massive FAQ on the PlayStation Blog detailing the PS4 \' s capabilities and limitations.' ,
50 'url' : 'http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/the_andromeda_strain_1971_stop_the_bomb_part_3/' ,
52 'id' : 'an-dVVXnuY7Jh77J' ,
54 'title' : 'The Andromeda Strain (1971): Stop the Bomb Part 3' ,
55 'uploader' : 'anyclip' ,
56 'description' : 'md5:38c711dd98f5bb87acf973d573442e67' ,
59 # age-restricted video
61 'url' : 'http://www.metacafe.com/watch/5186653/bbc_internal_christmas_tape_79_uncensored_outtakes_etc/' ,
62 'md5' : '98dde7c1a35d02178e8ab7560fe8bd09' ,
66 'title' : 'BBC INTERNAL Christmas Tape \' 79 - UNCENSORED Outtakes, Etc.' ,
67 'uploader' : 'Dwayne Pipe' ,
68 'description' : 'md5:950bf4c581e2c059911fa3ffbe377e4b' ,
74 'url' : 'http://www.metacafe.com/watch/cb-8VD4r_Zws8VP/open_this_is_face_the_nation_february_9/' ,
78 'title' : 'Open: This is Face the Nation, February 9' ,
79 'description' : 'md5:8a9ceec26d1f7ed6eab610834cc1a476' ,
84 'skip_download' : True ,
87 # Movieclips.com video
89 'url' : 'http://www.metacafe.com/watch/mv-Wy7ZU/my_week_with_marilyn_do_you_love_me/' ,
93 'title' : 'My Week with Marilyn - Do You Love Me?' ,
94 'description' : 'From the movie My Week with Marilyn - Colin (Eddie Redmayne) professes his love to Marilyn (Michelle Williams) and gets her to promise to return to set and finish the movie.' ,
95 'uploader' : 'movie_trailers' ,
99 'skip_download' : 'requires rtmpdump' ,
104 def report_disclaimer ( self
):
105 self
. to_screen ( 'Retrieving disclaimer' )
107 def _real_initialize ( self
):
108 # Retrieve disclaimer
109 self
. report_disclaimer ()
110 self
._ download
_ webpage
( self
._ DISCLAIMER
, None , False , 'Unable to retrieve disclaimer' )
115 'submit' : "Continue - I'm over 18" ,
117 request
= compat_urllib_request
. Request ( self
._ FILTER
_ POST
, compat_urllib_parse
. urlencode ( disclaimer_form
))
118 request
. add_header ( 'Content-Type' , 'application/x-www-form-urlencoded' )
119 self
. report_age_confirmation ()
120 self
._ download
_ webpage
( request
, None , False , 'Unable to confirm age' )
122 def _real_extract ( self
, url
):
123 # Extract id and simplified title from URL
124 mobj
= re
. match ( self
._ VALID
_U RL
, url
)
126 raise ExtractorError ( 'Invalid URL: %s ' % url
)
128 video_id
= mobj
. group ( 1 )
130 # the video may come from an external site
131 m_external
= re
. match ( '^(\w {2} )-(.*)$' , video_id
)
132 if m_external
is not None :
133 prefix
, ext_id
= m_external
. groups ()
134 # Check if video comes from YouTube
136 return self
. url_result ( 'http://www.youtube.com/watch?v= %s ' % ext_id
, 'Youtube' )
137 # CBS videos use theplatform.com
139 return self
. url_result ( 'theplatform: %s ' % ext_id
, 'ThePlatform' )
141 # Retrieve video webpage to extract further information
142 req
= compat_urllib_request
. Request ( 'http://www.metacafe.com/watch/ %s /' % video_id
)
144 # AnyClip videos require the flashversion cookie so that we get the link
146 mobj_an
= re
. match ( r
'^an-(.*?)$' , video_id
)
148 req
. headers
[ 'Cookie' ] = 'flashVersion=0;'
149 webpage
= self
._ download
_ webpage
( req
, video_id
)
151 # Extract URL, uploader and title from webpage
152 self
. report_extraction ( video_id
)
154 mobj
= re
. search ( r
'(?m)&mediaURL=([^&]+)' , webpage
)
156 mediaURL
= compat_urllib_parse
. unquote ( mobj
. group ( 1 ))
157 video_ext
= mediaURL
[- 3 :]
159 # Extract gdaKey if available
160 mobj
= re
. search ( r
'(?m)&gdaKey=(.*?)&' , webpage
)
164 gdaKey
= mobj
. group ( 1 )
165 video_url
= ' %s ?__gda__= %s ' % ( mediaURL
, gdaKey
)
166 if video_url
is None :
167 mobj
= re
. search ( r
'<video src="([^"]+)"' , webpage
)
169 video_url
= mobj
. group ( 1 )
171 if video_url
is None :
172 flashvars
= self
._ search
_ regex
(
173 r
' name="flashvars" value="(.*?)"' , webpage
, 'flashvars' ,
176 vardict
= compat_parse_qs ( flashvars
)
177 if 'mediaData' not in vardict
:
178 raise ExtractorError ( 'Unable to extract media URL' )
180 r
'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"' , vardict
[ 'mediaData' ][ 0 ])
182 raise ExtractorError ( 'Unable to extract media URL' )
183 mediaURL
= mobj
. group ( 'mediaURL' ). replace ( ' \\ /' , '/' )
184 video_url
= ' %s ?__gda__= %s ' % ( mediaURL
, mobj
. group ( 'key' ))
185 video_ext
= determine_ext ( video_url
)
186 if video_url
is None :
187 player_url
= self
._ search
_ regex
(
188 r
"swfobject\.embedSWF\('([^']+)'" ,
189 webpage
, 'config URL' , default
= None )
191 config_url
= self
._ search
_ regex
(
192 r
'config=(.+)$' , player_url
, 'config URL' )
193 config_doc
= self
._ download
_ xml
(
194 config_url
, video_id
,
195 note
= 'Downloading video config' )
196 smil_url
= config_doc
. find ( './/properties' ). attrib
[ 'smil_file' ]
197 smil_doc
= self
._ download
_ xml
(
199 note
= 'Downloading SMIL document' )
200 base_url
= smil_doc
. find ( './head/meta' ). attrib
[ 'base' ]
202 for vn
in smil_doc
. findall ( './/video' ):
203 br
= int ( vn
. attrib
[ 'system-bitrate' ])
204 play_path
= vn
. attrib
[ 'src' ]
206 'format_id' : 'smil- %d ' % br
,
208 'play_path' : play_path
,
210 'player_url' : player_url
,
211 'ext' : play_path
. partition ( ':' )[ 0 ],
214 if video_url
is None :
215 raise ExtractorError ( 'Unsupported video type' )
217 video_title
= self
._ html
_ search
_ regex
(
218 r
'(?im)<title>(.*) - Video</title>' , webpage
, 'title' )
219 description
= self
._ og
_ search
_ description
( webpage
)
220 thumbnail
= self
._ og
_ search
_ thumbnail
( webpage
)
221 video_uploader
= self
._ html
_ search
_ regex
(
222 r
'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);' ,
223 webpage
, 'uploader nickname' , fatal
= False )
224 duration
= int_or_none (
225 self
._ html
_ search
_ meta
( 'video:duration' , webpage
))
229 if re
. search ( r
'"contentRating":"restricted"' , webpage
)
232 if isinstance ( video_url
, list ):
240 self
._ sort
_ formats
( formats
)
243 'description' : description
,
244 'uploader' : video_uploader
,
245 'title' : video_title
,
246 'thumbnail' : thumbnail
,
247 'age_limit' : age_limit
,
249 'duration' : duration
,