6 from .common 
import InfoExtractor
 
  10     compat_urllib_request
, 
  17 from .brightcove 
import BrightcoveIE
 
  20 class GenericIE(InfoExtractor
): 
  21     IE_DESC 
= u
'Generic downloader that works on some sites' 
  26             u
'url': u
'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html', 
  27             u
'file': u
'13601338388002.mp4', 
  28             u
'md5': u
'85b90ccc9d73b4acd9138d3af4c27f89', 
  30                 u
"uploader": u
"www.hodiho.fr", 
  31                 u
"title": u
"R\u00e9gis plante sa Jeep" 
  34         # embedded vimeo video 
  36             u
'url': u
'http://skillsmatter.com/podcast/home/move-semanticsperfect-forwarding-and-rvalue-references', 
  37             u
'file': u
'22444065.mp4', 
  38             u
'md5': u
'2903896e23df39722c33f015af0666e2', 
  40                 u
'title': u
'ACCU 2011: Move Semantics,Perfect Forwarding, and Rvalue references- Scott Meyers- 13/04/2011', 
  41                 u
"uploader_id": u
"skillsmatter", 
  42                 u
"uploader": u
"Skills Matter", 
  47     def report_download_webpage(self
, video_id
): 
  48         """Report webpage download.""" 
  49         if not self
._downloader
.params
.get('test', False): 
  50             self
._downloader
.report_warning(u
'Falling back on generic information extractor.') 
  51         super(GenericIE
, self
).report_download_webpage(video_id
) 
  53     def report_following_redirect(self
, new_url
): 
  54         """Report information extraction.""" 
  55         self
._downloader
.to_screen(u
'[redirect] Following redirect to %s' % new_url
) 
  57     def _test_redirect(self
, url
): 
  58         """Check if it is a redirect, like url shorteners, in case return the new url.""" 
  59         class HeadRequest(compat_urllib_request
.Request
): 
  63         class HEADRedirectHandler(compat_urllib_request
.HTTPRedirectHandler
): 
  65             Subclass the HTTPRedirectHandler to make it use our 
  66             HeadRequest also on the redirected URL 
  68             def redirect_request(self
, req
, fp
, code
, msg
, headers
, newurl
): 
  69                 if code 
in (301, 302, 303, 307): 
  70                     newurl 
= newurl
.replace(' ', '%20') 
  71                     newheaders 
= dict((k
,v
) for k
,v 
in req
.headers
.items() 
  72                                       if k
.lower() not in ("content-length", "content-type")) 
  73                     return HeadRequest(newurl
, 
  75                                        origin_req_host
=req
.get_origin_req_host(), 
  78                     raise compat_urllib_error
.HTTPError(req
.get_full_url(), code
, msg
, headers
, fp
) 
  80         class HTTPMethodFallback(compat_urllib_request
.BaseHandler
): 
  82             Fallback to GET if HEAD is not allowed (405 HTTP error) 
  84             def http_error_405(self
, req
, fp
, code
, msg
, headers
): 
  88                 newheaders 
= dict((k
,v
) for k
,v 
in req
.headers
.items() 
  89                                   if k
.lower() not in ("content-length", "content-type")) 
  90                 return self
.parent
.open(compat_urllib_request
.Request(req
.get_full_url(), 
  92                                                  origin_req_host
=req
.get_origin_req_host(), 
  96         opener 
= compat_urllib_request
.OpenerDirector() 
  97         for handler 
in [compat_urllib_request
.HTTPHandler
, compat_urllib_request
.HTTPDefaultErrorHandler
, 
  98                         HTTPMethodFallback
, HEADRedirectHandler
, 
  99                         compat_urllib_request
.HTTPErrorProcessor
, compat_urllib_request
.HTTPSHandler
]: 
 100             opener
.add_handler(handler()) 
 102         response 
= opener
.open(HeadRequest(url
)) 
 104             raise ExtractorError(u
'Invalid URL protocol') 
 105         new_url 
= response
.geturl() 
 110         self
.report_following_redirect(new_url
) 
 113     def _real_extract(self
, url
): 
 114         parsed_url 
= compat_urlparse
.urlparse(url
) 
 115         if not parsed_url
.scheme
: 
 116             self
._downloader
.report_warning('The url doesn\'t specify the protocol, trying with http') 
 117             return self
.url_result('http://' + url
) 
 120             new_url 
= self
._test
_redirect
(url
) 
 122                 return [self
.url_result(new_url
)] 
 123         except compat_urllib_error
.HTTPError
: 
 124             # This may be a stupid server that doesn't like HEAD, our UA, or so 
 127         video_id 
= url
.split('/')[-1] 
 129             webpage 
= self
._download
_webpage
(url
, video_id
) 
 131             # since this is the last-resort InfoExtractor, if 
 132             # this error is thrown, it'll be thrown here 
 133             raise ExtractorError(u
'Failed to download URL: %s' % url
) 
 135         self
.report_extraction(video_id
) 
 136         # Look for BrightCove: 
 137         m_brightcove 
= re
.search(r
'<object[^>]+?class=([\'"])[^>]*?BrightcoveExperience.*?\1.+?</object>', webpage, re.DOTALL) 
 138         if m_brightcove is not None: 
 139             self.to_screen(u'Brightcove video detected.') 
 140             bc_url = BrightcoveIE._build_brighcove_url(m_brightcove.group()) 
 141             return self.url_result(bc_url, 'Brightcove') 
 143         # Look for embedded Vimeo player 
 145             r'<iframe[^>]+?src="(https?
://player
.vimeo
.com
/video
/.+?
)"', webpage) 
 147             player_url = unescapeHTML(mobj.group(1)) 
 148             surl = smuggle_url(player_url, {'Referer': url}) 
 149             return self.url_result(surl, 'Vimeo') 
 151         # Look for embedded YouTube player 
 153             r'<iframe[^>]+?src="(https?
://(?
:www\
.)?youtube
.com
/embed
/.+?
)"', webpage) 
 155             surl = unescapeHTML(mobj.group(1)) 
 156             return self.url_result(surl, 'Youtube') 
 158         # Start with something easy: JW Player in SWFObject 
 159         mobj = re.search(r'flashvars: [\'"](?
:.*&)?
file=(http
[^
\'"&]*)', webpage) 
 161             # Broaden the search a little bit 
 162             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage) 
 164             # Broaden the search a little bit: JWPlayer JS loader 
 165             mobj = re.search(r'[^A
-Za
-z0
-9]?
file["\']?:\s*["\'](http
[^
\'"&]*)', webpage) 
 167             # Try to find twitter cards info 
 168             mobj = re.search(r'<meta (?:property|name)="twitter
:player
:stream
" (?:content|value)="(.+?
)"', webpage) 
 170             # We look for Open Graph info: 
 171             # We have to match any number of spaces between elements, some sites try to align them (eg.: statigr.am) 
 172             m_video_type = re.search(r'<meta.*?property="og
:video
:type".*?content="video
/(.*?
)"', webpage) 
 173             # We only look in og:video if the MIME type is a video, don't try if it's a Flash player: 
 174             if m_video_type is not None: 
 175                 mobj = re.search(r'<meta.*?property="og
:video
".*?content="(.*?
)"', webpage) 
 178             mobj = re.search(r'<video[^<]*(?:>.*?<source.*?)? src="([^
"]+)"', webpage, flags=re.DOTALL) 
 180             raise ExtractorError(u'Unsupported URL
: %s' % url) 
 182         # It's possible that one of the regexes
 
 183         # matched, but returned an empty group: 
 184         if mobj
.group(1) is None: 
 185             raise ExtractorError(u
'Did not find a valid video URL at %s' % url
) 
 187         video_url 
= mobj
.group(1) 
 188         video_url 
= compat_urlparse
.urljoin(url
, video_url
) 
 189         video_id 
= compat_urllib_parse
.unquote(os
.path
.basename(video_url
)) 
 191         # here's a fun little line of code for you: 
 192         video_extension 
= os
.path
.splitext(video_id
)[1][1:] 
 193         video_id 
= os
.path
.splitext(video_id
)[0] 
 195         # it's tempting to parse this further, but you would 
 196         # have to take into account all the variations like 
 197         #   Video Title - Site Name 
 198         #   Site Name | Video Title 
 199         #   Video Title - Tagline | Site Name 
 200         # and so on and so forth; it's just not practical 
 201         video_title 
= self
._html
_search
_regex
(r
'<title>(.*)</title>', 
 202             webpage
, u
'video title', default
=u
'video', flags
=re
.DOTALL
) 
 204         # video uploader is domain name 
 205         video_uploader 
= self
._search
_regex
(r
'(?:https?://)?([^/]*)/.*', 
 206             url
, u
'video uploader') 
 211             'uploader': video_uploader
, 
 213             'title':    video_title
, 
 214             'ext':      video_extension
,