4 from .common 
import InfoExtractor
 
  13 class GenericIE(InfoExtractor
): 
  14     IE_DESC 
= u
'Generic downloader that works on some sites' 
  18         u
'url': u
'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html', 
  19         u
'file': u
'13601338388002.mp4', 
  20         u
'md5': u
'85b90ccc9d73b4acd9138d3af4c27f89', 
  22             u
"uploader": u
"www.hodiho.fr",  
  23             u
"title": u
"R\u00e9gis plante sa Jeep" 
  27     def report_download_webpage(self
, video_id
): 
  28         """Report webpage download.""" 
  29         if not self
._downloader
.params
.get('test', False): 
  30             self
._downloader
.report_warning(u
'Falling back on generic information extractor.') 
  31         super(GenericIE
, self
).report_download_webpage(video_id
) 
  33     def report_following_redirect(self
, new_url
): 
  34         """Report information extraction.""" 
  35         self
._downloader
.to_screen(u
'[redirect] Following redirect to %s' % new_url
) 
  37     def _test_redirect(self
, url
): 
  38         """Check if it is a redirect, like url shorteners, in case return the new url.""" 
  39         class HeadRequest(compat_urllib_request
.Request
): 
  43         class HEADRedirectHandler(compat_urllib_request
.HTTPRedirectHandler
): 
  45             Subclass the HTTPRedirectHandler to make it use our 
  46             HeadRequest also on the redirected URL 
  48             def redirect_request(self
, req
, fp
, code
, msg
, headers
, newurl
): 
  49                 if code 
in (301, 302, 303, 307): 
  50                     newurl 
= newurl
.replace(' ', '%20') 
  51                     newheaders 
= dict((k
,v
) for k
,v 
in req
.headers
.items() 
  52                                       if k
.lower() not in ("content-length", "content-type")) 
  53                     return HeadRequest(newurl
, 
  55                                        origin_req_host
=req
.get_origin_req_host(), 
  58                     raise compat_urllib_error
.HTTPError(req
.get_full_url(), code
, msg
, headers
, fp
) 
  60         class HTTPMethodFallback(compat_urllib_request
.BaseHandler
): 
  62             Fallback to GET if HEAD is not allowed (405 HTTP error) 
  64             def http_error_405(self
, req
, fp
, code
, msg
, headers
): 
  68                 newheaders 
= dict((k
,v
) for k
,v 
in req
.headers
.items() 
  69                                   if k
.lower() not in ("content-length", "content-type")) 
  70                 return self
.parent
.open(compat_urllib_request
.Request(req
.get_full_url(), 
  72                                                  origin_req_host
=req
.get_origin_req_host(), 
  76         opener 
= compat_urllib_request
.OpenerDirector() 
  77         for handler 
in [compat_urllib_request
.HTTPHandler
, compat_urllib_request
.HTTPDefaultErrorHandler
, 
  78                         HTTPMethodFallback
, HEADRedirectHandler
, 
  79                         compat_urllib_request
.HTTPErrorProcessor
, compat_urllib_request
.HTTPSHandler
]: 
  80             opener
.add_handler(handler()) 
  82         response 
= opener
.open(HeadRequest(url
)) 
  84             raise ExtractorError(u
'Invalid URL protocol') 
  85         new_url 
= response
.geturl() 
  90         self
.report_following_redirect(new_url
) 
  93     def _real_extract(self
, url
): 
  94         new_url 
= self
._test
_redirect
(url
) 
  95         if new_url
: return [self
.url_result(new_url
)] 
  97         video_id 
= url
.split('/')[-1] 
  99             webpage 
= self
._download
_webpage
(url
, video_id
) 
 101             # since this is the last-resort InfoExtractor, if 
 102             # this error is thrown, it'll be thrown here 
 103             raise ExtractorError(u
'Invalid URL: %s' % url
) 
 105         self
.report_extraction(video_id
) 
 106         # Start with something easy: JW Player in SWFObject 
 107         mobj 
= re
.search(r
'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) 
 109             # Broaden the search a little bit 
 110             mobj = re.search(r'[^A
-Za
-z0
-9]?
(?
:file|source
)=(http
[^
\'"&]*)', webpage) 
 112             # Broaden the search a little bit: JWPlayer JS loader 
 113             mobj = re.search(r'[^A-Za-z0-9]?file["\']?
:\s
*["\'](http[^\'"&]*)', webpage) 
 115             # Try to find twitter cards info 
 116             mobj = re.search(r'<meta (?
:property|name
)="twitter:player:stream" (?
:content|value
)="(.+?)"', webpage) 
 118             # We look for Open Graph info: 
 119             # We have to match any number of spaces between elements; some sites try to align them (e.g. statigr.am)
 120             m_video_type = re.search(r'<meta
.*?
property="og:video:type".*?content
="video/(.*?)"', webpage) 
 121             # We only look in og:video if the MIME type is a video; don't try if it's a Flash player:
 122             if m_video_type is not None: 
 123                 mobj = re.search(r'<meta
.*?
property="og:video".*?content
="(.*?)"', webpage) 
 125             raise ExtractorError(u'Invalid URL
: %s' % url) 
 127         # It's possible that one of the regexes
 
 128         # matched, but returned an empty group: 
 129         if mobj
.group(1) is None: 
 130             raise ExtractorError(u
'Invalid URL: %s' % url
) 
 132         video_url 
= compat_urllib_parse
.unquote(mobj
.group(1)) 
 133         video_id 
= os
.path
.basename(video_url
) 
 135         # here's a fun little line of code for you: 
 136         video_extension 
= os
.path
.splitext(video_id
)[1][1:] 
 137         video_id 
= os
.path
.splitext(video_id
)[0] 
 139         # it's tempting to parse this further, but you would 
 140         # have to take into account all the variations like 
 141         #   Video Title - Site Name 
 142         #   Site Name | Video Title 
 143         #   Video Title - Tagline | Site Name 
 144         # and so on and so forth; it's just not practical 
 145         video_title 
= self
._html
_search
_regex
(r
'<title>(.*)</title>', 
 146             webpage
, u
'video title', default
=u
'video', flags
=re
.DOTALL
) 
 148         # video uploader is domain name 
 149         video_uploader 
= self
._search
_regex
(r
'(?:https?://)?([^/]*)/.*', 
 150             url
, u
'video uploader') 
 155             'uploader': video_uploader
, 
 157             'title':    video_title
, 
 158             'ext':      video_extension
,