]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/generic.py
c7552fddb587a60454bec6faa174c36bd4aa9a4a
   6 from .common 
import InfoExtractor
 
  10     compat_urllib_request
, 
  17 from .brightcove 
import BrightcoveIE
 
  20 class GenericIE(InfoExtractor
): 
  21     IE_DESC 
= u
'Generic downloader that works on some sites' 
  26             u
'url': u
'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html', 
  27             u
'file': u
'13601338388002.mp4', 
  28             u
'md5': u
'6e15c93721d7ec9e9ca3fdbf07982cfd', 
  30                 u
"uploader": u
"www.hodiho.fr", 
  31                 u
"title": u
"R\u00e9gis plante sa Jeep" 
  34         # embedded vimeo video 
  37             u
'url': u
'http://skillsmatter.com/podcast/home/move-semanticsperfect-forwarding-and-rvalue-references', 
  38             u
'file': u
'22444065.mp4', 
  39             u
'md5': u
'2903896e23df39722c33f015af0666e2', 
  41                 u
'title': u
'ACCU 2011: Move Semantics,Perfect Forwarding, and Rvalue references- Scott Meyers- 13/04/2011', 
  42                 u
"uploader_id": u
"skillsmatter", 
  43                 u
"uploader": u
"Skills Matter", 
  46         # bandcamp page with custom domain 
  48             u
'add_ie': ['Bandcamp'], 
  49             u
'url': u
'http://bronyrock.com/track/the-pony-mash', 
  50             u
'file': u
'3235767654.mp3', 
  52                 u
'title': u
'The Pony Mash', 
  53                 u
'uploader': u
'M_Pallante', 
  55             u
'skip': u
'There is a limit of 200 free downloads / month for the test song', 
  57         # embedded brightcove video 
  58         # it also tests brightcove videos that need to set the 'Referer' in the 
  61             u
'add_ie': ['Brightcove'], 
  62             u
'url': u
'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/', 
  64                 u
'id': u
'2765128793001', 
  66                 u
'title': u
'Le cours de bourse : l’analyse technique', 
  67                 u
'description': u
'md5:7e9ad046e968cb2d1114004aba466fd9', 
  68                 u
'uploader': u
'BFM BUSINESS', 
  71                 u
'skip_download': True, 
  76     def report_download_webpage(self
, video_id
): 
  77         """Report webpage download.""" 
  78         if not self
._downloader
.params
.get('test', False): 
  79             self
._downloader
.report_warning(u
'Falling back on generic information extractor.') 
  80         super(GenericIE
, self
).report_download_webpage(video_id
) 
  82     def report_following_redirect(self
, new_url
): 
  83         """Report information extraction.""" 
  84         self
._downloader
.to_screen(u
'[redirect] Following redirect to %s' % new_url
) 
  86     def _test_redirect(self
, url
): 
  87         """Check if it is a redirect, like url shorteners, in case return the new url.""" 
  88         class HeadRequest(compat_urllib_request
.Request
): 
  92         class HEADRedirectHandler(compat_urllib_request
.HTTPRedirectHandler
): 
  94             Subclass the HTTPRedirectHandler to make it use our 
  95             HeadRequest also on the redirected URL 
  97             def redirect_request(self
, req
, fp
, code
, msg
, headers
, newurl
): 
  98                 if code 
in (301, 302, 303, 307): 
  99                     newurl 
= newurl
.replace(' ', '%20') 
 100                     newheaders 
= dict((k
,v
) for k
,v 
in req
.headers
.items() 
 101                                       if k
.lower() not in ("content-length", "content-type")) 
 102                     return HeadRequest(newurl
, 
 104                                        origin_req_host
=req
.get_origin_req_host(), 
 107                     raise compat_urllib_error
.HTTPError(req
.get_full_url(), code
, msg
, headers
, fp
) 
 109         class HTTPMethodFallback(compat_urllib_request
.BaseHandler
): 
 111             Fallback to GET if HEAD is not allowed (405 HTTP error) 
 113             def http_error_405(self
, req
, fp
, code
, msg
, headers
): 
 117                 newheaders 
= dict((k
,v
) for k
,v 
in req
.headers
.items() 
 118                                   if k
.lower() not in ("content-length", "content-type")) 
 119                 return self
.parent
.open(compat_urllib_request
.Request(req
.get_full_url(), 
 121                                                  origin_req_host
=req
.get_origin_req_host(), 
 125         opener 
= compat_urllib_request
.OpenerDirector() 
 126         for handler 
in [compat_urllib_request
.HTTPHandler
, compat_urllib_request
.HTTPDefaultErrorHandler
, 
 127                         HTTPMethodFallback
, HEADRedirectHandler
, 
 128                         compat_urllib_request
.HTTPErrorProcessor
, compat_urllib_request
.HTTPSHandler
]: 
 129             opener
.add_handler(handler()) 
 131         response 
= opener
.open(HeadRequest(url
)) 
 133             raise ExtractorError(u
'Invalid URL protocol') 
 134         new_url 
= response
.geturl() 
 139         self
.report_following_redirect(new_url
) 
 142     def _real_extract(self
, url
): 
 143         parsed_url 
= compat_urlparse
.urlparse(url
) 
 144         if not parsed_url
.scheme
: 
 145             self
._downloader
.report_warning('The url doesn\'t specify the protocol, trying with http') 
 146             return self
.url_result('http://' + url
) 
 149             new_url 
= self
._test
_redirect
(url
) 
 151                 return [self
.url_result(new_url
)] 
 152         except compat_urllib_error
.HTTPError
: 
 153             # This may be a stupid server that doesn't like HEAD, our UA, or so 
 156         video_id 
= url
.split('/')[-1] 
 158             webpage 
= self
._download
_webpage
(url
, video_id
) 
 160             # since this is the last-resort InfoExtractor, if 
 161             # this error is thrown, it'll be thrown here 
 162             raise ExtractorError(u
'Failed to download URL: %s' % url
) 
 164         self
.report_extraction(video_id
) 
 165         # Look for BrightCove: 
 166         bc_url 
= BrightcoveIE
._extract
_brightcove
_url
(webpage
) 
 167         if bc_url 
is not None: 
 168             self
.to_screen(u
'Brightcove video detected.') 
 169             return self
.url_result(bc_url
, 'Brightcove') 
 171         # Look for embedded Vimeo player 
 173             r
'<iframe[^>]+?src="(https?://player.vimeo.com/video/.+?)"', webpage
) 
 175             player_url 
= unescapeHTML(mobj
.group(1)) 
 176             surl 
= smuggle_url(player_url
, {'Referer': url
}) 
 177             return self
.url_result(surl
, 'Vimeo') 
 179         # Look for embedded YouTube player 
 181             r
'<iframe[^>]+?src=(["\'])(?P
<url
>https?
://(?
:www\
.)?youtube
.com
/embed
/.+?
)\
1', webpage) 
 183             surl = unescapeHTML(mobj.group(u'url
')) 
 184             return self.url_result(surl, 'Youtube
') 
 186         # Look for Bandcamp pages with custom domain 
 187         mobj = re.search(r'<meta 
property="og:url"[^
>]*?content
="(.*?bandcamp\.com.*?)"', webpage) 
 189             burl = unescapeHTML(mobj.group(1)) 
 190             return self.url_result(burl, 'Bandcamp
') 
 192         # Start with something easy: JW Player in SWFObject 
 193         mobj = re.search(r'flashvars
: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) 
 195             # Broaden the search a little bit 
 196             mobj = re.search(r'[^A
-Za
-z0
-9]?
(?
:file|source
)=(http
[^
\'"&]*)', webpage) 
 198             # Broaden the search a little bit: JWPlayer JS loader 
 199             mobj = re.search(r'[^A-Za-z0-9]?file["\']?
:\s
*["\'](http[^\'"&]*)', webpage) 
 201             # Try to find twitter cards info 
 202             mobj = re.search(r'<meta (?
:property|name
)="twitter:player:stream" (?
:content|value
)="(.+?)"', webpage) 
 204             # We look for Open Graph info: 
 205             # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am) 
 206             m_video_type = re.search(r'<meta
.*?
property="og:video:type".*?content
="video/(.*?)"', webpage) 
 207             # We only look in og:video if the MIME type is a video, don't 
try if it
's a Flash player: 
 208             if m_video_type is not None: 
 209                 mobj = re.search(r'<meta
.*?
property="og:video".*?content
="(.*?)"', webpage) 
 212             mobj = re.search(r'<video
[^
<]*(?
:>.*?
<source
.*?
)? src
="([^"]+)"', webpage, flags=re.DOTALL) 
 214             raise ExtractorError(u'Unsupported URL: %s' % url) 
 216         # It's possible that one of the regexes 
 217         # matched, but returned an empty group: 
 218         if mobj.group(1) is None: 
 219             raise ExtractorError(u'Did not find a valid video URL at %s' % url) 
 221         video_url = mobj.group(1) 
 222         video_url = compat_urlparse.urljoin(url, video_url) 
 223         video_id = compat_urllib_parse.unquote(os.path.basename(video_url)) 
 225         # here's a fun little line of code for you: 
 226         video_extension = os.path.splitext(video_id)[1][1:] 
 227         video_id = os.path.splitext(video_id)[0] 
 229         # it's tempting to parse this further, but you would 
 230         # have to take into account all the variations like 
 231         #   Video Title - Site Name 
 232         #   Site Name | Video Title 
 233         #   Video Title - Tagline | Site Name 
 234         # and so on and so forth; it's just not practical 
 235         video_title = self._html_search_regex(r'<title>(.*)</title>', 
 236             webpage, u'video title', default=u'video', flags=re.DOTALL) 
 238         # video uploader is domain name 
 239         video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*', 
 240             url, u'video uploader') 
 245             'uploader': video_uploader, 
 247             'title':    video_title, 
 248             'ext':      video_extension,