]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/generic.py
839530982e7db4704ece2a589420ab4ecfc31c66
   3 from __future__ 
import unicode_literals
 
   8 from .common 
import InfoExtractor
 
   9 from .youtube 
import YoutubeIE
 
  13     compat_urllib_request
, 
  23 from .brightcove 
import BrightcoveIE
 
  24 from .ooyala 
import OoyalaIE
 
  27 class GenericIE(InfoExtractor
): 
  28     IE_DESC 
= 'Generic downloader that works on some sites' 
  33             'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html', 
  34             'file': '13601338388002.mp4', 
  35             'md5': '6e15c93721d7ec9e9ca3fdbf07982cfd', 
  37                 'uploader': 'www.hodiho.fr', 
  38                 'title': 'R\u00e9gis plante sa Jeep', 
  41         # embedded vimeo video 
  44             'url': 'http://skillsmatter.com/podcast/home/move-semanticsperfect-forwarding-and-rvalue-references', 
  45             'file': '22444065.mp4', 
  46             'md5': '2903896e23df39722c33f015af0666e2', 
  48                 'title': 'ACCU 2011: Move Semantics,Perfect Forwarding, and Rvalue references- Scott Meyers- 13/04/2011', 
  49                 'uploader_id': 'skillsmatter', 
  50                 'uploader': 'Skills Matter', 
  53         # bandcamp page with custom domain 
  55             'add_ie': ['Bandcamp'], 
  56             'url': 'http://bronyrock.com/track/the-pony-mash', 
  57             'file': '3235767654.mp3', 
  59                 'title': 'The Pony Mash', 
  60                 'uploader': 'M_Pallante', 
  62             'skip': 'There is a limit of 200 free downloads / month for the test song', 
  64         # embedded brightcove video 
  65         # it also tests brightcove videos that need to set the 'Referer' in the 
  68             'add_ie': ['Brightcove'], 
  69             'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/', 
  71                 'id': '2765128793001', 
  73                 'title': 'Le cours de bourse : l’analyse technique', 
  74                 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9', 
  75                 'uploader': 'BFM BUSINESS', 
  78                 'skip_download': True, 
  81         # Direct link to a video 
  83             'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4', 
  84             'file': 'trailer.mp4', 
  85             'md5': '67d406c2bcb6af27fa886f31aa934bbe', 
  89                 'upload_date': '20100513', 
  94             'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219', 
  95             'md5': '5644c6ca5d5782c1d0d350dad9bd840c', 
  97                 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ', 
  99                 'title': '2cc213299525360.mov', #that's what we get 
 104     def report_download_webpage(self
, video_id
): 
 105         """Report webpage download.""" 
 106         if not self
._downloader
.params
.get('test', False): 
 107             self
._downloader
.report_warning('Falling back on generic information extractor.') 
 108         super(GenericIE
, self
).report_download_webpage(video_id
) 
 110     def report_following_redirect(self
, new_url
): 
 111         """Report information extraction.""" 
 112         self
._downloader
.to_screen('[redirect] Following redirect to %s' % new_url
) 
 114     def _send_head(self
, url
): 
 115         """Check if it is a redirect, like url shorteners, in case return the new url.""" 
 117         class HEADRedirectHandler(compat_urllib_request
.HTTPRedirectHandler
): 
 119             Subclass the HTTPRedirectHandler to make it use our 
 120             HEADRequest also on the redirected URL 
 122             def redirect_request(self
, req
, fp
, code
, msg
, headers
, newurl
): 
 123                 if code 
in (301, 302, 303, 307): 
 124                     newurl 
= newurl
.replace(' ', '%20') 
 125                     newheaders 
= dict((k
,v
) for k
,v 
in req
.headers
.items() 
 126                                       if k
.lower() not in ("content-length", "content-type")) 
 127                     return HEADRequest(newurl
, 
 129                                        origin_req_host
=req
.get_origin_req_host(), 
 132                     raise compat_urllib_error
.HTTPError(req
.get_full_url(), code
, msg
, headers
, fp
) 
 134         class HTTPMethodFallback(compat_urllib_request
.BaseHandler
): 
 136             Fallback to GET if HEAD is not allowed (405 HTTP error) 
 138             def http_error_405(self
, req
, fp
, code
, msg
, headers
): 
 142                 newheaders 
= dict((k
,v
) for k
,v 
in req
.headers
.items() 
 143                                   if k
.lower() not in ("content-length", "content-type")) 
 144                 return self
.parent
.open(compat_urllib_request
.Request(req
.get_full_url(), 
 146                                                  origin_req_host
=req
.get_origin_req_host(), 
 150         opener 
= compat_urllib_request
.OpenerDirector() 
 151         for handler 
in [compat_urllib_request
.HTTPHandler
, compat_urllib_request
.HTTPDefaultErrorHandler
, 
 152                         HTTPMethodFallback
, HEADRedirectHandler
, 
 153                         compat_urllib_request
.HTTPErrorProcessor
, compat_urllib_request
.HTTPSHandler
]: 
 154             opener
.add_handler(handler()) 
 156         response 
= opener
.open(HEADRequest(url
)) 
 158             raise ExtractorError('Invalid URL protocol') 
 161     def _real_extract(self
, url
): 
 162         parsed_url 
= compat_urlparse
.urlparse(url
) 
 163         if not parsed_url
.scheme
: 
 164             self
._downloader
.report_warning('The url doesn\'t specify the protocol, trying with http') 
 165             return self
.url_result('http://' + url
) 
 166         video_id 
= os
.path
.splitext(url
.split('/')[-1])[0] 
 168         self
.to_screen('%s: Requesting header' % video_id
) 
 171             response 
= self
._send
_head
(url
) 
 174             new_url 
= response
.geturl() 
 176                 self
.report_following_redirect(new_url
) 
 177                 return self
.url_result(new_url
) 
 179             # Check for direct link to a video 
 180             content_type 
= response
.headers
.get('Content-Type', '') 
 181             m 
= re
.match(r
'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type
) 
 183                 upload_date 
= response
.headers
.get('Last-Modified') 
 185                     upload_date 
= unified_strdate(upload_date
) 
 188                     'title': os
.path
.splitext(url_basename(url
))[0], 
 190                         'format_id': m
.group('format_id'), 
 192                         'vcodec': 'none' if m
.group('type') == 'audio' else None 
 194                     'upload_date': upload_date
, 
 197         except compat_urllib_error
.HTTPError
: 
 198             # This may be a stupid server that doesn't like HEAD, our UA, or so 
 202             webpage 
= self
._download
_webpage
(url
, video_id
) 
 204             # since this is the last-resort InfoExtractor, if 
 205             # this error is thrown, it'll be thrown here 
 206             raise ExtractorError('Failed to download URL: %s' % url
) 
 208         self
.report_extraction(video_id
) 
 210         # it's tempting to parse this further, but you would 
 211         # have to take into account all the variations like 
 212         #   Video Title - Site Name 
 213         #   Site Name | Video Title 
 214         #   Video Title - Tagline | Site Name 
 215         # and so on and so forth; it's just not practical 
 216         video_title 
= self
._html
_search
_regex
( 
 217             r
'(?s)<title>(.*?)</title>', webpage
, 'video title', 
 220         # video uploader is domain name 
 221         video_uploader 
= self
._search
_regex
( 
 222             r
'^(?:https?://)?([^/]*)/.*', url
, 'video uploader') 
 224         # Look for BrightCove: 
 225         bc_url 
= BrightcoveIE
._extract
_brightcove
_url
(webpage
) 
 226         if bc_url 
is not None: 
 227             self
.to_screen('Brightcove video detected.') 
 228             surl 
= smuggle_url(bc_url
, {'Referer': url
}) 
 229             return self
.url_result(surl
, 'Brightcove') 
 231         # Look for embedded (iframe) Vimeo player 
 233             r
'<iframe[^>]+?src="((?:https?:)?//player.vimeo.com/video/.+?)"', webpage
) 
 235             player_url 
= unescapeHTML(mobj
.group(1)) 
 236             surl 
= smuggle_url(player_url
, {'Referer': url
}) 
 237             return self
.url_result(surl
, 'Vimeo') 
 239         # Look for embedded (swf embed) Vimeo player 
 241             r
'<embed[^>]+?src="(https?://(?:www\.)?vimeo.com/moogaloop.swf.+?)"', webpage
) 
 243             return self
.url_result(mobj
.group(1), 'Vimeo') 
 245         # Look for embedded YouTube player 
 246         matches 
= re
.findall(r
'''(?x) 
 247             (?:<iframe[^>]+?src=|embedSWF\(\s*) 
 248             (["\'])(?P
<url
>(?
:https?
:)?
//(?
:www\
.)?youtube\
.com
/ 
 252             urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Youtube') 
 253                      for tuppl in matches] 
 254             return self.playlist_result( 
 255                 urlrs, playlist_id=video_id, playlist_title=video_title) 
 257         # Look for embedded Dailymotion player 
 258         matches = re.findall( 
 259             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage) 
 261             urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Dailymotion') 
 262                      for tuppl in matches] 
 263             return self.playlist_result( 
 264                 urlrs, playlist_id=video_id, playlist_title=video_title) 
 266         # Look for embedded Wistia player 
 268             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage) 
 271                 '_type': 'url_transparent', 
 272                 'url': unescapeHTML(match.group('url')), 
 274                 'uploader': video_uploader, 
 275                 'title': video_title, 
 279         # Look for embedded blip.tv player 
 280         mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage) 
 282             return self.url_result('http://blip.tv/a/a-'+mobj.group(1), 'BlipTV') 
 283         mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9]+)', webpage) 
 285             return self.url_result(mobj.group(1), 'BlipTV') 
 287         # Look for Bandcamp pages with custom domain 
 288         mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage) 
 290             burl = unescapeHTML(mobj.group(1)) 
 291             # Don't set the extractor because it can be a track url or an album 
 292             return self.url_result(burl) 
 294         # Look for embedded Vevo player 
 296             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage) 
 298             return self.url_result(mobj.group('url')) 
 300         # Look for Ooyala videos 
 301         mobj = re.search(r'player.ooyala.com/[^"?]+\?[^"]*?(?:embedCode|ec)=([^"&]+)', webpage) 
 303             return OoyalaIE._build_url_result(mobj.group(1)) 
 305         # Look for Aparat videos 
 306         mobj = re.search(r'<iframe src="(http://www\.aparat\.com/video/[^"]+)"', webpage) 
 308             return self.url_result(mobj.group(1), 'Aparat') 
 310         # Look for MPORA videos 
 311         mobj = re.search(r'<iframe .*?src="(http://mpora\.com/videos/[^"]+)"', webpage) 
 313             return self.url_result(mobj.group(1), 'Mpora') 
 315         # Look for embedded Novamov player 
 317             r'<iframe[^>]+?src=(["\'])(?P<url>http://(?:(?:embed|www)\.)?novamov\.com/embed\.php.+?)\1', webpage) 
 319             return self.url_result(mobj.group('url'), 'Novamov') 
 321         # Start with something easy: JW Player in SWFObject 
 322         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) 
 324             # Look for gorilla-vid style embedding 
 325             mobj = re.search(r'(?s)jw_plugins.*?file:\s*["\'](.*?)["\']', webpage) 
 327             # Broaden the search a little bit 
 328             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage) 
 330             # Broaden the search a little bit: JWPlayer JS loader 
 331             mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage) 
 333             # Try to find twitter cards info 
 334             mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage) 
 336             # We look for Open Graph info: 
 337             # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am) 
 338             m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage) 
 339             # We only look in og:video if the MIME type is a video, don't try if it's a Flash player: 
 340             if m_video_type is not None: 
 341                 mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage) 
 344             mobj = re.search(r'<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage, flags=re.DOTALL) 
 346             raise ExtractorError('Unsupported URL: %s' % url) 
 348         # It's possible that one of the regexes 
 349         # matched, but returned an empty group: 
 350         if mobj.group(1) is None: 
 351             raise ExtractorError('Did not find a valid video URL at %s' % url) 
 353         video_url = mobj.group(1) 
 354         video_url = compat_urlparse.urljoin(url, video_url) 
 355         video_id = compat_urllib_parse.unquote(os.path.basename(video_url)) 
 357         # Sometimes, jwplayer extraction will result in a YouTube URL 
 358         if YoutubeIE.suitable(video_url): 
 359             return self.url_result(video_url, 'Youtube') 
 361         # here's a fun little line of code for you: 
 362         video_id = os.path.splitext(video_id)[0] 
 367             'uploader': video_uploader, 
 368             'title': video_title,