]>
 
 
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/generic.py 
 
 
 
 
 
 
 
 
   3  from  __future__ 
import  unicode_literals
 
   8  from  . common 
import  InfoExtractor
 
   9  from  . youtube 
import  YoutubeIE
 
  13      compat_urllib_request
,  
  23  from  . brightcove 
import  BrightcoveIE
 
  24  from  . ooyala 
import  OoyalaIE
 
  27  class  GenericIE ( InfoExtractor
):  
  28      IE_DESC 
=  'Generic downloader that works on some sites'  
  33              'url' :  'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html' ,  
  34              'file' :  '13601338388002.mp4' ,  
  35              'md5' :  '6e15c93721d7ec9e9ca3fdbf07982cfd' ,  
  37                  'uploader' :  'www.hodiho.fr' ,  
  38                  'title' :  'R\u00e9gis plante sa Jeep' ,  
  41          # bandcamp page with custom domain  
  43              'add_ie' : [ 'Bandcamp' ],  
  44              'url' :  'http://bronyrock.com/track/the-pony-mash' ,  
  45              'file' :  '3235767654.mp3' ,  
  47                  'title' :  'The Pony Mash' ,  
  48                  'uploader' :  'M_Pallante' ,  
  50              'skip' :  'There is a limit of 200 free downloads / month for the test song' ,  
  52          # embedded brightcove video  
  53          # it also tests brightcove videos that need to set the 'Referer' in the  
  56              'add_ie' : [ 'Brightcove' ],  
  57              'url' :  'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/' ,  
  59                  'id' :  '2765128793001' ,  
  61                  'title' :  'Le cours de bourse : l’analyse technique' ,  
  62                  'description' :  'md5:7e9ad046e968cb2d1114004aba466fd9' ,  
  63                  'uploader' :  'BFM BUSINESS' ,  
  66                  'skip_download' :  True ,  
  70              # https://github.com/rg3/youtube-dl/issues/2253  
  71              'url' :  'http://bcove.me/i6nfkrc3' ,  
  72              'file' :  '3101154703001.mp4' ,  
  73              'md5' :  '0ba9446db037002366bab3b3eb30c88c' ,  
  75                  'title' :  'Still no power' ,  
  76                  'uploader' :  'thestar.com' ,  
  77                  'description' :  'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.' ,  
  79              'add_ie' : [ 'Brightcove' ],  
  81          # Direct link to a video  
  83              'url' :  'http://media.w3.org/2010/05/sintel/trailer.mp4' ,  
  84              'file' :  'trailer.mp4' ,  
  85              'md5' :  '67d406c2bcb6af27fa886f31aa934bbe' ,  
  89                  'upload_date' :  '20100513' ,  
  94              'url' :  'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219' ,  
  95              'file' :  'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ.mp4' ,  
  96              'md5' :  '5644c6ca5d5782c1d0d350dad9bd840c' ,  
  98                  'id' :  'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ' ,  
 100                  'title' :  '2cc213299525360.mov' ,   # that's what we get  
 105      def  report_download_webpage ( self
,  video_id
):  
 106          """Report webpage download."""  
 107          if not  self
._ downloader
. params
. get ( 'test' ,  False ):  
 108              self
._ downloader
. report_warning ( 'Falling back on generic information extractor.' )  
 109          super ( GenericIE
,  self
). report_download_webpage ( video_id
)  
 111      def  report_following_redirect ( self
,  new_url
):  
 112          """Report information extraction."""  
 113          self
._ downloader
. to_screen ( '[redirect] Following redirect to  %s '  %  new_url
)  
 115      def  _send_head ( self
,  url
):  
 116          """Check if it is a redirect, like url shorteners, in case return the new url."""  
 118          class  HEADRedirectHandler ( compat_urllib_request
. HTTPRedirectHandler
):  
 120              Subclass the HTTPRedirectHandler to make it use our  
 121              HEADRequest also on the redirected URL  
 123              def  redirect_request ( self
,  req
,  fp
,  code
,  msg
,  headers
,  newurl
):  
 124                  if  code 
in  ( 301 ,  302 ,  303 ,  307 ):  
 125                      newurl 
=  newurl
. replace ( ' ' ,  '%20' )  
 126                      newheaders 
=  dict (( k
, v
)  for  k
, v 
in  req
. headers
. items ()  
 127                                        if  k
. lower ()  not in  ( "content-length" ,  "content-type" ))  
 128                      return  HEADRequest ( newurl
,  
 130                                         origin_req_host
= req
. get_origin_req_host (),  
 133                      raise  compat_urllib_error
. HTTPError ( req
. get_full_url (),  code
,  msg
,  headers
,  fp
)  
 135          class  HTTPMethodFallback ( compat_urllib_request
. BaseHandler
):  
 137              Fallback to GET if HEAD is not allowed (405 HTTP error)  
 139              def  http_error_405 ( self
,  req
,  fp
,  code
,  msg
,  headers
):  
 143                  newheaders 
=  dict (( k
, v
)  for  k
, v 
in  req
. headers
. items ()  
 144                                    if  k
. lower ()  not in  ( "content-length" ,  "content-type" ))  
 145                  return  self
. parent
. open ( compat_urllib_request
. Request ( req
. get_full_url (),  
 147                                                   origin_req_host
= req
. get_origin_req_host (),  
 151          opener 
=  compat_urllib_request
. OpenerDirector ()  
 152          for  handler 
in  [ compat_urllib_request
. HTTPHandler
,  compat_urllib_request
. HTTPDefaultErrorHandler
,  
 153                          HTTPMethodFallback
,  HEADRedirectHandler
,  
 154                          compat_urllib_request
. HTTPErrorProcessor
,  compat_urllib_request
. HTTPSHandler
]:  
 155              opener
. add_handler ( handler ())  
 157          response 
=  opener
. open ( HEADRequest ( url
))  
 159              raise  ExtractorError ( 'Invalid URL protocol' )  
 162      def  _real_extract ( self
,  url
):  
 163          parsed_url 
=  compat_urlparse
. urlparse ( url
)  
 164          if not  parsed_url
. scheme
:  
 165              default_search 
=  self
._ downloader
. params
. get ( 'default_search' )  
 166              if  default_search 
is None :  
 167                  default_search 
=  'auto'  
 169              if  default_search 
==  'auto' :  
 171                      self
._ downloader
. report_warning ( 'The url doesn \' t specify the protocol, trying with http' )  
 172                      return  self
. url_result ( 'http://'  +  url
)  
 174                      return  self
. url_result ( 'ytsearch:'  +  url
)  
 176                  assert  ':'  in  default_search
 
 177                  return  self
. url_result ( default_search 
+  url
)  
 178          video_id 
=  os
. path
. splitext ( url
. split ( '/' )[- 1 ])[ 0 ]  
 180          self
. to_screen ( ' %s : Requesting header'  %  video_id
)  
 183              response 
=  self
._ send
_ head
( url
)  
 186              new_url 
=  response
. geturl ()  
 188                  self
. report_following_redirect ( new_url
)  
 189                  return  self
. url_result ( new_url
)  
 191              # Check for direct link to a video  
 192              content_type 
=  response
. headers
. get ( 'Content-Type' ,  '' )  
 193              m 
=  re
. match ( r
'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$' ,  content_type
)  
 195                  upload_date 
=  response
. headers
. get ( 'Last-Modified' )  
 197                      upload_date 
=  unified_strdate ( upload_date
)  
 200                      'title' :  os
. path
. splitext ( url_basename ( url
))[ 0 ],  
 202                          'format_id' :  m
. group ( 'format_id' ),  
 204                          'vcodec' :  'none'  if  m
. group ( 'type' ) ==  'audio'  else None  
 206                      'upload_date' :  upload_date
,  
 209          except  compat_urllib_error
. HTTPError
:  
 210              # This may be a stupid server that doesn't like HEAD, our UA, or so  
 214              webpage 
=  self
._ download
_ webpage
( url
,  video_id
)  
 216              # since this is the last-resort InfoExtractor, if  
 217              # this error is thrown, it'll be thrown here  
 218              raise  ExtractorError ( 'Failed to download URL:  %s '  %  url
)  
 220          self
. report_extraction ( video_id
)  
 222          # it's tempting to parse this further, but you would  
 223          # have to take into account all the variations like  
 224          #   Video Title - Site Name  
 225          #   Site Name | Video Title  
 226          #   Video Title - Tagline | Site Name  
 227          # and so on and so forth; it's just not practical  
 228          video_title 
=  self
._ html
_ search
_ regex
(  
 229              r
'(?s)<title>(.*?)</title>' ,  webpage
,  'video title' ,  
 232          # video uploader is domain name  
 233          video_uploader 
=  self
._ search
_ regex
(  
 234              r
'^(?:https?://)?([^/]*)/.*' ,  url
,  'video uploader' )  
 236          # Look for BrightCove:  
 237          bc_urls 
=  BrightcoveIE
._ extract
_ brightcove
_u rls
( webpage
)  
 239              self
. to_screen ( 'Brightcove video detected.' )  
 242                  'url' :  smuggle_url ( bc_url
, { 'Referer' :  url
}),  
 243                  'ie_key' :  'Brightcove'  
 244              }  for  bc_url 
in  bc_urls
]  
 248                  'title' :  video_title
,  
 253          # Look for embedded (iframe) Vimeo player  
 255              r
'<iframe[^>]+?src="((?:https?:)?//player\.vimeo\.com/video/.+?)"' ,  webpage
)  
 257              player_url 
=  unescapeHTML ( mobj
. group ( 1 ))  
 258              surl 
=  smuggle_url ( player_url
, { 'Referer' :  url
})  
 259              return  self
. url_result ( surl
,  'Vimeo' )  
 261          # Look for embedded (swf embed) Vimeo player  
 263              r
'<embed[^>]+?src="(https?://(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"' ,  webpage
)  
 265              return  self
. url_result ( mobj
. group ( 1 ),  'Vimeo' )  
 267          # Look for embedded YouTube player  
 268          matches 
=  re
. findall ( r
'''(?x)  
 269              (?:<iframe[^>]+?src=|embedSWF\(\s*)  
 270              (["\' ])( ?P
< url
>( ?
: https?
:) ?
//( ?
: www\
.) ?youtube\
. com
/  
 274              urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Youtube')  
 275                       for tuppl in matches]  
 276              return self.playlist_result(  
 277                  urlrs, playlist_id=video_id, playlist_title=video_title)  
 279          # Look for embedded Dailymotion player  
 280          matches = re.findall(  
 281              r'<iframe[^>]+?src=([" \' ])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)  
 283              urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Dailymotion')  
 284                       for tuppl in matches]  
 285              return self.playlist_result(  
 286                  urlrs, playlist_id=video_id, playlist_title=video_title)  
 288          # Look for embedded Wistia player  
 290              r'<iframe[^>]+?src=([" \' ])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)  
 293                  '_type': 'url_transparent',  
 294                  'url': unescapeHTML(match.group('url')),  
 296                  'uploader': video_uploader,  
 297                  'title': video_title,  
 301          # Look for embedded blip.tv player  
 302          mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage)  
 304              return self.url_result('http://blip.tv/a/a-'+mobj.group(1), 'BlipTV')  
 305          mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9]+)', webpage)  
 307              return self.url_result(mobj.group(1), 'BlipTV')  
 309          # Look for Bandcamp pages with custom domain  
 310          mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)  
 312              burl = unescapeHTML(mobj.group(1))  
 313              # Don't set the extractor because it can be a track url or an album  
 314              return self.url_result(burl)  
 316          # Look for embedded Vevo player  
 318              r'<iframe[^>]+?src=([" \' ])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)  
 320              return self.url_result(mobj.group('url'))  
 322          # Look for Ooyala videos  
 323          mobj = re.search(r'player.ooyala.com/[^"?]+ \? [^"]*?(?:embedCode|ec)=([^"&]+)', webpage)  
 325              return OoyalaIE._build_url_result(mobj.group(1))  
 327          # Look for Aparat videos  
 328          mobj = re.search(r'<iframe src="(http://www\.aparat\.com/video/[^"]+)"', webpage)  
 330              return self.url_result(mobj.group(1), 'Aparat')  
 332          # Look for MPORA videos  
 333          mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)  
 335              return self.url_result(mobj.group(1), 'Mpora')  
 337          # Look for embedded Novamov player  
 339              r'<iframe[^>]+?src=([" \' ])(?P<url>http://(?:(?:embed|www)\.)?novamov\.com/embed\.php.+?)\1', webpage)  
 341              return self.url_result(mobj.group('url'), 'Novamov')  
 343          # Look for embedded Facebook player  
 345              r'<iframe[^>]+?src=([" \' ])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)  
 347              return self.url_result(mobj.group('url'), 'Facebook')  
 349          # Look for embedded Huffington Post player  
 351              r'<iframe[^>]+?src=([" \' ])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)  
 353              return self.url_result(mobj.group('url'), 'HuffPost')  
 355          # Start with something easy: JW Player in SWFObject  
 356          mobj = re.search(r'flashvars: [ \' "](?:.*&)?file=(http[^ \' "&]*)', webpage)  
 358              # Look for gorilla-vid style embedding  
 359              mobj = re.search(r'(?s)(?:jw_plugins|JWPlayerOptions).*?file\s*:\s*[" \' ](.*?)[" \' ]', webpage)  
 361              # Broaden the search a little bit  
 362              mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^ \' "&]*)', webpage)  
 364              # Broaden the search a little bit: JWPlayer JS loader  
 365              mobj = re.search(r'[^A-Za-z0-9]?file[" \' ]?:\s*[" \' ](http(?![^ \' "]+\.[0-9]+[ \' "])[^ \' "]+)[" \' ]', webpage)  
 367              # Try to find twitter cards info  
 368              mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)  
 370              # We look for Open Graph info:  
 371              # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)  
 372              m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)  
 373              # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:  
 374              if m_video_type is not None:  
 375                  mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)  
 378              mobj = re.search(r'<video[^<]*(?:>.*?<source.*?)? src="([^"]+)"', webpage, flags=re.DOTALL)  
 380              raise ExtractorError('Unsupported URL:  %s ' % url)  
 382          # It's possible that one of the regexes  
 383          # matched, but returned an empty group:  
 384          if mobj.group(1) is None:  
 385              raise ExtractorError('Did not find a valid video URL at  %s ' % url)  
 387          video_url = mobj.group(1)  
 388          video_url = compat_urlparse.urljoin(url, video_url)  
 389          video_id = compat_urllib_parse.unquote(os.path.basename(video_url))  
 391          # Sometimes, jwplayer extraction will result in a YouTube URL  
 392          if YoutubeIE.suitable(video_url):  
 393              return self.url_result(video_url, 'Youtube')  
 395          # here's a fun little line of code for you:  
 396          video_id = os.path.splitext(video_id)[0]  
 401              'uploader': video_uploader,  
 402              'title': video_title,