]> Raphaƫl G. Git Repositories - youtubedl/blobdiff - youtube_dl/extractor/generic.py
debian/control: Update list of extractors.
[youtubedl] / youtube_dl / extractor / generic.py
index c7552fddb587a60454bec6faa174c36bd4aa9a4a..10ae06263ef1349ff6526575feb61cb71591f3cc 100644 (file)
@@ -162,6 +162,16 @@ class GenericIE(InfoExtractor):
             raise ExtractorError(u'Failed to download URL: %s' % url)
 
         self.report_extraction(video_id)
             raise ExtractorError(u'Failed to download URL: %s' % url)
 
         self.report_extraction(video_id)
+
+        # it's tempting to parse this further, but you would
+        # have to take into account all the variations like
+        #   Video Title - Site Name
+        #   Site Name | Video Title
+        #   Video Title - Tagline | Site Name
+        # and so on and so forth; it's just not practical
+        video_title = self._html_search_regex(r'<title>(.*)</title>',
+            webpage, u'video title', default=u'video', flags=re.DOTALL)
+
         # Look for BrightCove:
         bc_url = BrightcoveIE._extract_brightcove_url(webpage)
         if bc_url is not None:
         # Look for BrightCove:
         bc_url = BrightcoveIE._extract_brightcove_url(webpage)
         if bc_url is not None:
@@ -177,17 +187,29 @@ class GenericIE(InfoExtractor):
             return self.url_result(surl, 'Vimeo')
 
         # Look for embedded YouTube player
             return self.url_result(surl, 'Vimeo')
 
         # Look for embedded YouTube player
-        mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?youtube.com/embed/.+?)\1', webpage)
-        if mobj:
-            surl = unescapeHTML(mobj.group(u'url'))
-            return self.url_result(surl, 'Youtube')
+        matches = re.findall(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?youtube.com/embed/.+?)\1', webpage)
+        if matches:
+            urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Youtube')
+                     for tuppl in matches]
+            return self.playlist_result(
+                urlrs, playlist_id=video_id, playlist_title=video_title)
+
+        # Look for embedded Dailymotion player
+        matches = re.findall(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion.com/embed/video/.+?)\1', webpage)
+        if matches:
+            urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Dailymotion')
+                     for tuppl in matches]
+            return self.playlist_result(
+                urlrs, playlist_id=video_id, playlist_title=video_title)
 
         # Look for Bandcamp pages with custom domain
         mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
         if mobj is not None:
             burl = unescapeHTML(mobj.group(1))
 
         # Look for Bandcamp pages with custom domain
         mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
         if mobj is not None:
             burl = unescapeHTML(mobj.group(1))
-            return self.url_result(burl, 'Bandcamp')
+            # Don't set the extractor because it can be a track url or an album
+            return self.url_result(burl)
 
         # Start with something easy: JW Player in SWFObject
         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
 
         # Start with something easy: JW Player in SWFObject
         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
@@ -196,7 +218,7 @@ class GenericIE(InfoExtractor):
             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
         if mobj is None:
             # Broaden the search a little bit: JWPlayer JS loader
             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
         if mobj is None:
             # Broaden the search a little bit: JWPlayer JS loader
-            mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http[^\'"&]*)', webpage)
+            mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http[^\'"]*)', webpage)
         if mobj is None:
             # Try to find twitter cards info
             mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
         if mobj is None:
             # Try to find twitter cards info
             mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
@@ -223,27 +245,16 @@ class GenericIE(InfoExtractor):
         video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
 
         # here's a fun little line of code for you:
         video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
 
         # here's a fun little line of code for you:
-        video_extension = os.path.splitext(video_id)[1][1:]
         video_id = os.path.splitext(video_id)[0]
 
         video_id = os.path.splitext(video_id)[0]
 
-        # it's tempting to parse this further, but you would
-        # have to take into account all the variations like
-        #   Video Title - Site Name
-        #   Site Name | Video Title
-        #   Video Title - Tagline | Site Name
-        # and so on and so forth; it's just not practical
-        video_title = self._html_search_regex(r'<title>(.*)</title>',
-            webpage, u'video title', default=u'video', flags=re.DOTALL)
-
         # video uploader is domain name
         video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
             url, u'video uploader')
 
         # video uploader is domain name
         video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
             url, u'video uploader')
 
-        return [{
+        return {
             'id':       video_id,
             'url':      video_url,
             'uploader': video_uploader,
             'upload_date':  None,
             'title':    video_title,
             'id':       video_id,
             'url':      video_url,
             'uploader': video_uploader,
             'upload_date':  None,
             'title':    video_title,
-            'ext':      video_extension,
-        }]
+        }