New upstream version 2017.10.15.1

[youtubedl] / youtube_dl / extractor / bandcamp.py
diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py

index 489d0ba53f672363213c7f788e83b692eb11894d..be41bd5a22477fce2aca4a043799574e148fdc57 100644 (file)
--- a/youtube_dl/extractor/bandcamp.py
+++ b/youtube_dl/extractor/bandcamp.py
@@ -14,14 +14,16 @@ from ..utils import (
      ExtractorError,
      float_or_none,
      int_or_none,
      ExtractorError,
      float_or_none,
      int_or_none,
+    KNOWN_EXTENSIONS,
      parse_filesize,
      unescapeHTML,
      update_url_query,
      parse_filesize,
      unescapeHTML,
      update_url_query,
+    unified_strdate,
  )
  
  
  class BandcampIE(InfoExtractor):
  )
  
  
  class BandcampIE(InfoExtractor):
-    _VALID_URL = r'https?://.*?\.bandcamp\.com/track/(?P<title>.*)'
+    _VALID_URL = r'https?://.*?\.bandcamp\.com/track/(?P<title>[^/?#&]+)'
      _TESTS = [{
          'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
          'md5': 'c557841d5e50261777a6585648adf439',
      _TESTS = [{
          'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
          'md5': 'c557841d5e50261777a6585648adf439',
@@ -155,7 +157,7 @@ class BandcampIE(InfoExtractor):
  
  class BandcampAlbumIE(InfoExtractor):
      IE_NAME = 'Bandcamp:album'
  
  class BandcampAlbumIE(InfoExtractor):
      IE_NAME = 'Bandcamp:album'
-    _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^?#]+)|/?(?:$|[?#]))'
+    _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?'
  
      _TESTS = [{
          'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
  
      _TESTS = [{
          'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
@@ -222,6 +224,12 @@ class BandcampAlbumIE(InfoExtractor):
          'playlist_count': 2,
      }]
  
          'playlist_count': 2,
      }]
  
+    @classmethod
+    def suitable(cls, url):
+        """Return whether this extractor should handle `url`.
+
+        URLs accepted by BandcampWeeklyIE or BandcampIE are always rejected
+        here; for every other URL the inherited `suitable()` check decides.
+        """
+        # NOTE(review): presumably needed because the `/album/...` part of
+        # this class's _VALID_URL is optional, so the pattern alone would
+        # also accept track and weekly-show URLs - confirm.
+        return (False
+                if BandcampWeeklyIE.suitable(url) or BandcampIE.suitable(url)
+                else super(BandcampAlbumIE, cls).suitable(url))
+
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
          uploader_id = mobj.group('subdomain')
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
          uploader_id = mobj.group('subdomain')
@@ -234,7 +242,12 @@ class BandcampAlbumIE(InfoExtractor):
              raise ExtractorError('The page doesn\'t contain any tracks')
          # Only tracks with duration info have songs
          entries = [
              raise ExtractorError('The page doesn\'t contain any tracks')
          # Only tracks with duration info have songs
          entries = [
-            self.url_result(compat_urlparse.urljoin(url, t_path), ie=BandcampIE.ie_key())
+            self.url_result(
+                compat_urlparse.urljoin(url, t_path),
+                ie=BandcampIE.ie_key(),
+                video_title=self._search_regex(
+                    r'<span\b[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)',
+                    elem_content, 'track title', fatal=False))
              for elem_content, t_path in track_elements
              if self._html_search_meta('duration', elem_content, default=None)]
  
              for elem_content, t_path in track_elements
              if self._html_search_meta('duration', elem_content, default=None)]
  
@@ -250,3 +263,92 @@ class BandcampAlbumIE(InfoExtractor):
              'title': title,
              'entries': entries,
          }
              'title': title,
              'entries': entries,
          }
+
+
+class BandcampWeeklyIE(InfoExtractor):
+    """Extractor for Bandcamp Weekly shows.
+
+    Handles bandcamp.com URLs whose query string carries a numeric
+    `show=<id>` parameter.
+    """
+    IE_NAME = 'Bandcamp:weekly'
+    _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://bandcamp.com/?show=224',
+        'md5': 'b00df799c733cf7e0c567ed187dea0fd',
+        'info_dict': {
+            'id': '224',
+            'ext': 'opus',
+            'title': 'BC Weekly April 4th 2017 - Magic Moments',
+            'description': 'md5:5d48150916e8e02d030623a48512c874',
+            'duration': 5829.77,
+            'release_date': '20170404',
+            'series': 'Bandcamp Weekly',
+            'episode': 'Magic Moments',
+            'episode_number': 208,
+            'episode_id': '224',
+        }
+    }, {
+        # `show` preceded by other content in the query string
+        'url': 'https://bandcamp.com/?blah/blah@&show=228',
+        'only_matching': True
+    }]
+
+    def _real_extract(self, url):
+        """Build the info dict for the show referenced by `url`.
+
+        Downloads the page, parses the JSON held in its `data-blob`
+        attribute and reads the show metadata from the blob's `bcw_show`
+        entry. One format is produced per string-valued entry of
+        `bcw_show['audio_stream']`.
+        """
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        # The blob is a quoted attribute value; unescapeHTML is applied to
+        # the captured text before it is parsed as JSON.
+        blob = self._parse_json(
+            self._search_regex(
+                r'data-blob=(["\'])(?P<blob>{.+?})\1', webpage,
+                'blob', group='blob'),
+            video_id, transform_source=unescapeHTML)
+
+        show = blob['bcw_show']
+
+        # Prefer the show id reported by the page over the one in the URL,
+        # because any invalid show id redirects to `bandcamp.com`, which
+        # happens to expose the latest Bandcamp Weekly episode.
+        show_id = int_or_none(show.get('show_id')) or int_or_none(video_id)
+
+        formats = []
+        for format_id, format_url in show['audio_stream'].items():
+            # Skip entries whose value is not a string URL.
+            if not isinstance(format_url, compat_str):
+                continue
+            # Use the first known extension that occurs as a substring of
+            # the format id; the `else` branch runs only when none matched.
+            for known_ext in KNOWN_EXTENSIONS:
+                if known_ext in format_id:
+                    ext = known_ext
+                    break
+            else:
+                ext = None
+            formats.append({
+                'format_id': format_id,
+                'url': format_url,
+                'ext': ext,
+                'vcodec': 'none',
+            })
+        self._sort_formats(formats)
+
+        # Title is the audio title (or a fixed fallback), optionally
+        # followed by " - <subtitle>".
+        title = show.get('audio_title') or 'Bandcamp Weekly'
+        subtitle = show.get('subtitle')
+        if subtitle:
+            title += ' - %s' % subtitle
+
+        episode_number = None
+        seq = blob.get('bcw_seq')
+
+        # Take the episode number from the first `bcw_seq` entry whose id
+        # equals show_id; leave it as None when no entry matches.
+        if seq and isinstance(seq, list):
+            try:
+                episode_number = next(
+                    int_or_none(e.get('episode_number'))
+                    for e in seq
+                    if isinstance(e, dict) and int_or_none(e.get('id')) == show_id)
+            except StopIteration:
+                pass
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': show.get('desc') or show.get('short_desc'),
+            'duration': float_or_none(show.get('audio_duration')),
+            'is_live': False,
+            'release_date': unified_strdate(show.get('published_date')),
+            'series': 'Bandcamp Weekly',
+            'episode': show.get('subtitle'),
+            'episode_number': episode_number,
+            'episode_id': compat_str(video_id),
+            'formats': formats
+        }