Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/chirbit.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import re
   6
   7 from .common import InfoExtractor
   8 from ..utils import parse_duration
   9
  10
  11 class ChirbitIE(InfoExtractor):
  12     IE_NAME = 'chirbit'
  13     _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?:(?:wp|pl)/|fb_chirbit_player\.swf\?key=)?(?P<id>[\da-zA-Z]+)'
  14     _TESTS = [{
  15         'url': 'http://chirb.it/be2abG',
  16         'info_dict': {
  17             'id': 'be2abG',
  18             'ext': 'mp3',
  19             'title': 'md5:f542ea253f5255240be4da375c6a5d7e',
  20             'description': 'md5:f24a4e22a71763e32da5fed59e47c770',
  21             'duration': 306,
  22         },
  23         'params': {
  24             'skip_download': True,
  25         }
  26     }, {
  27         'url': 'https://chirb.it/fb_chirbit_player.swf?key=PrIPv5',
  28         'only_matching': True,
  29     }, {
  30         'url': 'https://chirb.it/wp/MN58c2',
  31         'only_matching': True,
  32     }]
  33
  34     def _real_extract(self, url):
  35         audio_id = self._match_id(url)
  36
  37         webpage = self._download_webpage(
  38             'http://chirb.it/%s' % audio_id, audio_id)
  39
  40         data_fd = self._search_regex(
  41             r'data-fd=(["\'])(?P<url>(?:(?!\1).)+)\1',
  42             webpage, 'data fd', group='url')
  43
  44         # Reverse engineered from https://chirb.it/js/chirbit.player.js (look
  45         # for soundURL)
  46         audio_url = base64.b64decode(
  47             data_fd[::-1].encode('ascii')).decode('utf-8')
  48
  49         title = self._search_regex(
  50             r'class=["\']chirbit-title["\'][^>]*>([^<]+)', webpage, 'title')
  51         description = self._search_regex(
  52             r'<h3>Description</h3>\s*<pre[^>]*>([^<]+)</pre>',
  53             webpage, 'description', default=None)
  54         duration = parse_duration(self._search_regex(
  55             r'class=["\']c-length["\'][^>]*>([^<]+)',
  56             webpage, 'duration', fatal=False))
  57
  58         return {
  59             'id': audio_id,
  60             'url': audio_url,
  61             'title': title,
  62             'description': description,
  63             'duration': duration,
  64         }
  65
  66
  67 class ChirbitProfileIE(InfoExtractor):
  68     IE_NAME = 'chirbit:profile'
  69     _VALID_URL = r'https?://(?:www\.)?chirbit\.com/(?:rss/)?(?P<id>[^/]+)'
  70     _TEST = {
  71         'url': 'http://chirbit.com/ScarletBeauty',
  72         'info_dict': {
  73             'id': 'ScarletBeauty',
  74         },
  75         'playlist_mincount': 3,
  76     }
  77
  78     def _real_extract(self, url):
  79         profile_id = self._match_id(url)
  80
  81         webpage = self._download_webpage(url, profile_id)
  82
  83         entries = [
  84             self.url_result(self._proto_relative_url('//chirb.it/' + video_id))
  85             for _, video_id in re.findall(r'<input[^>]+id=([\'"])copy-btn-(?P<id>[0-9a-zA-Z]+)\1', webpage)]
  86
  87         return self.playlist_result(entries, profile_id)