Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/chirbit.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import re
   6
   7 from .common import InfoExtractor
   8 from ..utils import parse_duration
   9
  10
  11 class ChirbitIE(InfoExtractor):
  12     IE_NAME = 'chirbit'
  13     _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?:(?:wp|pl)/|fb_chirbit_player\.swf\?key=)?(?P<id>[\da-zA-Z]+)'
  14     _TESTS = [{
  15         'url': 'http://chirb.it/be2abG',
  16         'info_dict': {
  17             'id': 'be2abG',
  18             'ext': 'mp3',
  19             'title': 'md5:f542ea253f5255240be4da375c6a5d7e',
  20             'description': 'md5:f24a4e22a71763e32da5fed59e47c770',
  21             'duration': 306,
  22             'uploader': 'Gerryaudio',
  23         },
  24         'params': {
  25             'skip_download': True,
  26         }
  27     }, {
  28         'url': 'https://chirb.it/fb_chirbit_player.swf?key=PrIPv5',
  29         'only_matching': True,
  30     }, {
  31         'url': 'https://chirb.it/wp/MN58c2',
  32         'only_matching': True,
  33     }]
  34
  35     def _real_extract(self, url):
  36         audio_id = self._match_id(url)
  37
  38         webpage = self._download_webpage(
  39             'http://chirb.it/%s' % audio_id, audio_id)
  40
  41         data_fd = self._search_regex(
  42             r'data-fd=(["\'])(?P<url>(?:(?!\1).)+)\1',
  43             webpage, 'data fd', group='url')
  44
  45         # Reverse engineered from https://chirb.it/js/chirbit.player.js (look
  46         # for soundURL)
  47         audio_url = base64.b64decode(
  48             data_fd[::-1].encode('ascii')).decode('utf-8')
  49
  50         title = self._search_regex(
  51             r'class=["\']chirbit-title["\'][^>]*>([^<]+)', webpage, 'title')
  52         description = self._search_regex(
  53             r'<h3>Description</h3>\s*<pre[^>]*>([^<]+)</pre>',
  54             webpage, 'description', default=None)
  55         duration = parse_duration(self._search_regex(
  56             r'class=["\']c-length["\'][^>]*>([^<]+)',
  57             webpage, 'duration', fatal=False))
  58         uploader = self._search_regex(
  59             r'id=["\']chirbit-username["\'][^>]*>([^<]+)',
  60             webpage, 'uploader', fatal=False)
  61
  62         return {
  63             'id': audio_id,
  64             'url': audio_url,
  65             'title': title,
  66             'description': description,
  67             'duration': duration,
  68             'uploader': uploader,
  69         }
  70
  71
  72 class ChirbitProfileIE(InfoExtractor):
  73     IE_NAME = 'chirbit:profile'
  74     _VALID_URL = r'https?://(?:www\.)?chirbit\.com/(?:rss/)?(?P<id>[^/]+)'
  75     _TEST = {
  76         'url': 'http://chirbit.com/ScarletBeauty',
  77         'info_dict': {
  78             'id': 'ScarletBeauty',
  79         },
  80         'playlist_mincount': 3,
  81     }
  82
  83     def _real_extract(self, url):
  84         profile_id = self._match_id(url)
  85
  86         webpage = self._download_webpage(url, profile_id)
  87
  88         entries = [
  89             self.url_result(self._proto_relative_url('//chirb.it/' + video_id))
  90             for _, video_id in re.findall(r'<input[^>]+id=([\'"])copy-btn-(?P<id>[0-9a-zA-Z]+)\1', webpage)]
  91
  92         return self.playlist_result(entries, profile_id)