Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/chirbit.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5
   6 from .common import InfoExtractor
   7 from ..utils import parse_duration
   8
   9
  10 class ChirbitIE(InfoExtractor):
  11     IE_NAME = 'chirbit'
  12     _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?:(?:wp|pl)/|fb_chirbit_player\.swf\?key=)?(?P<id>[\da-zA-Z]+)'
  13     _TESTS = [{
  14         'url': 'http://chirb.it/be2abG',
  15         'info_dict': {
  16             'id': 'be2abG',
  17             'ext': 'mp3',
  18             'title': 'md5:f542ea253f5255240be4da375c6a5d7e',
  19             'description': 'md5:f24a4e22a71763e32da5fed59e47c770',
  20             'duration': 306,
  21         },
  22         'params': {
  23             'skip_download': True,
  24         }
  25     }, {
  26         'url': 'https://chirb.it/fb_chirbit_player.swf?key=PrIPv5',
  27         'only_matching': True,
  28     }, {
  29         'url': 'https://chirb.it/wp/MN58c2',
  30         'only_matching': True,
  31     }]
  32
  33     def _real_extract(self, url):
  34         audio_id = self._match_id(url)
  35
  36         webpage = self._download_webpage(
  37             'http://chirb.it/%s' % audio_id, audio_id)
  38
  39         data_fd = self._search_regex(
  40             r'data-fd=(["\'])(?P<url>(?:(?!\1).)+)\1',
  41             webpage, 'data fd', group='url')
  42
  43         # Reverse engineered from https://chirb.it/js/chirbit.player.js (look
  44         # for soundURL)
  45         audio_url = base64.b64decode(
  46             data_fd[::-1].encode('ascii')).decode('utf-8')
  47
  48         title = self._search_regex(
  49             r'class=["\']chirbit-title["\'][^>]*>([^<]+)', webpage, 'title')
  50         description = self._search_regex(
  51             r'<h3>Description</h3>\s*<pre[^>]*>([^<]+)</pre>',
  52             webpage, 'description', default=None)
  53         duration = parse_duration(self._search_regex(
  54             r'class=["\']c-length["\'][^>]*>([^<]+)',
  55             webpage, 'duration', fatal=False))
  56
  57         return {
  58             'id': audio_id,
  59             'url': audio_url,
  60             'title': title,
  61             'description': description,
  62             'duration': duration,
  63         }
  64
  65
  66 class ChirbitProfileIE(InfoExtractor):
  67     IE_NAME = 'chirbit:profile'
  68     _VALID_URL = r'https?://(?:www\.)?chirbit.com/(?:rss/)?(?P<id>[^/]+)'
  69     _TEST = {
  70         'url': 'http://chirbit.com/ScarletBeauty',
  71         'info_dict': {
  72             'id': 'ScarletBeauty',
  73             'title': 'Chirbits by ScarletBeauty',
  74         },
  75         'playlist_mincount': 3,
  76     }
  77
  78     def _real_extract(self, url):
  79         profile_id = self._match_id(url)
  80
  81         rss = self._download_xml(
  82             'http://chirbit.com/rss/%s' % profile_id, profile_id)
  83
  84         entries = [
  85             self.url_result(audio_url.text, 'Chirbit')
  86             for audio_url in rss.findall('./channel/item/link')]
  87
  88         title = rss.find('./channel/title').text
  89
  90         return self.playlist_result(entries, profile_id, title)