2 from __future__ 
import unicode_literals
 
   7 from .common 
import InfoExtractor
 
  10     compat_urllib_parse_unquote
, 
  21 class PolskieRadioIE(InfoExtractor
): 
  22     _VALID_URL 
= r
'https?://(?:www\.)?polskieradio\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)' 
  24         'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943,Prof-Andrzej-Nowak-o-historii-nie-da-sie-myslec-beznamietnie', 
  27             'title': 'Prof. Andrzej Nowak: o historii nie da się myśleć beznamiętnie', 
  28             'description': 'md5:12f954edbf3120c5e7075e17bf9fc5c5', 
  31             'md5': '2984ee6ce9046d91fc233bc1a864a09a', 
  35                 'title': 'md5:d4623290d4ac983bf924061c75c23a0d', 
  36                 'timestamp': 1456594200, 
  37                 'upload_date': '20160227', 
  39                 'thumbnail': r
're:^https?://static\.prsa\.pl/images/.*\.jpg$' 
  43         'url': 'http://www.polskieradio.pl/265/5217/Artykul/1635803,Euro-2016-nie-ma-miejsca-na-blad-Polacy-graja-ze-Szwajcaria-o-cwiercfinal', 
  46             'title': 'Euro 2016: nie ma miejsca na błąd. Polacy grają ze Szwajcarią o ćwierćfinał', 
  47             'description': 'md5:01cb7d0cad58664095d72b51a1ebada2', 
  49         'playlist_mincount': 12, 
  51         'url': 'http://polskieradio.pl/9/305/Artykul/1632955,Bardzo-popularne-slowo-remis', 
  52         'only_matching': True, 
  54         'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943', 
  55         'only_matching': True, 
  58         'url': 'http://www.polskieradio.pl/9/299/Artykul/1634903,Brexit-Leszek-Miller-swiat-sie-nie-zawali-Europa-bedzie-trwac-dalej', 
  59         'only_matching': True, 
  62     def _real_extract(self
, url
): 
  63         playlist_id 
= self
._match
_id
(url
) 
  65         webpage 
= self
._download
_webpage
(url
, playlist_id
) 
  67         content 
= self
._search
_regex
( 
  68             r
'(?s)<div[^>]+class="\s*this-article\s*"[^>]*>(.+?)<div[^>]+class="tags"[^>]*>', 
  71         timestamp 
= unified_timestamp(self
._html
_search
_regex
( 
  72             r
'(?s)<span[^>]+id="datetime2"[^>]*>(.+?)</span>', 
  73             webpage
, 'timestamp', fatal
=False)) 
  75         thumbnail_url 
= self
._og
_search
_thumbnail
(webpage
) 
  81         for data_media 
in re
.findall(r
'<[^>]+data-media=({[^>]+})', content
): 
  82             media 
= self
._parse
_json
(data_media
, playlist_id
, fatal
=False) 
  83             if not media
.get('file') or not media
.get('desc'): 
  85             media_url 
= self
._proto
_relative
_url
(media
['file'], 'http:') 
  86             if media_url 
in media_urls
: 
  88             media_urls
.add(media_url
) 
  90                 'id': compat_str(media
['id']), 
  92                 'title': compat_urllib_parse_unquote(media
['desc']), 
  93                 'duration': int_or_none(media
.get('length')), 
  94                 'vcodec': 'none' if media
.get('provider') == 'audio' else None, 
  95                 'timestamp': timestamp
, 
  96                 'thumbnail': thumbnail_url
 
  99         title 
= self
._og
_search
_title
(webpage
).strip() 
 100         description 
= strip_or_none(self
._og
_search
_description
(webpage
)) 
 102         return self
.playlist_result(entries
, playlist_id
, title
, description
) 
 105 class PolskieRadioCategoryIE(InfoExtractor
): 
 106     _VALID_URL 
= r
'https?://(?:www\.)?polskieradio\.pl/\d+(?:,[^/]+)?/(?P<id>\d+)' 
 108         'url': 'http://www.polskieradio.pl/7/5102,HISTORIA-ZYWA', 
 111             'title': 'HISTORIA ŻYWA', 
 113         'playlist_mincount': 38, 
 115         'url': 'http://www.polskieradio.pl/7/4807', 
 118             'title': 'Vademecum 1050. rocznicy Chrztu Polski' 
 120         'playlist_mincount': 5 
 122         'url': 'http://www.polskieradio.pl/7/129,Sygnaly-dnia?ref=source', 
 123         'only_matching': True 
 125         'url': 'http://www.polskieradio.pl/37,RedakcjaKatolicka/4143,Kierunek-Krakow', 
 128             'title': 'Kierunek Kraków', 
 130         'playlist_mincount': 61 
 132         'url': 'http://www.polskieradio.pl/10,czworka/214,muzyka', 
 137         'playlist_mincount': 61 
 139         'url': 'http://www.polskieradio.pl/7,Jedynka/5102,HISTORIA-ZYWA', 
 140         'only_matching': True, 
 142         'url': 'http://www.polskieradio.pl/8,Dwojka/196,Publicystyka', 
 143         'only_matching': True, 
 147     def suitable(cls
, url
): 
 148         return False if PolskieRadioIE
.suitable(url
) else super(PolskieRadioCategoryIE
, cls
).suitable(url
) 
 150     def _entries(self
, url
, page
, category_id
): 
 152         for page_num 
in itertools
.count(2): 
 153             for a_entry
, entry_id 
in re
.findall( 
 154                     r
'(?s)<article[^>]+>.*?(<a[^>]+href=["\']/\d
+/\d
+/Artykul
/(\d
+)[^
>]+>).*?
</article
>', 
 156                 entry = extract_attributes(a_entry) 
 157                 href = entry.get('href
') 
 160                 yield self.url_result( 
 161                     compat_urlparse.urljoin(url, href), PolskieRadioIE.ie_key(), 
 162                     entry_id, entry.get('title
')) 
 164                 r'<div
[^
>]+class=["\']next["\'][^
>]*>\s
*<a
[^
>]+href
=(["\'])(?P<url>(?:(?!\1).)+)\1', 
 168             next_url = compat_urlparse.urljoin(url, mobj.group('url')) 
 169             content = self._download_webpage( 
 170                 next_url, category_id, 'Downloading page %s' % page_num) 
 172     def _real_extract(self, url): 
 173         category_id = self._match_id(url) 
 174         webpage = self._download_webpage(url, category_id) 
 175         title = self._html_search_regex( 
 176             r'<title>([^<]+) - [^<]+ - [^<]+</title>', 
 177             webpage, 'title', fatal=False) 
 178         return self.playlist_result( 
 179             self._entries(url, webpage, category_id),