]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/heise.py
   2 from __future__ 
import unicode_literals
 
   4 from .common 
import InfoExtractor
 
   5 from .kaltura 
import KalturaIE
 
   6 from .youtube 
import YoutubeIE
 
  17 class HeiseIE(InfoExtractor
): 
  18     _VALID_URL 
= r
'https?://(?:www\.)?heise\.de/(?:[^/]+/)+[^/]+-(?P<id>[0-9]+)\.html' 
  21         'url': 'http://www.heise.de/video/artikel/Podcast-c-t-uplink-3-3-Owncloud-Tastaturen-Peilsender-Smartphone-2404147.html', 
  25             'title': "Podcast: c't uplink 3.3 – Owncloud / Tastaturen / Peilsender Smartphone", 
  26             'timestamp': 1512734959, 
  27             'upload_date': '20171208', 
  28             'description': 'md5:c934cbfb326c669c2bcabcbe3d3fcd20', 
  31             'skip_download': True, 
  35         'url': 'http://www.heise.de/newsticker/meldung/Netflix-In-20-Jahren-vom-Videoverleih-zum-TV-Revolutionaer-3814130.html', 
  36         'md5': 'e403d2b43fea8e405e88e3f8623909f1', 
  40             'title': 'NEU IM SEPTEMBER | Netflix', 
  41             'description': 'md5:2131f3c7525e540d5fd841de938bd452', 
  42             'upload_date': '20170830', 
  43             'uploader': 'Netflix Deutschland, Österreich und Schweiz', 
  44             'uploader_id': 'netflixdach', 
  47             'skip_download': True, 
  50         'url': 'https://www.heise.de/video/artikel/nachgehakt-Wie-sichert-das-c-t-Tool-Restric-tor-Windows-10-ab-3700244.html', 
  54             'title': "nachgehakt: Wie sichert das c't-Tool Restric'tor Windows 10 ab?", 
  55             'description': 'md5:47e8ffb6c46d85c92c310a512d6db271', 
  56             'timestamp': 1512470717, 
  57             'upload_date': '20171205', 
  60             'skip_download': True, 
  63         'url': 'https://www.heise.de/ct/artikel/c-t-uplink-20-8-Staubsaugerroboter-Xiaomi-Vacuum-2-AR-Brille-Meta-2-und-Android-rooten-3959893.html', 
  67             'title': "c't uplink 20.8: Staubsaugerroboter Xiaomi Vacuum 2, AR-Brille Meta 2 und Android rooten", 
  68             'description': 'md5:f50fe044d3371ec73a8f79fcebd74afc', 
  69             'timestamp': 1517567237, 
  70             'upload_date': '20180202', 
  73             'skip_download': True, 
  76         'url': 'http://www.heise.de/ct/artikel/c-t-uplink-3-3-Owncloud-Tastaturen-Peilsender-Smartphone-2403911.html', 
  77         'only_matching': True, 
  79         'url': 'http://www.heise.de/newsticker/meldung/c-t-uplink-Owncloud-Tastaturen-Peilsender-Smartphone-2404251.html?wt_mc=rss.ho.beitrag.atom', 
  80         'only_matching': True, 
  82         'url': 'http://www.heise.de/ct/ausgabe/2016-12-Spiele-3214137.html', 
  83         'only_matching': True, 
  86     def _real_extract(self
, url
): 
  87         video_id 
= self
._match
_id
(url
) 
  88         webpage 
= self
._download
_webpage
(url
, video_id
) 
  90         def extract_title(default
=NO_DEFAULT
): 
  91             title 
= self
._html
_search
_meta
( 
  92                 ('fulltitle', 'title'), webpage
, default
=None) 
  93             if not title 
or title 
== "c't": 
  94                 title 
= self
._search
_regex
( 
  95                     r
'<div[^>]+class="videoplayerjw"[^>]+data-title="([^"]+)"', 
  96                     webpage
, 'title', default
=None) 
  98                 title 
= self
._html
_search
_regex
( 
  99                     r
'<h1[^>]+\bclass=["\']article_page_title
[^
>]+>(.+?
)<', 
 100                     webpage, 'title
', default=default) 
 103         title = extract_title(default=None) 
 104         description = self._og_search_description( 
 105             webpage, default=None) or self._html_search_meta( 
 106             'description
', webpage) 
 108         kaltura_url = KalturaIE._extract_url(webpage) 
 111                 '_type
': 'url_transparent
', 
 112                 'url
': smuggle_url(kaltura_url, {'source_url
': url}), 
 113                 'ie_key
': KalturaIE.ie_key(), 
 115                 'description
': description, 
 118         yt_urls = YoutubeIE._extract_urls(webpage) 
 120             return self.playlist_from_matches( 
 121                 yt_urls, video_id, title, ie=YoutubeIE.ie_key()) 
 123         title = extract_title() 
 125         container_id = self._search_regex( 
 126             r'<div 
class="videoplayerjw"[^
>]+data
-container
="([0-9]+)"', 
 127             webpage, 'container ID
') 
 129         sequenz_id = self._search_regex( 
 130             r'<div 
class="videoplayerjw"[^
>]+data
-sequenz
="([0-9]+)"', 
 131             webpage, 'sequenz ID
') 
 133         doc = self._download_xml( 
 134             'http
://www
.heise
.de
/videout
/feed
', video_id, query={ 
 135                 'container
': container_id, 
 136                 'sequenz
': sequenz_id, 
 140         for source_node in doc.findall('.//{http
://rss
.jwpcdn
.com
/}source
'): 
 141             label = source_node.attrib['label
'] 
 142             height = int_or_none(self._search_regex( 
 143                 r'^
(.*?_
)?
([0-9]+)p$
', label, 'height
', default=None)) 
 144             video_url = source_node.attrib['file'] 
 145             ext = determine_ext(video_url, '') 
 148                 'format_note
': label, 
 149                 'format_id
': '%s_%s' % (ext, label), 
 152         self._sort_formats(formats) 
 157             'description
': description, 
 158             'thumbnail
': (xpath_text(doc, './/{http
://rss
.jwpcdn
.com
/}image
') or 
 159                           self._og_search_thumbnail(webpage)), 
 160             'timestamp
': parse_iso8601( 
 161                 self._html_search_meta('date
', webpage)),