]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/heise.py
   2 from __future__ 
import unicode_literals
 
   4 from .common 
import InfoExtractor
 
   5 from .kaltura 
import KalturaIE
 
   6 from .youtube 
import YoutubeIE
 
  17 class HeiseIE(InfoExtractor
): 
  18     _VALID_URL 
= r
'https?://(?:www\.)?heise\.de/(?:[^/]+/)+[^/]+-(?P<id>[0-9]+)\.html' 
  21         'url': 'http://www.heise.de/video/artikel/Podcast-c-t-uplink-3-3-Owncloud-Tastaturen-Peilsender-Smartphone-2404147.html', 
  25             'title': "Podcast: c't uplink 3.3 – Owncloud / Tastaturen / Peilsender Smartphone", 
  26             'timestamp': 1512734959, 
  27             'upload_date': '20171208', 
  28             'description': 'md5:c934cbfb326c669c2bcabcbe3d3fcd20', 
  31             'skip_download': True, 
  35         'url': 'http://www.heise.de/newsticker/meldung/Netflix-In-20-Jahren-vom-Videoverleih-zum-TV-Revolutionaer-3814130.html', 
  36         'md5': 'e403d2b43fea8e405e88e3f8623909f1', 
  40             'title': 'NEU IM SEPTEMBER | Netflix', 
  41             'description': 'md5:2131f3c7525e540d5fd841de938bd452', 
  42             'upload_date': '20170830', 
  43             'uploader': 'Netflix Deutschland, Österreich und Schweiz', 
  44             'uploader_id': 'netflixdach', 
  47             'skip_download': True, 
  50         'url': 'https://www.heise.de/video/artikel/nachgehakt-Wie-sichert-das-c-t-Tool-Restric-tor-Windows-10-ab-3700244.html', 
  54             'title': "nachgehakt: Wie sichert das c't-Tool Restric'tor Windows 10 ab?", 
  55             'description': 'md5:47e8ffb6c46d85c92c310a512d6db271', 
  56             'timestamp': 1512470717, 
  57             'upload_date': '20171205', 
  60             'skip_download': True, 
  63         'url': 'https://www.heise.de/ct/artikel/c-t-uplink-20-8-Staubsaugerroboter-Xiaomi-Vacuum-2-AR-Brille-Meta-2-und-Android-rooten-3959893.html', 
  67             'title': "c't uplink 20.8: Staubsaugerroboter Xiaomi Vacuum 2, AR-Brille Meta 2 und Android rooten", 
  68             'description': 'md5:f50fe044d3371ec73a8f79fcebd74afc', 
  69             'timestamp': 1517567237, 
  70             'upload_date': '20180202', 
  73             'skip_download': True, 
  76         'url': 'http://www.heise.de/ct/artikel/c-t-uplink-3-3-Owncloud-Tastaturen-Peilsender-Smartphone-2403911.html', 
  77         'only_matching': True, 
  79         'url': 'http://www.heise.de/newsticker/meldung/c-t-uplink-Owncloud-Tastaturen-Peilsender-Smartphone-2404251.html?wt_mc=rss.ho.beitrag.atom', 
  80         'only_matching': True, 
  82         'url': 'http://www.heise.de/ct/ausgabe/2016-12-Spiele-3214137.html', 
  83         'only_matching': True, 
  86     def _real_extract(self
, url
): 
  87         video_id 
= self
._match
_id
(url
) 
  88         webpage 
= self
._download
_webpage
(url
, video_id
) 
  90         def extract_title(default
=NO_DEFAULT
): 
  91             title 
= self
._html
_search
_meta
( 
  92                 ('fulltitle', 'title'), webpage
, default
=None) 
  93             if not title 
or title 
== "c't": 
  94                 title 
= self
._search
_regex
( 
  95                     r
'<div[^>]+class="videoplayerjw"[^>]+data-title="([^"]+)"', 
  96                     webpage
, 'title', default
=None) 
  98                 title 
= self
._html
_search
_regex
( 
  99                     r
'<h1[^>]+\bclass=["\']article_page_title
[^
>]+>(.+?
)<', 
 100                     webpage, 'title
', default=default) 
 103         title = extract_title(default=None) 
 104         description = self._og_search_description( 
 105             webpage, default=None) or self._html_search_meta( 
 106             'description
', webpage) 
 108         def _make_kaltura_result(kaltura_url): 
 110                 '_type
': 'url_transparent
', 
 111                 'url
': smuggle_url(kaltura_url, {'source_url
': url}), 
 112                 'ie_key
': KalturaIE.ie_key(), 
 114                 'description
': description, 
 117         kaltura_url = KalturaIE._extract_url(webpage) 
 119             return _make_kaltura_result(kaltura_url) 
 121         kaltura_id = self._search_regex( 
 122             r'entry
-id=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'kaltura id', 
 123             default=None, group='id') 
 125             return _make_kaltura_result('kaltura:2238431:%s' % kaltura_id) 
 127         yt_urls = YoutubeIE._extract_urls(webpage) 
 129             return self.playlist_from_matches( 
 130                 yt_urls, video_id, title, ie=YoutubeIE.ie_key()) 
 132         title = extract_title() 
 134         container_id = self._search_regex( 
 135             r'<div class="videoplayerjw
"[^>]+data-container="([0-9]+)"', 
 136             webpage, 'container ID') 
 138         sequenz_id = self._search_regex( 
 139             r'<div class="videoplayerjw
"[^>]+data-sequenz="([0-9]+)"', 
 140             webpage, 'sequenz ID') 
 142         doc = self._download_xml( 
 143             'http://www.heise.de/videout/feed', video_id, query={ 
 144                 'container': container_id, 
 145                 'sequenz': sequenz_id, 
 149         for source_node in doc.findall('.//{http://rss.jwpcdn.com/}source'): 
 150             label = source_node.attrib['label'] 
 151             height = int_or_none(self._search_regex( 
 152                 r'^(.*?_)?([0-9]+)p$', label, 'height', default=None)) 
 153             video_url = source_node.attrib['file'] 
 154             ext = determine_ext(video_url, '') 
 157                 'format_note': label, 
 158                 'format_id': '%s_%s' % (ext, label), 
 161         self._sort_formats(formats) 
 166             'description': description, 
 167             'thumbnail': (xpath_text(doc, './/{http://rss.jwpcdn.com/}image') 
 168                           or self._og_search_thumbnail(webpage)), 
 169             'timestamp': parse_iso8601( 
 170                 self._html_search_meta('date', webpage)),