Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/orf.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import json
   5 import re
   6
   7 from .common import InfoExtractor
   8 from ..utils import (
   9     HEADRequest,
  10     unified_strdate,
  11     ExtractorError,
  12 )
  13
  14
  15 class ORFIE(InfoExtractor):
  16     _VALID_URL = r'https?://tvthek\.orf\.at/(?:programs/.+?/episodes|topics/.+?|program/[^/]+)/(?P<id>\d+)'
  17
  18     _TEST = {
  19         'url': 'http://tvthek.orf.at/program/matinee-Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7317210/Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7319746/Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7319747',
  20         'file': '7319747.mp4',
  21         'md5': 'bd803c5d8c32d3c64a0ea4b4eeddf375',
  22         'info_dict': {
  23             'title': 'Was Sie schon immer über Klassik wissen wollten',
  24             'description': 'md5:0ddf0d5f0060bd53f744edaa5c2e04a4',
  25             'duration': 3508,
  26             'upload_date': '20140105',
  27         },
  28         'skip': 'Blocked outside of Austria',
  29     }
  30
  31     def _real_extract(self, url):
  32         mobj = re.match(self._VALID_URL, url)
  33         playlist_id = mobj.group('id')
  34         webpage = self._download_webpage(url, playlist_id)
  35
  36         data_json = self._search_regex(
  37             r'initializeAdworx\((.+?)\);\n', webpage, 'video info')
  38         all_data = json.loads(data_json)
  39
  40         def get_segments(all_data):
  41             for data in all_data:
  42                 if data['name'] == 'Tracker::EPISODE_DETAIL_PAGE_OVER_PROGRAM':
  43                     return data['values']['segments']
  44
  45         sdata = get_segments(all_data)
  46         if not sdata:
  47             raise ExtractorError('Unable to extract segments')
  48
  49         def quality_to_int(s):
  50             m = re.search('([0-9]+)', s)
  51             if m is None:
  52                 return -1
  53             return int(m.group(1))
  54
  55         entries = []
  56         for sd in sdata:
  57             video_id = sd['id']
  58             formats = [{
  59                 'preference': -10 if fd['delivery'] == 'hls' else None,
  60                 'format_id': '%s-%s-%s' % (
  61                     fd['delivery'], fd['quality'], fd['quality_string']),
  62                 'url': fd['src'],
  63                 'protocol': fd['protocol'],
  64                 'quality': quality_to_int(fd['quality']),
  65             } for fd in sd['playlist_item_array']['sources']]
  66
  67             # Check for geoblocking.
  68             # There is a property is_geoprotection, but that's always false
  69             geo_str = sd.get('geoprotection_string')
  70             if geo_str:
  71                 try:
  72                     http_url = next(
  73                         f['url']
  74                         for f in formats
  75                         if re.match(r'^https?://.*\.mp4$', f['url']))
  76                 except StopIteration:
  77                     pass
  78                 else:
  79                     req = HEADRequest(http_url)
  80                     self._request_webpage(
  81                         req, video_id,
  82                         note='Testing for geoblocking',
  83                         errnote=((
  84                             'This video seems to be blocked outside of %s. '
  85                             'You may want to try the streaming-* formats.')
  86                             % geo_str),
  87                         fatal=False)
  88
  89             self._sort_formats(formats)
  90
  91             upload_date = unified_strdate(sd['created_date'])
  92             entries.append({
  93                 '_type': 'video',
  94                 'id': video_id,
  95                 'title': sd['header'],
  96                 'formats': formats,
  97                 'description': sd.get('description'),
  98                 'duration': int(sd['duration_in_seconds']),
  99                 'upload_date': upload_date,
 100                 'thumbnail': sd.get('image_full_url'),
 101             })
 102
 103         return {
 104             '_type': 'playlist',
 105             'entries': entries,
 106             'id': playlist_id,
 107         }