Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/justintv.py

   1 from __future__ import unicode_literals
   2
   3 import json
   4 import os
   5 import re
   6
   7 from .common import InfoExtractor
   8 from ..utils import (
   9     compat_str,
  10     ExtractorError,
  11     formatSeconds,
  12 )
  13
  14
  15 class JustinTVIE(InfoExtractor):
  16     """Information extractor for justin.tv and twitch.tv"""
  17     # TODO: One broadcast may be split into multiple videos. The key
  18     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
  19     # starts at 1 and increases. Can we treat all parts as one video?
  20
  21     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
  22         (?:
  23             (?P<channelid>[^/]+)|
  24             (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
  25             (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
  26         )
  27         /?(?:\#.*)?$
  28         """
  29     _JUSTIN_PAGE_LIMIT = 100
  30     IE_NAME = 'justin.tv'
  31     IE_DESC = 'justin.tv and twitch.tv'
  32     _TEST = {
  33         'url': 'http://www.twitch.tv/thegamedevhub/b/296128360',
  34         'md5': 'ecaa8a790c22a40770901460af191c9a',
  35         'info_dict': {
  36             'id': '296128360',
  37             'ext': 'flv',
  38             'upload_date': '20110927',
  39             'uploader_id': 25114803,
  40             'uploader': 'thegamedevhub',
  41             'title': 'Beginner Series - Scripting With Python Pt.1'
  42         }
  43     }
  44
  45     # Return count of items, list of *valid* items
  46     def _parse_page(self, url, video_id):
  47         info_json = self._download_webpage(url, video_id,
  48                                            'Downloading video info JSON',
  49                                            'unable to download video info JSON')
  50
  51         response = json.loads(info_json)
  52         if type(response) != list:
  53             error_text = response.get('error', 'unknown error')
  54             raise ExtractorError('Justin.tv API: %s' % error_text)
  55         info = []
  56         for clip in response:
  57             video_url = clip['video_file_url']
  58             if video_url:
  59                 video_extension = os.path.splitext(video_url)[1][1:]
  60                 video_date = re.sub('-', '', clip['start_time'][:10])
  61                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
  62                 video_id = clip['id']
  63                 video_title = clip.get('title', video_id)
  64                 info.append({
  65                     'id': compat_str(video_id),
  66                     'url': video_url,
  67                     'title': video_title,
  68                     'uploader': clip.get('channel_name', video_uploader_id),
  69                     'uploader_id': video_uploader_id,
  70                     'upload_date': video_date,
  71                     'ext': video_extension,
  72                 })
  73         return (len(response), info)
  74
  75     def _real_extract(self, url):
  76         mobj = re.match(self._VALID_URL, url)
  77
  78         api_base = 'http://api.justin.tv'
  79         paged = False
  80         if mobj.group('channelid'):
  81             paged = True
  82             video_id = mobj.group('channelid')
  83             api = api_base + '/channel/archives/%s.json' % video_id
  84         elif mobj.group('chapterid'):
  85             chapter_id = mobj.group('chapterid')
  86
  87             webpage = self._download_webpage(url, chapter_id)
  88             m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
  89             if not m:
  90                 raise ExtractorError('Cannot find archive of a chapter')
  91             archive_id = m.group(1)
  92
  93             api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
  94             doc = self._download_xml(
  95                 api, chapter_id,
  96                 note='Downloading chapter information',
  97                 errnote='Chapter information download failed')
  98             for a in doc.findall('.//archive'):
  99                 if archive_id == a.find('./id').text:
 100                     break
 101             else:
 102                 raise ExtractorError('Could not find chapter in chapter information')
 103
 104             video_url = a.find('./video_file_url').text
 105             video_ext = video_url.rpartition('.')[2] or 'flv'
 106
 107             chapter_api_url = 'https://api.twitch.tv/kraken/videos/c' + chapter_id
 108             chapter_info = self._download_json(
 109                 chapter_api_url, 'c' + chapter_id,
 110                 note='Downloading chapter metadata',
 111                 errnote='Download of chapter metadata failed')
 112
 113             bracket_start = int(doc.find('.//bracket_start').text)
 114             bracket_end = int(doc.find('.//bracket_end').text)
 115
 116             # TODO determine start (and probably fix up file)
 117             #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
 118             #video_url += '?start=' + TODO:start_timestamp
 119             # bracket_start is 13290, but we want 51670615
 120             self._downloader.report_warning('Chapter detected, but we can just download the whole file. '
 121                                             'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
 122
 123             info = {
 124                 'id': 'c' + chapter_id,
 125                 'url': video_url,
 126                 'ext': video_ext,
 127                 'title': chapter_info['title'],
 128                 'thumbnail': chapter_info['preview'],
 129                 'description': chapter_info['description'],
 130                 'uploader': chapter_info['channel']['display_name'],
 131                 'uploader_id': chapter_info['channel']['name'],
 132             }
 133             return info
 134         else:
 135             video_id = mobj.group('videoid')
 136             api = api_base + '/broadcast/by_archive/%s.json' % video_id
 137
 138         entries = []
 139         offset = 0
 140         limit = self._JUSTIN_PAGE_LIMIT
 141         while True:
 142             if paged:
 143                 self.report_download_page(video_id, offset)
 144             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
 145             page_count, page_info = self._parse_page(page_url, video_id)
 146             entries.extend(page_info)
 147             if not paged or page_count != limit:
 148                 break
 149             offset += limit
 150         return {
 151             '_type': 'playlist',
 152             'id': video_id,
 153             'entries': entries,
 154         }