Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/yahoo.py

   1 import itertools
   2 import json
   3 import re
   4
   5 from .common import InfoExtractor, SearchInfoExtractor
   6 from ..utils import (
   7     compat_urllib_parse,
   8     compat_urlparse,
   9     determine_ext,
  10     clean_html,
  11 )
  12
  13
  14 class YahooIE(InfoExtractor):
  15     IE_DESC = u'Yahoo screen'
  16     _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
  17     _TESTS = [
  18         {
  19             u'url': u'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
  20             u'file': u'214727115.mp4',
  21             u'md5': u'4962b075c08be8690a922ee026d05e69',
  22             u'info_dict': {
  23                 u'title': u'Julian Smith & Travis Legg Watch Julian Smith',
  24                 u'description': u'Julian and Travis watch Julian Smith',
  25             },
  26         },
  27         {
  28             u'url': u'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html',
  29             u'file': u'103000935.mp4',
  30             u'md5': u'd6e6fc6e1313c608f316ddad7b82b306',
  31             u'info_dict': {
  32                 u'title': u'Codefellas - The Cougar Lies with Spanish Moss',
  33                 u'description': u'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?',
  34             },
  35         },
  36     ]
  37
  38     def _real_extract(self, url):
  39         mobj = re.match(self._VALID_URL, url)
  40         video_id = mobj.group('id')
  41         webpage = self._download_webpage(url, video_id)
  42
  43         items_json = self._search_regex(r'mediaItems: ({.*?})$',
  44             webpage, u'items', flags=re.MULTILINE)
  45         items = json.loads(items_json)
  46         info = items['mediaItems']['query']['results']['mediaObj'][0]
  47         # The 'meta' field is not always in the video webpage, we request it
  48         # from another page
  49         long_id = info['id']
  50         return self._get_info(info['id'], video_id)
  51
  52     def _get_info(self, long_id, video_id):
  53         query = ('SELECT * FROM yahoo.media.video.streams WHERE id="%s"'
  54                  ' AND plrs="86Gj0vCaSzV_Iuf6hNylf2" AND region="US"'
  55                  ' AND protocol="http"' % long_id)
  56         data = compat_urllib_parse.urlencode({
  57             'q': query,
  58             'env': 'prod',
  59             'format': 'json',
  60         })
  61         query_result_json = self._download_webpage(
  62             'http://video.query.yahoo.com/v1/public/yql?' + data,
  63             video_id, u'Downloading video info')
  64         query_result = json.loads(query_result_json)
  65         info = query_result['query']['results']['mediaObj'][0]
  66         meta = info['meta']
  67
  68         formats = []
  69         for s in info['streams']:
  70             format_info = {
  71                 'width': s.get('width'),
  72                 'height': s.get('height'),
  73                 'bitrate': s.get('bitrate'),
  74             }
  75
  76             host = s['host']
  77             path = s['path']
  78             if host.startswith('rtmp'):
  79                 format_info.update({
  80                     'url': host,
  81                     'play_path': path,
  82                     'ext': 'flv',
  83                 })
  84             else:
  85                 format_url = compat_urlparse.urljoin(host, path)
  86                 format_info['url'] = format_url
  87                 format_info['ext'] = determine_ext(format_url)
  88
  89             formats.append(format_info)
  90         formats = sorted(formats, key=lambda f:(f['height'], f['width']))
  91
  92         return {
  93             'id': video_id,
  94             'title': meta['title'],
  95             'formats': formats,
  96             'description': clean_html(meta['description']),
  97             'thumbnail': meta['thumbnail'],
  98         }
  99
 100
 101 class YahooNewsIE(YahooIE):
 102     IE_NAME = 'yahoo:news'
 103     _VALID_URL = r'http://news\.yahoo\.com/video/.*?-(?P<id>\d*?)\.html'
 104
 105     _TEST = {
 106         u'url': u'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html',
 107         u'md5': u'67010fdf3a08d290e060a4dd96baa07b',
 108         u'info_dict': {
 109             u'id': u'104538833',
 110             u'ext': u'mp4',
 111             u'title': u'China Moses Is Crazy About the Blues',
 112             u'description': u'md5:9900ab8cd5808175c7b3fe55b979bed0',
 113         },
 114     }
 115
 116     # Overwrite YahooIE properties we don't want
 117     _TESTS = []
 118
 119     def _real_extract(self, url):
 120         mobj = re.match(self._VALID_URL, url)
 121         video_id = mobj.group('id')
 122         webpage = self._download_webpage(url, video_id)
 123         long_id = self._search_regex(r'contentId: \'(.+?)\',', webpage, u'long id')
 124         return self._get_info(long_id, video_id)
 125
 126
 127 class YahooSearchIE(SearchInfoExtractor):
 128     IE_DESC = u'Yahoo screen search'
 129     _MAX_RESULTS = 1000
 130     IE_NAME = u'screen.yahoo:search'
 131     _SEARCH_KEY = 'yvsearch'
 132
 133     def _get_n_results(self, query, n):
 134         """Get a specified number of results for a query"""
 135
 136         res = {
 137             '_type': 'playlist',
 138             'id': query,
 139             'entries': []
 140         }
 141         for pagenum in itertools.count(0):
 142             result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
 143             webpage = self._download_webpage(result_url, query,
 144                                              note='Downloading results page '+str(pagenum+1))
 145             info = json.loads(webpage)
 146             m = info[u'm']
 147             results = info[u'results']
 148
 149             for (i, r) in enumerate(results):
 150                 if (pagenum * 30) +i >= n:
 151                     break
 152                 mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
 153                 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
 154                 res['entries'].append(e)
 155             if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1)):
 156                 break
 157
 158         return res