]>
Raphaƫl G. Git Repositories - youtubedl/blob - youtube_dl/extractor/yahoo.py
1 from __future__
import unicode_literals
7 from . common
import InfoExtractor
, SearchInfoExtractor
16 class YahooIE ( InfoExtractor
):
17 IE_DESC
= 'Yahoo screen and movies'
18 _VALID_URL
= r
'https?://(?:screen|movies)\.yahoo\.com/.*?-(?P<id>[0-9]+)(?:-[a-z]+)?\.html'
21 'url' : 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html' ,
22 'md5' : '4962b075c08be8690a922ee026d05e69' ,
24 'id' : '2d25e626-2378-391f-ada0-ddaf1417e588' ,
26 'title' : 'Julian Smith & Travis Legg Watch Julian Smith' ,
27 'description' : 'Julian and Travis watch Julian Smith' ,
31 'url' : 'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html' ,
32 'md5' : 'd6e6fc6e1313c608f316ddad7b82b306' ,
34 'id' : 'd1dedf8c-d58c-38c3-8963-e899929ae0a9' ,
36 'title' : 'Codefellas - The Cougar Lies with Spanish Moss' ,
37 'description' : 'Agent Topple \' s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?' ,
41 'url' : 'https://movies.yahoo.com/video/world-loves-spider-man-190819223.html' ,
42 'md5' : '410b7104aa9893b765bc22787a22f3d9' ,
44 'id' : '516ed8e2-2c4f-339f-a211-7a8b49d30845' ,
46 'title' : 'The World Loves Spider-Man' ,
47 'description' : '''People all over the world are celebrating the release of \" The Amazing Spider-Man 2. \" We're taking a look at the enthusiastic response Spider-Man has received from viewers all over the world.''' ,
52 def _real_extract ( self
, url
):
53 mobj
= re
. match ( self
._ VALID
_U RL
, url
)
54 video_id
= mobj
. group ( 'id' )
55 webpage
= self
._ download
_ webpage
( url
, video_id
)
57 items_json
= self
._ search
_ regex
(
58 r
'mediaItems: ({.*?})$' , webpage
, 'items' , flags
= re
. MULTILINE
,
60 if items_json
is None :
61 CONTENT_ID_REGEXES
= [
62 r
'YUI\.namespace\("Media"\)\.CONTENT_ID\s*=\s*"([^"]+)"' ,
63 r
'root\.App\.Cache\.context\.videoCache\.curVideo = \{"([^"]+)"'
65 long_id
= self
._ search
_ regex
( CONTENT_ID_REGEXES
, webpage
, 'content ID' )
68 items
= json
. loads ( items_json
)
69 info
= items
[ 'mediaItems' ][ 'query' ][ 'results' ][ 'mediaObj' ][ 0 ]
70 # The 'meta' field is not always in the video webpage, we request it
73 return self
._ get
_ info
( long_id
, video_id
, webpage
)
75 def _get_info ( self
, long_id
, video_id
, webpage
):
76 query
= ( 'SELECT * FROM yahoo.media.video.streams WHERE id=" %s "'
77 ' AND plrs="86Gj0vCaSzV_Iuf6hNylf2" AND region="US"'
78 ' AND protocol="http"' % long_id
)
79 data
= compat_urllib_parse
. urlencode ({
84 query_result
= self
._ download
_ json
(
85 'http://video.query.yahoo.com/v1/public/yql?' + data
,
86 video_id
, 'Downloading video info' )
87 info
= query_result
[ 'query' ][ 'results' ][ 'mediaObj' ][ 0 ]
91 for s
in info
[ 'streams' ]:
93 'width' : int_or_none ( s
. get ( 'width' )),
94 'height' : int_or_none ( s
. get ( 'height' )),
95 'tbr' : int_or_none ( s
. get ( 'bitrate' )),
100 if host
. startswith ( 'rtmp' ):
107 format_url
= compat_urlparse
. urljoin ( host
, path
)
108 format_info
[ 'url' ] = format_url
109 formats
. append ( format_info
)
111 self
._ sort
_ formats
( formats
)
115 'title' : meta
[ 'title' ],
117 'description' : clean_html ( meta
[ 'description' ]),
118 'thumbnail' : meta
[ 'thumbnail' ] if meta
. get ( 'thumbnail' ) else self
._ og
_ search
_ thumbnail
( webpage
),
122 class YahooNewsIE ( YahooIE
):
123 IE_NAME
= 'yahoo:news'
124 _VALID_URL
= r
'http://news\.yahoo\.com/video/.*?-(?P<id>\d*?)\.html'
127 'url' : 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html' ,
128 'md5' : '67010fdf3a08d290e060a4dd96baa07b' ,
132 'title' : 'China Moses Is Crazy About the Blues' ,
133 'description' : 'md5:9900ab8cd5808175c7b3fe55b979bed0' ,
137 def _real_extract ( self
, url
):
138 mobj
= re
. match ( self
._ VALID
_U RL
, url
)
139 video_id
= mobj
. group ( 'id' )
140 webpage
= self
._ download
_ webpage
( url
, video_id
)
141 long_id
= self
._ search
_ regex
( r
'contentId: \' (.+ ?
) \' , ', webpage, ' long id ')
142 return self._get_info(long_id, video_id, webpage)
145 class YahooSearchIE(SearchInfoExtractor):
146 IE_DESC = ' Yahoo screen search
'
148 IE_NAME = ' screen
. yahoo
: search
'
149 _SEARCH_KEY = ' yvsearch
'
151 def _get_n_results(self, query, n):
152 """Get a specified number of results for a query"""
154 for pagenum in itertools.count(0):
155 result_url = ' http
:// video
. search
. yahoo
. com
/ search
/ ?p
= %s& fr
= screen
& o
= js
& gs
= 0 & b
= %d ' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
156 info = self._download_json(result_url, query,
157 note=' Downloading results page
'+str(pagenum+1))
159 results = info[' results
']
161 for (i, r) in enumerate(results):
162 if (pagenum * 30) + i >= n:
164 mobj = re.search(r' ( ?P
< url
> screen\
. yahoo\
. com
/.* ?
- \d
* ?\
. html
) "', r)
165 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
167 if (pagenum * 30 + i >= n) or (m['last'] >= (m['total'] - 1)):