]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/yahoo.py
1 from __future__
import unicode_literals
7 from . common
import InfoExtractor
, SearchInfoExtractor
16 class YahooIE ( InfoExtractor
):
17 IE_DESC
= 'Yahoo screen'
18 _VALID_URL
= r
'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
21 'url' : 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html' ,
22 'file' : '214727115.mp4' ,
23 'md5' : '4962b075c08be8690a922ee026d05e69' ,
25 'title' : 'Julian Smith & Travis Legg Watch Julian Smith' ,
26 'description' : 'Julian and Travis watch Julian Smith' ,
30 'url' : 'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html' ,
31 'file' : '103000935.mp4' ,
32 'md5' : 'd6e6fc6e1313c608f316ddad7b82b306' ,
34 'title' : 'Codefellas - The Cougar Lies with Spanish Moss' ,
35 'description' : 'Agent Topple \' s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?' ,
40 def _real_extract ( self
, url
):
41 mobj
= re
. match ( self
._ VALID
_U RL
, url
)
42 video_id
= mobj
. group ( 'id' )
43 webpage
= self
._ download
_ webpage
( url
, video_id
)
45 items_json
= self
._ search
_ regex
( r
'mediaItems: ({.*?})$' ,
46 webpage
, 'items' , flags
= re
. MULTILINE
)
47 items
= json
. loads ( items_json
)
48 info
= items
[ 'mediaItems' ][ 'query' ][ 'results' ][ 'mediaObj' ][ 0 ]
49 # The 'meta' field is not always in the video webpage, we request it
52 return self
._ get
_ info
( long_id
, video_id
)
54 def _get_info ( self
, long_id
, video_id
):
55 query
= ( 'SELECT * FROM yahoo.media.video.streams WHERE id=" %s "'
56 ' AND plrs="86Gj0vCaSzV_Iuf6hNylf2" AND region="US"'
57 ' AND protocol="http"' % long_id
)
58 data
= compat_urllib_parse
. urlencode ({
63 query_result_json
= self
._ download
_ webpage
(
64 'http://video.query.yahoo.com/v1/public/yql?' + data
,
65 video_id
, 'Downloading video info' )
66 query_result
= json
. loads ( query_result_json
)
67 info
= query_result
[ 'query' ][ 'results' ][ 'mediaObj' ][ 0 ]
71 for s
in info
[ 'streams' ]:
73 'width' : int_or_none ( s
. get ( 'width' )),
74 'height' : int_or_none ( s
. get ( 'height' )),
75 'tbr' : int_or_none ( s
. get ( 'bitrate' )),
80 if host
. startswith ( 'rtmp' ):
87 format_url
= compat_urlparse
. urljoin ( host
, path
)
88 format_info
[ 'url' ] = format_url
90 formats
. append ( format_info
)
92 self
._ sort
_ formats
( formats
)
96 'title' : meta
[ 'title' ],
98 'description' : clean_html ( meta
[ 'description' ]),
99 'thumbnail' : meta
[ 'thumbnail' ],
103 class YahooNewsIE ( YahooIE
):
104 IE_NAME
= 'yahoo:news'
105 _VALID_URL
= r
'http://news\.yahoo\.com/video/.*?-(?P<id>\d*?)\.html'
108 'url' : 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html' ,
109 'md5' : '67010fdf3a08d290e060a4dd96baa07b' ,
113 'title' : 'China Moses Is Crazy About the Blues' ,
114 'description' : 'md5:9900ab8cd5808175c7b3fe55b979bed0' ,
118 # Overwrite YahooIE properties we don't want
def _real_extract(self, url):
    """Extract a Yahoo News video via its long content id.

    The URL is matched against ``self._VALID_URL`` to obtain the numeric
    video id (the ``id`` group), the page is downloaded, and the long id
    is read from the ``contentId: '...'`` fragment in the page.  Both ids
    are then passed to ``self._get_info`` and its result is returned
    unchanged.
    """
    mobj = re.match(self._VALID_URL, url)
    video_id = mobj.group('id')
    webpage = self._download_webpage(url, video_id)
    # The page carries the long id as:  contentId: '<long id>',
    long_id = self._search_regex(
        r'contentId: \'(.+?)\',', webpage, 'long id')
    return self._get_info(long_id, video_id)
129 class YahooSearchIE(SearchInfoExtractor):
130 IE_DESC = ' Yahoo screen search
'
132 IE_NAME = ' screen
. yahoo
: search
'
133 _SEARCH_KEY = ' yvsearch
'
135 def _get_n_results(self, query, n):
136 """Get a specified number of results for a query"""
143 for pagenum in itertools.count(0):
144 result_url = ' http
:// video
. search
. yahoo
. com
/ search
/ ?p
= %s& fr
= screen
& o
= js
& gs
= 0 & b
= %d ' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
145 webpage = self._download_webpage(result_url, query,
146 note=' Downloading results page
'+str(pagenum+1))
147 info = json.loads(webpage)
149 results = info[' results
']
151 for (i, r) in enumerate(results):
152 if (pagenum * 30) +i >= n:
154 mobj = re.search(r' ( ?P
< url
> screen\
. yahoo\
. com
/.* ?
- \d
* ?\
. html
) "', r)
155 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
156 res['entries'].append(e)
157 if (pagenum * 30 +i >= n) or (m['last'] >= (m['total'] -1)):