# youtube_dl/extractor/yahoo.py -- blob at commit e457c4707a8feda7c3d0709c18671282b6da3814
import itertools
import json
import re

from .common import InfoExtractor, SearchInfoExtractor
from ..utils import (
    compat_urllib_parse,
    compat_urlparse,
    clean_html,
    determine_ext,
)


class YahooIE(InfoExtractor):
    IE_DESC = u'Yahoo screen'
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
    _TESTS = [
        {
            u'url': u'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
            u'file': u'214727115.mp4',
            u'md5': u'4962b075c08be8690a922ee026d05e69',
            u'info_dict': {
                u'title': u'Julian Smith & Travis Legg Watch Julian Smith',
                u'description': u'Julian and Travis watch Julian Smith',
            },
        },
        {
            u'url': u'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html',
            u'file': u'103000935.mp4',
            u'md5': u'd6e6fc6e1313c608f316ddad7b82b306',
            u'info_dict': {
                u'title': u'Codefellas - The Cougar Lies with Spanish Moss',
                u'description': u'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?',
            },
        },
    ]
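
    # Extraction happens in two steps: the watch page embeds a 'mediaItems'
    # JSON object from which the long media id is read, and _get_info() then
    # resolves that id to concrete stream URLs through Yahoo's public YQL
    # endpoint.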
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        items_json = self._search_regex(r'mediaItems: ({.*?})$', webpage,
                                        u'items', flags=re.MULTILINE)
        items = json.loads(items_json)
        info = items['mediaItems']['query']['results']['mediaObj'][0]
        # The 'meta' field is not always in the video webpage, we request it
        # from another page
        return self._get_info(info['id'], video_id)

    def _get_info(self, long_id, video_id):
        query = ('SELECT * FROM yahoo.media.video.streams WHERE id="%s"'
                 ' AND plrs="86Gj0vCaSzV_Iuf6hNylf2" AND region="US"'
                 ' AND protocol="http"' % long_id)
        data = compat_urllib_parse.urlencode({
            'q': query,
            'env': 'prod',
            'format': 'json',
        })
        query_result_json = self._download_webpage(
            'http://video.query.yahoo.com/v1/public/yql?' + data,
            video_id, u'Downloading video info')
        query_result = json.loads(query_result_json)
        info = query_result['query']['results']['mediaObj'][0]
        meta = info['meta']
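        # For illustration (hypothetical id): with long_id 'ABC123' the request
        # built above is roughly
        #   http://video.query.yahoo.com/v1/public/yql?q=SELECT+...+id%3D%22ABC123%22...&env=prod&format=json
        # and the JSON answer nests the stream list under
        # query_result['query']['results']['mediaObj'][0]['streams'].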

        formats = []
        for s in info['streams']:
            format_info = {
                'width': s.get('width'),
                'height': s.get('height'),
                'bitrate': s.get('bitrate'),
            }

            host = s['host']
            path = s['path']
            if host.startswith('rtmp'):
                format_info.update({
                    'url': host,
                    'play_path': path,
                    'ext': 'flv',
                })
            else:
                format_url = compat_urlparse.urljoin(host, path)
                format_info['url'] = format_url
                format_info['ext'] = determine_ext(format_url)

            formats.append(format_info)
        formats = sorted(formats, key=lambda f: (f['height'], f['width']))
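        # The list ends up ordered from smallest to largest frame size; the
        # convention in youtube-dl at this point was that the preferred
        # ("best") format sits last in 'formats'.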

        return {
            'id': video_id,
            'title': meta['title'],
            'formats': formats,
            'description': clean_html(meta['description']),
            'thumbnail': meta['thumbnail'],
        }
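

# YahooNewsIE below only differs in how the long content id is located on the
# page; the actual stream lookup is inherited from YahooIE._get_info().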
class YahooNewsIE(YahooIE):
    IE_NAME = 'yahoo:news'
    _VALID_URL = r'http://news\.yahoo\.com/video/.*?-(?P<id>\d*?)\.html'

    _TEST = {
        u'url': u'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html',
        u'md5': u'67010fdf3a08d290e060a4dd96baa07b',
        u'info_dict': {
            u'title': u'China Moses Is Crazy About the Blues',
            u'description': u'md5:9900ab8cd5808175c7b3fe55b979bed0',
        },
    }

    # Overwrite YahooIE properties we don't want
    _TESTS = []

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        long_id = self._search_regex(r'contentId: \'(.+?)\',', webpage, u'long id')
        return self._get_info(long_id, video_id)
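

# The SearchInfoExtractor base class handles queries of the form
# 'yvsearchN:some terms' (and 'yvsearchall:'): it parses the prefix and count,
# then calls _get_n_results(query, n), with 'all' capped at _MAX_RESULTS.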
class YahooSearchIE(SearchInfoExtractor):
    IE_DESC = u'Yahoo screen search'
    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'
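
    # Each results page is the o=js JSON endpoint of video.search.yahoo.com,
    # fetched 30 hits at a time (the b= offset); the entries in info[u'results']
    # appear to be HTML snippets, so the video URLs are pulled out with a regex.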
    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page ' + str(pagenum + 1))
            info = json.loads(webpage)
            m = info[u'm']
            results = info[u'results']

            for (i, r) in enumerate(results):
                if (pagenum * 30) + i >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            if (pagenum * 30 + i >= n) or (m[u'last'] >= (m[u'total'] - 1)):
                break

        return res
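

# A minimal usage sketch, not part of the original file. It assumes these
# extractors are registered in youtube_dl/extractor/__init__.py (as bundled
# extractors normally are) and uses the 2013-era YoutubeDL API:
#
#   from youtube_dl import YoutubeDL
#
#   ydl = YoutubeDL({'format': 'best'})
#   ydl.add_default_info_extractors()
#   # a direct watch page goes through YahooIE
#   ydl.download([u'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html'])
#   # a prefixed search goes through YahooSearchIE (first 3 results)
#   ydl.download([u'yvsearch3:codefellas'])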