from __future__ import unicode_literals

import itertools
import json
import re

from .common import InfoExtractor, SearchInfoExtractor
from ..utils import (
    clean_html,
    compat_urllib_parse,
    compat_urlparse,
    int_or_none,
)


class YahooIE(InfoExtractor):
    IE_DESC = 'Yahoo screen and movies'
    _VALID_URL = r'https?://(?:screen|movies)\.yahoo\.com/.*?-(?P<id>[0-9]+)(?:-[a-z]+)?\.html'
    _TESTS = [
        {
            'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
            'md5': '4962b075c08be8690a922ee026d05e69',
            'info_dict': {
                'id': '2d25e626-2378-391f-ada0-ddaf1417e588',
                'ext': 'mp4',
                'title': 'Julian Smith & Travis Legg Watch Julian Smith',
                'description': 'Julian and Travis watch Julian Smith',
            },
        },
        {
            'url': 'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html',
            'md5': 'd6e6fc6e1313c608f316ddad7b82b306',
            'info_dict': {
                'id': 'd1dedf8c-d58c-38c3-8963-e899929ae0a9',
                'ext': 'mp4',
                'title': 'Codefellas - The Cougar Lies with Spanish Moss',
                'description': 'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?',
            },
        },
        {
            'url': 'https://movies.yahoo.com/video/world-loves-spider-man-190819223.html',
            'md5': '410b7104aa9893b765bc22787a22f3d9',
            'info_dict': {
                'id': '516ed8e2-2c4f-339f-a211-7a8b49d30845',
                'ext': 'mp4',
                'title': 'The World Loves Spider-Man',
                'description': '''People all over the world are celebrating the release of "The Amazing Spider-Man 2." We're taking a look at the enthusiastic response Spider-Man has received from viewers all over the world.''',
            },
        },
    ]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        items_json = self._search_regex(
            r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE,
            default=None)
        if items_json is None:
            # No embedded media items; fall back to scraping the content ID
            # directly out of the page.
            CONTENT_ID_REGEXES = [
                r'YUI\.namespace\("Media"\)\.CONTENT_ID\s*=\s*"([^"]+)"',
                r'root\.App\.Cache\.context\.videoCache\.curVideo = \{"([^"]+)"'
            ]
            long_id = self._search_regex(CONTENT_ID_REGEXES, webpage, 'content ID')
            video_id = long_id
        else:
            items = json.loads(items_json)
            info = items['mediaItems']['query']['results']['mediaObj'][0]
            # The 'meta' field is not always in the video webpage, we request it
            # from another page
            long_id = info['id']

        return self._get_info(long_id, video_id, webpage)
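
    # Note: the two branches above converge on the same lookup. Pages that embed a
    # 'mediaItems: {...}' blob already carry the long content id (read out of
    # ['mediaItems']['query']['results']['mediaObj'][0]); otherwise the id is
    # scraped with CONTENT_ID_REGEXES, and either way _get_info() resolves it
    # through Yahoo's YQL service.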

    def _get_info(self, long_id, video_id, webpage):
        query = ('SELECT * FROM yahoo.media.video.streams WHERE id="%s"'
                 ' AND plrs="86Gj0vCaSzV_Iuf6hNylf2" AND region="US"'
                 ' AND protocol="http"' % long_id)
        data = compat_urllib_parse.urlencode({
            'q': query,
            'env': 'prod',
            'format': 'json',
        })
        query_result = self._download_json(
            'http://video.query.yahoo.com/v1/public/yql?' + data,
            video_id, 'Downloading video info')
        info = query_result['query']['results']['mediaObj'][0]
        meta = info['meta']

        formats = []
        for s in info['streams']:
            format_info = {
                'width': int_or_none(s.get('width')),
                'height': int_or_none(s.get('height')),
                'tbr': int_or_none(s.get('bitrate')),
            }

            host = s['host']
            path = s['path']
            if host.startswith('rtmp'):
                # RTMP streams keep the host as the URL and the path as play_path
                format_info.update({
                    'url': host,
                    'play_path': path,
                    'ext': 'flv',
                })
            else:
                format_url = compat_urlparse.urljoin(host, path)
                format_info['url'] = format_url
            formats.append(format_info)

        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': meta['title'],
            'formats': formats,
            'description': clean_html(meta['description']),
            'thumbnail': meta['thumbnail'] if meta.get('thumbnail') else self._og_search_thumbnail(webpage),
        }
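
    # For reference, the request assembled in _get_info() is a plain YQL call; once
    # the parameters are urlencoded it looks roughly like (illustrative, not an
    # exact transcript of a real request):
    #
    #   http://video.query.yahoo.com/v1/public/yql?q=SELECT+*+FROM+yahoo.media.video.streams+WHERE+id%3D%22<long_id>%22+...&env=prod&format=json
    #
    # and the answer is unwrapped via query_result['query']['results']['mediaObj'][0].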


class YahooNewsIE(YahooIE):
    IE_NAME = 'yahoo:news'
    _VALID_URL = r'http://news\.yahoo\.com/video/.*?-(?P<id>\d*?)\.html'

    _TEST = {
        'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html',
        'md5': '67010fdf3a08d290e060a4dd96baa07b',
        'info_dict': {
            'id': '104538833',
            'ext': 'mp4',
            'title': 'China Moses Is Crazy About the Blues',
            'description': 'md5:9900ab8cd5808175c7b3fe55b979bed0',
        },
    }

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        long_id = self._search_regex(r'contentId: \'(.+?)\',', webpage, 'long id')
        return self._get_info(long_id, video_id, webpage)
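
    # YahooNewsIE only differs from YahooIE in how the long content id is located
    # (the "contentId: '...'" snippet above); format extraction is inherited via
    # _get_info().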


class YahooSearchIE(SearchInfoExtractor):
    IE_DESC = 'Yahoo screen search'
    _MAX_RESULTS = 1000
    IE_NAME = 'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        entries = []
        for pagenum in itertools.count(0):
            result_url = 'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            info = self._download_json(result_url, query,
                note='Downloading results page ' + str(pagenum + 1))
            m = info['m']
            results = info['results']

            for (i, r) in enumerate(results):
                if (pagenum * 30) + i >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                entries.append(e)
            if (pagenum * 30 + i >= n) or (m['last'] >= (m['total'] - 1)):
                break

        return {
            '_type': 'playlist',
            'id': query,
            'entries': entries,
        }
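
# Rough usage sketch: these extractors are normally exercised through
# youtube_dl.YoutubeDL rather than instantiated directly, e.g.
#
#   import youtube_dl
#   ydl = youtube_dl.YoutubeDL({'quiet': True})
#   info = ydl.extract_info(
#       'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
#       download=False)
#   print(info.get('title'))
#
# The URL comes from the first _TESTS entry of YahooIE above; a query such as
# 'yvsearch5:cute cats' would instead be routed to YahooSearchIE via its _SEARCH_KEY.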