]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/comedycentral.py
1 from __future__
import unicode_literals
5 from . mtv
import MTVServicesInfoExtractor
17 class ComedyCentralIE ( MTVServicesInfoExtractor
):
18 _VALID_URL
= r
'''(?x)https?://(?:www\.)?cc\.com/
19 (video-clips|episodes|cc-studios|video-collections|full-episodes|shows)
21 _FEED_URL
= 'http://comedycentral.com/feeds/mrss/'
24 'url' : 'http://www.cc.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother' ,
25 'md5' : 'c4f48e9eda1b16dd10add0744344b6d8' ,
27 'id' : 'cef0cbb3-e776-4bc9-b62e-8016deccb354' ,
29 'title' : 'CC:Stand-Up|Greg Fitzsimmons: Life on Stage|Uncensored - Too Good of a Mother' ,
30 'description' : 'After a certain point, breastfeeding becomes c**kblocking.' ,
33 'url' : 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/interviews/6yx39d/exclusive-rand-paul-extended-interview' ,
34 'only_matching' : True ,
38 class ComedyCentralShowsIE ( MTVServicesInfoExtractor
):
39 IE_DESC
= 'The Daily Show / The Colbert Report'
40 # urls can be abbreviations like :thedailyshow
41 # urls for episodes like: http://thedailyshow.cc.com/episodes/sy7yv0/april-8--2014---denis-leary
42 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
43 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
44 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
45 _VALID_URL
= r
'''(?x)^(:(?P<shortname>tds|thedailyshow)
47 (?P<showname>thedailyshow|thecolbertreport)\.(?:cc\.)?com/
48 ((?:full-)?episodes/(?:[0-9a-z] {6} /)?(?P<episode>.*)|
50 (?:(?:guests/[^/]+|videos|video-playlists|special-editions|news-team/[^/]+)/[^/]+/(?P<videotitle>[^/?#]+))
51 |(the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
52 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*))
55 extended-interviews/(?P<interID>[0-9a-z]+)/
56 (?:playlist_tds_extended_)?(?P<interview_title>[^/?#]*?)
57 (?:/[^/?#]?|[?#]|$))))
60 'url' : 'http://thedailyshow.cc.com/watch/thu-december-13-2012/kristen-stewart' ,
61 'md5' : '4e2f5cb088a83cd8cdb7756132f9739d' ,
63 'id' : 'ab9ab3e7-5a98-4dbe-8b21-551dc0523d55' ,
65 'upload_date' : '20121213' ,
66 'description' : 'Kristen Stewart learns to let loose in "On the Road."' ,
67 'uploader' : 'thedailyshow' ,
68 'title' : 'thedailyshow kristen-stewart part 1' ,
71 'url' : 'http://thedailyshow.cc.com/extended-interviews/b6364d/sarah-chayes-extended-interview' ,
73 'id' : 'sarah-chayes-extended-interview' ,
74 'description' : 'Carnegie Endowment Senior Associate Sarah Chayes discusses how corrupt institutions function throughout the world in her book "Thieves of State: Why Corruption Threatens Global Security."' ,
75 'title' : 'thedailyshow Sarah Chayes Extended Interview' ,
80 'id' : '0baad492-cbec-4ec1-9e50-ad91c291127f' ,
82 'upload_date' : '20150129' ,
83 'description' : 'Carnegie Endowment Senior Associate Sarah Chayes discusses how corrupt institutions function throughout the world in her book "Thieves of State: Why Corruption Threatens Global Security."' ,
84 'uploader' : 'thedailyshow' ,
85 'title' : 'thedailyshow sarah-chayes-extended-interview part 1' ,
90 'id' : '1e4fb91b-8ce7-4277-bd7c-98c9f1bbd283' ,
92 'upload_date' : '20150129' ,
93 'description' : 'Carnegie Endowment Senior Associate Sarah Chayes discusses how corrupt institutions function throughout the world in her book "Thieves of State: Why Corruption Threatens Global Security."' ,
94 'uploader' : 'thedailyshow' ,
95 'title' : 'thedailyshow sarah-chayes-extended-interview part 2' ,
100 'skip_download' : True ,
103 'url' : 'http://thedailyshow.cc.com/extended-interviews/xm3fnq/andrew-napolitano-extended-interview' ,
104 'only_matching' : True ,
106 'url' : 'http://thecolbertreport.cc.com/videos/29w6fx/-realhumanpraise-for-fox-news' ,
107 'only_matching' : True ,
109 'url' : 'http://thecolbertreport.cc.com/videos/gh6urb/neil-degrasse-tyson-pt--1?xrs=eml_col_031114' ,
110 'only_matching' : True ,
112 'url' : 'http://thedailyshow.cc.com/guests/michael-lewis/3efna8/exclusive---michael-lewis-extended-interview-pt--3' ,
113 'only_matching' : True ,
115 'url' : 'http://thedailyshow.cc.com/episodes/sy7yv0/april-8--2014---denis-leary' ,
116 'only_matching' : True ,
118 'url' : 'http://thecolbertreport.cc.com/episodes/8ase07/april-8--2014---jane-goodall' ,
119 'only_matching' : True ,
121 'url' : 'http://thedailyshow.cc.com/video-playlists/npde3s/the-daily-show-19088-highlights' ,
122 'only_matching' : True ,
124 'url' : 'http://thedailyshow.cc.com/video-playlists/t6d9sg/the-daily-show-20038-highlights/be3cwo' ,
125 'only_matching' : True ,
127 'url' : 'http://thedailyshow.cc.com/special-editions/2l8fdb/special-edition---a-look-back-at-food' ,
128 'only_matching' : True ,
130 'url' : 'http://thedailyshow.cc.com/news-team/michael-che/7wnfel/we-need-to-talk-about-israel' ,
131 'only_matching' : True ,
134 _available_formats
= [ '3500' , '2200' , '1700' , '1200' , '750' , '400' ]
136 _video_extensions
= {
144 _video_dimensions
= {
153 def _real_extract ( self
, url
):
154 mobj
= re
. match ( self
._ VALID
_U RL
, url
)
156 if mobj
. group ( 'shortname' ):
157 return self
. url_result ( 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes' )
159 if mobj
. group ( 'clip' ):
160 if mobj
. group ( 'videotitle' ):
161 epTitle
= mobj
. group ( 'videotitle' )
162 elif mobj
. group ( 'showname' ) == 'thedailyshow' :
163 epTitle
= mobj
. group ( 'tdstitle' )
165 epTitle
= mobj
. group ( 'cntitle' )
167 elif mobj
. group ( 'interview' ):
168 epTitle
= mobj
. group ( 'interview_title' )
171 dlNewest
= not mobj
. group ( 'episode' )
173 epTitle
= mobj
. group ( 'showname' )
175 epTitle
= mobj
. group ( 'episode' )
176 show_name
= mobj
. group ( 'showname' )
178 webpage
, htmlHandle
= self
._ download
_ webpage
_ handle
( url
, epTitle
)
180 url
= htmlHandle
. geturl ()
181 mobj
= re
. match ( self
._ VALID
_U RL
, url
, re
. VERBOSE
)
183 raise ExtractorError ( 'Invalid redirected URL: ' + url
)
184 if mobj
. group ( 'episode' ) == '' :
185 raise ExtractorError ( 'Redirected URL is still not specific: ' + url
)
186 epTitle
= ( mobj
. group ( 'episode' ) or mobj
. group ( 'videotitle' )). rpartition ( '/' )[- 1 ]
188 mMovieParams
= re
. findall ( '(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"' , webpage
)
189 if len ( mMovieParams
) == 0 :
190 # The Colbert Report embeds the information in a data-mgid attribute
191 # without a URL prefix; so extract the alternate reference
192 # and then add the URL prefix manually.
194 altMovieParams
= re
. findall ( 'data-mgid="([^"]*(?:episode|video|playlist).*?:.*?)"' , webpage
)
195 if len ( altMovieParams
) == 0 :
196 raise ExtractorError ( 'unable to find Flash URL in webpage ' + url
)
198 mMovieParams
= [( 'http://media.mtvnservices.com/' + altMovieParams
[ 0 ], altMovieParams
[ 0 ])]
200 uri
= mMovieParams
[ 0 ][ 1 ]
201 # Correct cc.com in uri
202 uri
= re
. sub ( r
'(episode:[^.]+)(\.cc)?\.com' , r
'\1.com' , uri
)
204 index_url
= 'http:// %s .cc.com/feeds/mrss? %s ' % ( show_name
, compat_urllib_parse
. urlencode ({ 'uri' : uri
}))
205 idoc
= self
._ download
_ xml
(
207 'Downloading show index' , 'Unable to download episode index' )
209 title
= idoc
. find ( './channel/title' ). text
210 description
= idoc
. find ( './channel/description' ). text
213 item_els
= idoc
. findall ( './/item' )
214 for part_num
, itemEl
in enumerate ( item_els
):
215 upload_date
= unified_strdate ( itemEl
. findall ( './pubDate' )[ 0 ]. text
)
216 thumbnail
= itemEl
. find ( './/{http://search.yahoo.com/mrss/}thumbnail' ). attrib
. get ( 'url' )
218 content
= itemEl
. find ( './/{http://search.yahoo.com/mrss/}content' )
219 duration
= float_or_none ( content
. attrib
. get ( 'duration' ))
220 mediagen_url
= content
. attrib
[ 'url' ]
221 guid
= itemEl
. find ( './guid' ). text
. rpartition ( ':' )[- 1 ]
223 cdoc
= self
._ download
_ xml
(
224 mediagen_url
, epTitle
,
225 'Downloading configuration for segment %d / %d ' % ( part_num
+ 1 , len ( item_els
)))
228 for rendition
in cdoc
. findall ( './/rendition' ):
229 finfo
= ( rendition
. attrib
[ 'bitrate' ], rendition
. findall ( './src' )[ 0 ]. text
)
233 for format
, rtmp_video_url
in turls
:
234 w
, h
= self
._ video
_ dimensions
. get ( format
, ( None , None ))
236 'format_id' : 'vhttp- %s ' % format
,
237 'url' : self
._ transform
_ rtmp
_u rl
( rtmp_video_url
),
238 'ext' : self
._ video
_ extensions
. get ( format
, 'mp4' ),
243 'format_id' : 'rtmp- %s ' % format
,
244 'url' : rtmp_video_url
. replace ( 'viacomccstrm' , 'viacommtvstrm' ),
245 'ext' : self
._ video
_ extensions
. get ( format
, 'mp4' ),
249 self
._ sort
_ formats
( formats
)
251 subtitles
= self
._ extract
_ subtitles
( cdoc
, guid
)
253 virtual_id
= show_name
+ ' ' + epTitle
+ ' part ' + compat_str ( part_num
+ 1 )
258 'uploader' : show_name
,
259 'upload_date' : upload_date
,
260 'duration' : duration
,
261 'thumbnail' : thumbnail
,
262 'description' : description
,
263 'subtitles' : subtitles
,
270 'title' : show_name
+ ' ' + title
,
271 'description' : description
,