]>
Raphaƫl G. Git Repositories - youtubedl/blob - youtube_dl/extractor/comedycentral.py
1 from __future__
import unicode_literals
5 from . mtv
import MTVServicesInfoExtractor
17 class ComedyCentralIE ( MTVServicesInfoExtractor
):
18 _VALID_URL
= r
'''(?x)https?://(?:www\.)?cc\.com/
19 (video-clips|episodes|cc-studios|video-collections|full-episodes)
21 _FEED_URL
= 'http://comedycentral.com/feeds/mrss/'
24 'url' : 'http://www.cc.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother' ,
25 'md5' : 'c4f48e9eda1b16dd10add0744344b6d8' ,
27 'id' : 'cef0cbb3-e776-4bc9-b62e-8016deccb354' ,
29 'title' : 'CC:Stand-Up|Greg Fitzsimmons: Life on Stage|Uncensored - Too Good of a Mother' ,
30 'description' : 'After a certain point, breastfeeding becomes c**kblocking.' ,
35 class ComedyCentralShowsIE ( MTVServicesInfoExtractor
):
36 IE_DESC
= 'The Daily Show / The Colbert Report'
37 # urls can be abbreviations like :thedailyshow
38 # urls for episodes like:
39 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
40 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
41 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
42 _VALID_URL
= r
'''(?x)^(:(?P<shortname>tds|thedailyshow)
44 (?P<showname>thedailyshow|thecolbertreport)\.(?:cc\.)?com/
45 ((?:full-)?episodes/(?:[0-9a-z] {6} /)?(?P<episode>.*)|
47 (?:(?:guests/[^/]+|videos|video-playlists|special-editions|news-team/[^/]+)/[^/]+/(?P<videotitle>[^/?#]+))
48 |(the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
49 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*))
52 extended-interviews/(?P<interID>[0-9a-z]+)/
53 (?:playlist_tds_extended_)?(?P<interview_title>[^/?#]*?)
54 (?:/[^/?#]?|[?#]|$))))
57 'url' : 'http://thedailyshow.cc.com/watch/thu-december-13-2012/kristen-stewart' ,
58 'md5' : '4e2f5cb088a83cd8cdb7756132f9739d' ,
60 'id' : 'ab9ab3e7-5a98-4dbe-8b21-551dc0523d55' ,
62 'upload_date' : '20121213' ,
63 'description' : 'Kristen Stewart learns to let loose in "On the Road."' ,
64 'uploader' : 'thedailyshow' ,
65 'title' : 'thedailyshow kristen-stewart part 1' ,
68 'url' : 'http://thedailyshow.cc.com/extended-interviews/b6364d/sarah-chayes-extended-interview' ,
70 'id' : 'sarah-chayes-extended-interview' ,
71 'description' : 'Carnegie Endowment Senior Associate Sarah Chayes discusses how corrupt institutions function throughout the world in her book "Thieves of State: Why Corruption Threatens Global Security."' ,
72 'title' : 'thedailyshow Sarah Chayes Extended Interview' ,
77 'id' : '0baad492-cbec-4ec1-9e50-ad91c291127f' ,
79 'upload_date' : '20150129' ,
80 'description' : 'Carnegie Endowment Senior Associate Sarah Chayes discusses how corrupt institutions function throughout the world in her book "Thieves of State: Why Corruption Threatens Global Security."' ,
81 'uploader' : 'thedailyshow' ,
82 'title' : 'thedailyshow sarah-chayes-extended-interview part 1' ,
87 'id' : '1e4fb91b-8ce7-4277-bd7c-98c9f1bbd283' ,
89 'upload_date' : '20150129' ,
90 'description' : 'Carnegie Endowment Senior Associate Sarah Chayes discusses how corrupt institutions function throughout the world in her book "Thieves of State: Why Corruption Threatens Global Security."' ,
91 'uploader' : 'thedailyshow' ,
92 'title' : 'thedailyshow sarah-chayes-extended-interview part 2' ,
97 'skip_download' : True ,
100 'url' : 'http://thedailyshow.cc.com/extended-interviews/xm3fnq/andrew-napolitano-extended-interview' ,
101 'only_matching' : True ,
103 'url' : 'http://thecolbertreport.cc.com/videos/29w6fx/-realhumanpraise-for-fox-news' ,
104 'only_matching' : True ,
106 'url' : 'http://thecolbertreport.cc.com/videos/gh6urb/neil-degrasse-tyson-pt--1?xrs=eml_col_031114' ,
107 'only_matching' : True ,
109 'url' : 'http://thedailyshow.cc.com/guests/michael-lewis/3efna8/exclusive---michael-lewis-extended-interview-pt--3' ,
110 'only_matching' : True ,
112 'url' : 'http://thedailyshow.cc.com/episodes/sy7yv0/april-8--2014---denis-leary' ,
113 'only_matching' : True ,
115 'url' : 'http://thecolbertreport.cc.com/episodes/8ase07/april-8--2014---jane-goodall' ,
116 'only_matching' : True ,
118 'url' : 'http://thedailyshow.cc.com/video-playlists/npde3s/the-daily-show-19088-highlights' ,
119 'only_matching' : True ,
121 'url' : 'http://thedailyshow.cc.com/video-playlists/t6d9sg/the-daily-show-20038-highlights/be3cwo' ,
122 'only_matching' : True ,
124 'url' : 'http://thedailyshow.cc.com/special-editions/2l8fdb/special-edition---a-look-back-at-food' ,
125 'only_matching' : True ,
127 'url' : 'http://thedailyshow.cc.com/news-team/michael-che/7wnfel/we-need-to-talk-about-israel' ,
128 'only_matching' : True ,
131 _available_formats
= [ '3500' , '2200' , '1700' , '1200' , '750' , '400' ]
133 _video_extensions
= {
141 _video_dimensions
= {
150 def _real_extract ( self
, url
):
151 mobj
= re
. match ( self
._ VALID
_U RL
, url
)
153 if mobj
. group ( 'shortname' ):
154 if mobj
. group ( 'shortname' ) in ( 'tds' , 'thedailyshow' ):
155 url
= 'http://thedailyshow.cc.com/full-episodes/'
157 url
= 'http://thecolbertreport.cc.com/full-episodes/'
158 mobj
= re
. match ( self
._ VALID
_U RL
, url
, re
. VERBOSE
)
159 assert mobj
is not None
161 if mobj
. group ( 'clip' ):
162 if mobj
. group ( 'videotitle' ):
163 epTitle
= mobj
. group ( 'videotitle' )
164 elif mobj
. group ( 'showname' ) == 'thedailyshow' :
165 epTitle
= mobj
. group ( 'tdstitle' )
167 epTitle
= mobj
. group ( 'cntitle' )
169 elif mobj
. group ( 'interview' ):
170 epTitle
= mobj
. group ( 'interview_title' )
173 dlNewest
= not mobj
. group ( 'episode' )
175 epTitle
= mobj
. group ( 'showname' )
177 epTitle
= mobj
. group ( 'episode' )
178 show_name
= mobj
. group ( 'showname' )
180 webpage
, htmlHandle
= self
._ download
_ webpage
_ handle
( url
, epTitle
)
182 url
= htmlHandle
. geturl ()
183 mobj
= re
. match ( self
._ VALID
_U RL
, url
, re
. VERBOSE
)
185 raise ExtractorError ( 'Invalid redirected URL: ' + url
)
186 if mobj
. group ( 'episode' ) == '' :
187 raise ExtractorError ( 'Redirected URL is still not specific: ' + url
)
188 epTitle
= ( mobj
. group ( 'episode' ) or mobj
. group ( 'videotitle' )). rpartition ( '/' )[- 1 ]
190 mMovieParams
= re
. findall ( '(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"' , webpage
)
191 if len ( mMovieParams
) == 0 :
192 # The Colbert Report embeds the information in a without
193 # a URL prefix; so extract the alternate reference
194 # and then add the URL prefix manually.
196 altMovieParams
= re
. findall ( 'data-mgid="([^"]*(?:episode|video|playlist).*?:.*?)"' , webpage
)
197 if len ( altMovieParams
) == 0 :
198 raise ExtractorError ( 'unable to find Flash URL in webpage ' + url
)
200 mMovieParams
= [( "http://media.mtvnservices.com/" + altMovieParams
[ 0 ], altMovieParams
[ 0 ])]
202 uri
= mMovieParams
[ 0 ][ 1 ]
203 # Correct cc.com in uri
204 uri
= re
. sub ( r
'(episode:[^.]+)(\.cc)?\.com' , r
'\1.com' , uri
)
206 index_url
= 'http:// %s .cc.com/feeds/mrss? %s ' % ( show_name
, compat_urllib_parse
. urlencode ({ 'uri' : uri
}))
207 idoc
= self
._ download
_ xml
(
209 'Downloading show index' , 'Unable to download episode index' )
211 title
= idoc
. find ( './channel/title' ). text
212 description
= idoc
. find ( './channel/description' ). text
215 item_els
= idoc
. findall ( './/item' )
216 for part_num
, itemEl
in enumerate ( item_els
):
217 upload_date
= unified_strdate ( itemEl
. findall ( './pubDate' )[ 0 ]. text
)
218 thumbnail
= itemEl
. find ( './/{http://search.yahoo.com/mrss/}thumbnail' ). attrib
. get ( 'url' )
220 content
= itemEl
. find ( './/{http://search.yahoo.com/mrss/}content' )
221 duration
= float_or_none ( content
. attrib
. get ( 'duration' ))
222 mediagen_url
= content
. attrib
[ 'url' ]
223 guid
= itemEl
. find ( './guid' ). text
. rpartition ( ':' )[- 1 ]
225 cdoc
= self
._ download
_ xml
(
226 mediagen_url
, epTitle
,
227 'Downloading configuration for segment %d / %d ' % ( part_num
+ 1 , len ( item_els
)))
230 for rendition
in cdoc
. findall ( './/rendition' ):
231 finfo
= ( rendition
. attrib
[ 'bitrate' ], rendition
. findall ( './src' )[ 0 ]. text
)
235 for format
, rtmp_video_url
in turls
:
236 w
, h
= self
._ video
_ dimensions
. get ( format
, ( None , None ))
238 'format_id' : 'vhttp- %s ' % format
,
239 'url' : self
._ transform
_ rtmp
_u rl
( rtmp_video_url
),
240 'ext' : self
._ video
_ extensions
. get ( format
, 'mp4' ),
245 'format_id' : 'rtmp- %s ' % format
,
246 'url' : rtmp_video_url
. replace ( 'viacomccstrm' , 'viacommtvstrm' ),
247 'ext' : self
._ video
_ extensions
. get ( format
, 'mp4' ),
251 self
._ sort
_ formats
( formats
)
253 subtitles
= self
._ extract
_ subtitles
( cdoc
, guid
)
255 virtual_id
= show_name
+ ' ' + epTitle
+ ' part ' + compat_str ( part_num
+ 1 )
260 'uploader' : show_name
,
261 'upload_date' : upload_date
,
262 'duration' : duration
,
263 'thumbnail' : thumbnail
,
264 'description' : description
,
265 'subtitles' : subtitles
,
272 'title' : show_name
+ ' ' + title
,
273 'description' : description
,