]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/twitter.py
2 from __future__
import unicode_literals
6 from . common
import InfoExtractor
7 from .. compat
import compat_urlparse
19 from . periscope
import PeriscopeIE
22 class TwitterBaseIE ( InfoExtractor
):
23 def _extract_formats_from_vmap_url ( self
, vmap_url
, video_id
):
24 vmap_data
= self
._ download
_ xml
( vmap_url
, video_id
)
25 video_url
= xpath_text ( vmap_data
, './/MediaFile' ). strip ()
26 if determine_ext ( video_url
) == 'm3u8' :
27 return self
._ extract
_ m
3u8_ formats
(
28 video_url
, video_id
, ext
= 'mp4' , m3u8_id
= 'hls' ,
29 entry_protocol
= 'm3u8_native' )
35 def _search_dimensions_in_video_url ( a_format
, video_url
):
36 m
= re
. search ( r
'/(?P<width>\d+)x(?P<height>\d+)/' , video_url
)
39 'width' : int ( m
. group ( 'width' )),
40 'height' : int ( m
. group ( 'height' )),
44 class TwitterCardIE ( TwitterBaseIE
):
45 IE_NAME
= 'twitter:card'
46 _VALID_URL
= r
'https?://(?:www\.)?twitter\.com/i/(?P<path>cards/tfw/v1|videos(?:/tweet)?)/(?P<id>\d+)'
49 'url' : 'https://twitter.com/i/cards/tfw/v1/560070183650213889' ,
50 # MD5 checksums are different in different places
52 'id' : '560070183650213889' ,
54 'title' : 'Twitter web player' ,
55 'thumbnail' : r
're:^https?://.*\.jpg$' ,
60 'url' : 'https://twitter.com/i/cards/tfw/v1/623160978427936768' ,
61 'md5' : '7ee2a553b63d1bccba97fbed97d9e1c8' ,
63 'id' : '623160978427936768' ,
65 'title' : 'Twitter web player' ,
66 'thumbnail' : r
're:^https?://.*$' ,
70 'url' : 'https://twitter.com/i/cards/tfw/v1/654001591733886977' ,
71 'md5' : 'b6d9683dd3f48e340ded81c0e917ad46' ,
75 'title' : 'Ubuntu 11.10 Overview' ,
76 'description' : 'md5:a831e97fa384863d6e26ce48d1c43376' ,
77 'upload_date' : '20111013' ,
78 'uploader' : 'OMG! Ubuntu!' ,
79 'uploader_id' : 'omgubuntu' ,
81 'add_ie' : [ 'Youtube' ],
84 'url' : 'https://twitter.com/i/cards/tfw/v1/665289828897005568' ,
85 'md5' : '6dabeaca9e68cbb71c99c322a4b42a11' ,
89 'upload_date' : '20151113' ,
90 'uploader_id' : '1189339351084113920' ,
91 'uploader' : 'ArsenalTerje' ,
92 'title' : 'Vine by ArsenalTerje' ,
93 'timestamp' : 1447451307 ,
97 'url' : 'https://twitter.com/i/videos/tweet/705235433198714880' ,
98 'md5' : '884812a2adc8aaf6fe52b15ccbfa3b88' ,
100 'id' : '705235433198714880' ,
102 'title' : 'Twitter web player' ,
103 'thumbnail' : r
're:^https?://.*' ,
106 'url' : 'https://twitter.com/i/videos/752274308186120192' ,
107 'only_matching' : True ,
111 _API_BASE
= 'https://api.twitter.com/1.1'
113 def _parse_media_info ( self
, media_info
, video_id
):
115 for media_variant
in media_info
. get ( 'variants' , []):
116 media_url
= media_variant
[ 'url' ]
117 if media_url
. endswith ( '.m3u8' ):
118 formats
. extend ( self
._ extract
_ m
3u8_ formats
( media_url
, video_id
, ext
= 'mp4' , m3u8_id
= 'hls' ))
119 elif media_url
. endswith ( '.mpd' ):
120 formats
. extend ( self
._ extract
_ mpd
_ formats
( media_url
, video_id
, mpd_id
= 'dash' ))
122 tbr
= int_or_none ( dict_get ( media_variant
, ( 'bitRate' , 'bitrate' )), scale
= 1000 )
125 'format_id' : 'http- %d ' % tbr
if tbr
else 'http' ,
128 # Reported bitRate may be zero
129 if not a_format
[ 'tbr' ]:
132 self
._ search
_ dimensions
_ in
_ video
_u rl
( a_format
, media_url
)
134 formats
. append ( a_format
)
137 def _extract_mobile_formats ( self
, username
, video_id
):
138 webpage
= self
._ download
_ webpage
(
139 'https://mobile.twitter.com/ %s /status/ %s ' % ( username
, video_id
),
140 video_id
, 'Downloading mobile webpage' ,
142 # A recent mobile UA is necessary for `gt` cookie
143 'User-Agent' : 'Mozilla/5.0 (Android 6.0.1; Mobile; rv:54.0) Gecko/54.0 Firefox/54.0' ,
145 main_script_url
= self
._ html
_ search
_ regex
(
146 r
'<script[^>]+src="([^"]+main\.[^"]+)"' , webpage
, 'main script URL' )
147 main_script
= self
._ download
_ webpage
(
148 main_script_url
, video_id
, 'Downloading main script' )
149 bearer_token
= self
._ search
_ regex
(
150 r
'BEARER_TOKEN\s*:\s*"([^"]+)"' ,
151 main_script
, 'bearer token' )
152 # https://developer.twitter.com/en/docs/tweets/post-and-engage/api-reference/get-statuses-show-id
153 api_data
= self
._ download
_ json
(
154 ' %s /statuses/show/ %s .json' % ( self
._ API
_ BASE
, video_id
),
155 video_id
, 'Downloading API data' ,
157 'Authorization' : 'Bearer ' + bearer_token
,
159 media_info
= try_get ( api_data
, lambda o
: o
[ 'extended_entities' ][ 'media' ][ 0 ][ 'video_info' ]) or {}
160 return self
._ parse
_ media
_ info
( media_info
, video_id
)
162 def _real_extract ( self
, url
):
163 path
, video_id
= re
. search ( self
._ VALID
_U RL
, url
). groups ()
170 if path
. startswith ( 'cards/' ):
171 urls
. append ( 'https://twitter.com/i/videos/' + video_id
)
174 webpage
= self
._ download
_ webpage
(
175 u
, video_id
, headers
={ 'Referer' : 'https://twitter.com/' })
177 iframe_url
= self
._ html
_ search
_ regex
(
178 r
'<iframe[^>]+src="((?:https?:)?//(?:www\.youtube\.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"' ,
179 webpage
, 'video iframe' , default
= None )
181 return self
. url_result ( iframe_url
)
183 config
= self
._ parse
_ json
( self
._ html
_ search
_ regex
(
184 r
'data-(?:player-)?config="([^"]+)"' , webpage
,
185 'data player config' , default
= '{}' ),
188 if config
. get ( 'source_type' ) == 'vine' :
189 return self
. url_result ( config
[ 'player_url' ], 'Vine' )
191 periscope_url
= PeriscopeIE
._ extract
_u rl
( webpage
)
193 return self
. url_result ( periscope_url
, PeriscopeIE
. ie_key ())
195 video_url
= config
. get ( 'video_url' ) or config
. get ( 'playlist' , [{}])[ 0 ]. get ( 'source' )
198 if determine_ext ( video_url
) == 'm3u8' :
199 formats
. extend ( self
._ extract
_ m
3u8_ formats
( video_url
, video_id
, ext
= 'mp4' , m3u8_id
= 'hls' ))
205 self
._ search
_ dimensions
_ in
_ video
_u rl
( f
, video_url
)
209 vmap_url
= config
. get ( 'vmapUrl' ) or config
. get ( 'vmap_url' )
212 self
._ extract
_ formats
_ from
_ vmap
_u rl
( vmap_url
, video_id
))
216 for entity
in config
. get ( 'status' , {}). get ( 'entities' , []):
217 if 'mediaInfo' in entity
:
218 media_info
= entity
[ 'mediaInfo' ]
221 formats
. extend ( self
._ parse
_ media
_ info
( media_info
, video_id
))
222 duration
= float_or_none ( media_info
. get ( 'duration' , {}). get ( 'nanos' ), scale
= 1 e9
)
224 username
= config
. get ( 'user' , {}). get ( 'screen_name' )
226 formats
. extend ( self
._ extract
_ mobile
_ formats
( username
, video_id
))
229 title
= self
._ search
_ regex
( r
'<title>([^<]+)</title>' , webpage
, 'title' )
230 thumbnail
= config
. get ( 'posterImageUrl' ) or config
. get ( 'image_src' )
231 duration
= float_or_none ( config
. get ( 'duration' ), scale
= 1000 ) or duration
236 'Authorization' : 'Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw' ,
239 ct0
= self
._ get
_ cookies
( url
). get ( 'ct0' )
241 headers
[ 'csrf_token' ] = ct0
. value
242 guest_token
= self
._ download
_ json
(
243 ' %s /guest/activate.json' % self
._ API
_ BASE
, video_id
,
244 'Downloading guest token' , data
= b
'' ,
245 headers
= headers
)[ 'guest_token' ]
246 headers
[ 'x-guest-token' ] = guest_token
247 self
._ set
_ cookie
( 'api.twitter.com' , 'gt' , guest_token
)
248 config
= self
._ download
_ json
(
249 ' %s /videos/tweet/config/ %s .json' % ( self
._ API
_ BASE
, video_id
),
250 video_id
, headers
= headers
)
251 track
= config
[ 'track' ]
252 vmap_url
= track
. get ( 'vmapUrl' )
254 formats
= self
._ extract
_ formats
_ from
_ vmap
_u rl
( vmap_url
, video_id
)
256 playback_url
= track
[ 'playbackUrl' ]
257 if determine_ext ( playback_url
) == 'm3u8' :
258 formats
= self
._ extract
_ m
3u8_ formats
(
259 playback_url
, video_id
, 'mp4' ,
260 entry_protocol
= 'm3u8_native' , m3u8_id
= 'hls' )
265 title
= 'Twitter web player'
266 thumbnail
= config
. get ( 'posterImage' )
267 duration
= float_or_none ( track
. get ( 'durationMs' ), scale
= 1000 )
269 self
._ remove
_ duplicate
_ formats
( formats
)
270 self
._ sort
_ formats
( formats
)
275 'thumbnail' : thumbnail
,
276 'duration' : duration
,
281 class TwitterIE ( InfoExtractor
):
283 _VALID_URL
= r
'https?://(?:www\.|m\.|mobile\.)?twitter\.com/(?:i/web|(?P<user_id>[^/]+))/status/(?P<id>\d+)'
284 _TEMPLATE_URL
= 'https://twitter.com/ %s /status/ %s '
285 _TEMPLATE_STATUSES_URL
= 'https://twitter.com/statuses/ %s '
288 'url' : 'https://twitter.com/freethenipple/status/643211948184596480' ,
290 'id' : '643211948184596480' ,
292 'title' : 'FREE THE NIPPLE - FTN supporters on Hollywood Blvd today!' ,
293 'thumbnail' : r
're:^https?://.*\.jpg' ,
294 'description' : 'FREE THE NIPPLE on Twitter: "FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ"' ,
295 'uploader' : 'FREE THE NIPPLE' ,
296 'uploader_id' : 'freethenipple' ,
300 'url' : 'https://twitter.com/giphz/status/657991469417025536/photo/1' ,
301 'md5' : 'f36dcd5fb92bf7057f155e7d927eeb42' ,
303 'id' : '657991469417025536' ,
305 'title' : 'Gifs - tu vai cai tu vai cai tu nao eh capaz disso tu vai cai' ,
306 'description' : 'Gifs on Twitter: "tu vai cai tu vai cai tu nao eh capaz disso tu vai cai https://t.co/tM46VHFlO5"' ,
307 'thumbnail' : r
're:^https?://.*\.png' ,
309 'uploader_id' : 'giphz' ,
311 'expected_warnings' : [ 'height' , 'width' ],
312 'skip' : 'Account suspended' ,
314 'url' : 'https://twitter.com/starwars/status/665052190608723968' ,
316 'id' : '665052190608723968' ,
318 'title' : 'Star Wars - A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens.' ,
319 'description' : 'Star Wars on Twitter: "A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens."' ,
320 'uploader_id' : 'starwars' ,
321 'uploader' : 'Star Wars' ,
324 'url' : 'https://twitter.com/BTNBrentYarina/status/705235433198714880' ,
326 'id' : '705235433198714880' ,
328 'title' : 'Brent Yarina - Khalil Iverson \' s missed highlight dunk. And made highlight dunk. In one highlight.' ,
329 'description' : 'Brent Yarina on Twitter: "Khalil Iverson \' s missed highlight dunk. And made highlight dunk. In one highlight."' ,
330 'uploader_id' : 'BTNBrentYarina' ,
331 'uploader' : 'Brent Yarina' ,
334 # The same video as https://twitter.com/i/videos/tweet/705235433198714880
335 # Test case of TwitterCardIE
336 'skip_download' : True ,
339 'url' : 'https://twitter.com/jaydingeer/status/700207533655363584' ,
341 'id' : '700207533655363584' ,
343 'title' : 'JG - BEAT PROD: @suhmeduh #Damndaniel' ,
344 'description' : 'JG on Twitter: "BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"' ,
345 'thumbnail' : r
're:^https?://.*\.jpg' ,
347 'uploader_id' : 'jaydingeer' ,
351 'url' : 'https://twitter.com/Filmdrunk/status/713801302971588609' ,
352 'md5' : '89a15ed345d13b86e9a5a5e051fa308a' ,
356 'title' : 'Vince Mancini - Vine of the day' ,
357 'description' : 'Vince Mancini on Twitter: "Vine of the day https://t.co/xmTvRdqxWf"' ,
358 'uploader' : 'Vince Mancini' ,
359 'uploader_id' : 'Filmdrunk' ,
360 'timestamp' : 1402826626 ,
361 'upload_date' : '20140615' ,
365 'url' : 'https://twitter.com/captainamerica/status/719944021058060289' ,
367 'id' : '719944021058060289' ,
369 'title' : 'Captain America - @King0fNerd Are you sure you made the right choice? Find out in theaters.' ,
370 'description' : 'Captain America on Twitter: "@King0fNerd Are you sure you made the right choice? Find out in theaters. https://t.co/GpgYi9xMJI"' ,
371 'uploader_id' : 'captainamerica' ,
372 'uploader' : 'Captain America' ,
376 'url' : 'https://twitter.com/OPP_HSD/status/779210622571536384' ,
378 'id' : '1zqKVVlkqLaKB' ,
380 'title' : 'Sgt Kerry Schmidt - LIVE on #Periscope: Road rage, mischief, assault, rollover and fire in one occurrence' ,
381 'description' : 'Sgt Kerry Schmidt on Twitter: "LIVE on #Periscope: Road rage, mischief, assault, rollover and fire in one occurrence https://t.co/EKrVgIXF3s"' ,
382 'upload_date' : '20160923' ,
383 'uploader_id' : 'OPP_HSD' ,
384 'uploader' : 'Sgt Kerry Schmidt' ,
385 'timestamp' : 1474613214 ,
387 'add_ie' : [ 'Periscope' ],
389 # has mp4 formats via mobile API
390 'url' : 'https://twitter.com/news_al3alm/status/852138619213144067' ,
392 'id' : '852138619213144067' ,
394 'title' : 'عالم الأخبار - كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة' ,
395 'description' : 'عالم الأخبار on Twitter: "كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة https://t.co/xg6OhpyKfN"' ,
396 'uploader' : 'عالم الأخبار' ,
397 'uploader_id' : 'news_al3alm' ,
401 'url' : 'https://twitter.com/i/web/status/910031516746514432' ,
403 'id' : '910031516746514432' ,
405 'title' : 'Préfet de Guadeloupe - [Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre.' ,
406 'thumbnail' : r
're:^https?://.*\.jpg' ,
407 'description' : 'Préfet de Guadeloupe on Twitter: "[Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre. https://t.co/mwx01Rs4lo"' ,
408 'uploader' : 'Préfet de Guadeloupe' ,
409 'uploader_id' : 'Prefet971' ,
413 'skip_download' : True , # requires ffmpeg
416 # card via api.twitter.com/1.1/videos/tweet/config
417 'url' : 'https://twitter.com/LisPower1/status/1001551623938805763' ,
419 'id' : '1001551623938805763' ,
421 'title' : 're:.*?Shep is on a roll today.*?' ,
422 'thumbnail' : r
're:^https?://.*\.jpg' ,
423 'description' : 'md5:63b036c228772523ae1924d5f8e5ed6b' ,
424 'uploader' : 'Lis Power' ,
425 'uploader_id' : 'LisPower1' ,
429 'skip_download' : True , # requires ffmpeg
432 'url' : 'https://twitter.com/foobar/status/1087791357756956680' ,
434 'id' : '1087791357756956680' ,
436 'title' : 'Twitter - A new is coming. Some of you got an opt-in to try it now. Check out the emoji button, quick keyboard shortcuts, upgraded trends, advanced search, and more. Let us know your thoughts!' ,
437 'thumbnail' : r
're:^https?://.*\.jpg' ,
438 'description' : 'md5:66d493500c013e3e2d434195746a7f78' ,
439 'uploader' : 'Twitter' ,
440 'uploader_id' : 'Twitter' ,
445 def _real_extract ( self
, url
):
446 mobj
= re
. match ( self
._ VALID
_U RL
, url
)
447 twid
= mobj
. group ( 'id' )
449 webpage
, urlh
= self
._ download
_ webpage
_ handle
(
450 self
._ TEMPLATE
_ STATUSES
_U RL
% twid
, twid
)
452 if 'twitter.com/account/suspended' in urlh
. geturl ():
453 raise ExtractorError ( 'Account suspended by Twitter.' , expected
= True )
457 redirect_mobj
= re
. match ( self
._ VALID
_U RL
, urlh
. geturl ())
459 user_id
= redirect_mobj
. group ( 'user_id' )
462 user_id
= mobj
. group ( 'user_id' )
464 username
= remove_end ( self
._ og
_ search
_ title
( webpage
), ' on Twitter' )
466 title
= description
= self
._ og
_ search
_ description
( webpage
). strip ( '' ). replace ( ' \n ' , ' ' ). strip ( '“”' )
468 # strip 'https -_t.co_BJYgOjSeGA' junk from filenames
469 title
= re
. sub ( r
'\s+(https?://[^ ]+)' , '' , title
)
472 'uploader_id' : user_id
,
473 'uploader' : username
,
475 'description' : ' %s on Twitter: " %s "' % ( username
, description
),
476 'title' : username
+ ' - ' + title
,
479 mobj
= re
. search ( r
'''(?x)
480 <video[^>]+class="animated-gif"(?P<more_info>[^>]+)>\s*
481 <source[^>]+video-src="(?P<url>[^"]+)"
485 more_info
= mobj
. group ( 'more_info' )
486 height
= int_or_none ( self
._ search
_ regex
(
487 r
'data-height="(\d+)"' , more_info
, 'height' , fatal
= False ))
488 width
= int_or_none ( self
._ search
_ regex
(
489 r
'data-width="(\d+)"' , more_info
, 'width' , fatal
= False ))
490 thumbnail
= self
._ search
_ regex
(
491 r
'poster="([^"]+)"' , more_info
, 'poster' , fatal
= False )
494 'url' : mobj
. group ( 'url' ),
497 'thumbnail' : thumbnail
,
501 twitter_card_url
= None
502 if 'class="PlayableMedia' in webpage
:
503 twitter_card_url
= ' %s //twitter.com/i/videos/tweet/ %s ' % ( self
. http_scheme (), twid
)
505 twitter_card_iframe_url
= self
._ search
_ regex
(
506 r
'data-full-card-iframe-url=([\' "])(?P<url>(?:(?!\1).)+)\1',
507 webpage, 'Twitter card iframe URL', default=None, group='url')
508 if twitter_card_iframe_url:
509 twitter_card_url = compat_urlparse.urljoin(url, twitter_card_iframe_url)
513 '_type': 'url_transparent',
514 'ie_key': 'TwitterCard',
515 'url': twitter_card_url,
519 raise ExtractorError('There \' s no video in this tweet.')
522 class TwitterAmplifyIE(TwitterBaseIE):
523 IE_NAME = 'twitter:amplify'
524 _VALID_URL = r'https?://amp\.twimg\.com/v/(?P<id>[0-9a-f\-] {36} )'
527 'url': 'https://amp.twimg.com/v/0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951',
528 'md5': '7df102d0b9fd7066b86f3159f8e81bf6',
530 'id': '0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951',
532 'title': 'Twitter Video',
533 'thumbnail': 're:^https?://.*',
537 def _real_extract(self, url):
538 video_id = self._match_id(url)
539 webpage = self._download_webpage(url, video_id)
541 vmap_url = self._html_search_meta(
542 'twitter:amplify:vmap', webpage, 'vmap url')
543 formats = self._extract_formats_from_vmap_url(vmap_url, video_id)
546 thumbnail = self._html_search_meta(
547 'twitter:image:src', webpage, 'thumbnail', fatal=False)
549 def _find_dimension(target):
550 w = int_or_none(self._html_search_meta(
551 'twitter: %s :width' % target, webpage, fatal=False))
552 h = int_or_none(self._html_search_meta(
553 'twitter: %s :height' % target, webpage, fatal=False))
557 thumbnail_w, thumbnail_h = _find_dimension('image')
560 'width': thumbnail_w,
561 'height': thumbnail_h,
564 video_w, video_h = _find_dimension('player')
572 'title': 'Twitter Video',
574 'thumbnails': thumbnails,