]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/twitter.py 
cebb6238c561cbfdf77c5ada514d55c216c19a34
   2  from  __future__ 
import  unicode_literals
   6  from  . common 
import  InfoExtractor
   7  from  .. compat 
import  compat_urlparse
  19  from  . periscope 
import  PeriscopeIE
  22  class  TwitterBaseIE ( InfoExtractor
):   23      def  _extract_formats_from_vmap_url ( self
,  vmap_url
,  video_id
):   24          vmap_data 
=  self
._ download
_ xml
( vmap_url
,  video_id
)   25          video_url 
=  xpath_text ( vmap_data
,  './/MediaFile' ). strip ()   26          if  determine_ext ( video_url
) ==  'm3u8' :   27              return  self
._ extract
_ m
3u8_ formats
(   28                  video_url
,  video_id
,  ext
= 'mp4' ,  m3u8_id
= 'hls' ,   29                  entry_protocol
= 'm3u8_native' )   35      def  _search_dimensions_in_video_url ( a_format
,  video_url
):   36          m 
=  re
. search ( r
'/(?P<width>\d+)x(?P<height>\d+)/' ,  video_url
)   39                  'width' :  int ( m
. group ( 'width' )),   40                  'height' :  int ( m
. group ( 'height' )),   44  class  TwitterCardIE ( TwitterBaseIE
):   45      IE_NAME 
=  'twitter:card'   46      _VALID_URL 
=  r
'https?://(?:www\.)?twitter\.com/i/(?P<path>cards/tfw/v1|videos(?:/tweet)?)/(?P<id>\d+)'   49              'url' :  'https://twitter.com/i/cards/tfw/v1/560070183650213889' ,   50              # MD5 checksums are different in different places   52                  'id' :  '560070183650213889' ,   54                  'title' :  'Twitter web player' ,   55                  'thumbnail' :  r
're:^https?://.*\.jpg$' ,   60              'url' :  'https://twitter.com/i/cards/tfw/v1/623160978427936768' ,   61              'md5' :  '7ee2a553b63d1bccba97fbed97d9e1c8' ,   63                  'id' :  '623160978427936768' ,   65                  'title' :  'Twitter web player' ,   66                  'thumbnail' :  r
're:^https?://.*$' ,   70              'url' :  'https://twitter.com/i/cards/tfw/v1/654001591733886977' ,   71              'md5' :  'b6d9683dd3f48e340ded81c0e917ad46' ,   75                  'title' :  'Ubuntu 11.10 Overview' ,   76                  'description' :  'md5:a831e97fa384863d6e26ce48d1c43376' ,   77                  'upload_date' :  '20111013' ,   78                  'uploader' :  'OMG! Ubuntu!' ,   79                  'uploader_id' :  'omgubuntu' ,   81              'add_ie' : [ 'Youtube' ],   84              'url' :  'https://twitter.com/i/cards/tfw/v1/665289828897005568' ,   85              'md5' :  '6dabeaca9e68cbb71c99c322a4b42a11' ,   89                  'upload_date' :  '20151113' ,   90                  'uploader_id' :  '1189339351084113920' ,   91                  'uploader' :  'ArsenalTerje' ,   92                  'title' :  'Vine by ArsenalTerje' ,   93                  'timestamp' :  1447451307 ,   97              'url' :  'https://twitter.com/i/videos/tweet/705235433198714880' ,   98              'md5' :  '884812a2adc8aaf6fe52b15ccbfa3b88' ,  100                  'id' :  '705235433198714880' ,  102                  'title' :  'Twitter web player' ,  103                  'thumbnail' :  r
're:^https?://.*' ,  106              'url' :  'https://twitter.com/i/videos/752274308186120192' ,  107              'only_matching' :  True ,  111      _API_BASE 
=  'https://api.twitter.com/1.1'  113      def  _parse_media_info ( self
,  media_info
,  video_id
):  115          for  media_variant 
in  media_info
. get ( 'variants' , []):  116              media_url 
=  media_variant
[ 'url' ]  117              if  media_url
. endswith ( '.m3u8' ):  118                  formats
. extend ( self
._ extract
_ m
3u8_ formats
( media_url
,  video_id
,  ext
= 'mp4' ,  m3u8_id
= 'hls' ))  119              elif  media_url
. endswith ( '.mpd' ):  120                  formats
. extend ( self
._ extract
_ mpd
_ formats
( media_url
,  video_id
,  mpd_id
= 'dash' ))  122                  tbr 
=  int_or_none ( dict_get ( media_variant
, ( 'bitRate' ,  'bitrate' )),  scale
= 1000 )  125                      'format_id' :  'http- %d '  %  tbr 
if  tbr 
else  'http' ,  128                  # Reported bitRate may be zero  129                  if not  a_format
[ 'tbr' ]:  132                  self
._ search
_ dimensions
_ in
_ video
_u rl
( a_format
,  media_url
)  134                  formats
. append ( a_format
)  137      def  _extract_mobile_formats ( self
,  username
,  video_id
):  138          webpage 
=  self
._ download
_ webpage
(  139              'https://mobile.twitter.com/ %s /status/ %s '  % ( username
,  video_id
),  140              video_id
,  'Downloading mobile webpage' ,  142                  # A recent mobile UA is necessary for `gt` cookie  143                  'User-Agent' :  'Mozilla/5.0 (Android 6.0.1; Mobile; rv:54.0) Gecko/54.0 Firefox/54.0' ,  145          main_script_url 
=  self
._ html
_ search
_ regex
(  146              r
'<script[^>]+src="([^"]+main\.[^"]+)"' ,  webpage
,  'main script URL' )  147          main_script 
=  self
._ download
_ webpage
(  148              main_script_url
,  video_id
,  'Downloading main script' )  149          bearer_token 
=  self
._ search
_ regex
(  150              r
'BEARER_TOKEN\s*:\s*"([^"]+)"' ,  151              main_script
,  'bearer token' )  152          # https://developer.twitter.com/en/docs/tweets/post-and-engage/api-reference/get-statuses-show-id  153          api_data 
=  self
._ download
_ json
(  154              ' %s /statuses/show/ %s .json'  % ( self
._ API
_ BASE
,  video_id
),  155              video_id
,  'Downloading API data' ,  157                  'Authorization' :  'Bearer '  +  bearer_token
,  159          media_info 
=  try_get ( api_data
,  lambda  o
:  o
[ 'extended_entities' ][ 'media' ][ 0 ][ 'video_info' ])  or  {}  160          return  self
._ parse
_ media
_ info
( media_info
,  video_id
)  162      def  _real_extract ( self
,  url
):  163          path
,  video_id 
=  re
. search ( self
._ VALID
_U RL
,  url
). groups ()  170          if  path
. startswith ( 'cards/' ):  171              urls
. append ( 'https://twitter.com/i/videos/'  +  video_id
)  174              webpage 
=  self
._ download
_ webpage
(  175                  u
,  video_id
,  headers
={ 'Referer' :  'https://twitter.com/' })  177              iframe_url 
=  self
._ html
_ search
_ regex
(  178                  r
'<iframe[^>]+src="((?:https?:)?//(?:www\.youtube\.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"' ,  179                  webpage
,  'video iframe' ,  default
= None )  181                  return  self
. url_result ( iframe_url
)  183              config 
=  self
._ parse
_ json
( self
._ html
_ search
_ regex
(  184                  r
'data-(?:player-)?config="([^"]+)"' ,  webpage
,  185                  'data player config' ,  default
= '{}' ),  188              if  config
. get ( 'source_type' ) ==  'vine' :  189                  return  self
. url_result ( config
[ 'player_url' ],  'Vine' )  191              periscope_url 
=  PeriscopeIE
._ extract
_u rl
( webpage
)  193                  return  self
. url_result ( periscope_url
,  PeriscopeIE
. ie_key ())  195              video_url 
=  config
. get ( 'video_url' )  or  config
. get ( 'playlist' , [{}])[ 0 ]. get ( 'source' )  198                  if  determine_ext ( video_url
) ==  'm3u8' :  199                      formats
. extend ( self
._ extract
_ m
3u8_ formats
( video_url
,  video_id
,  ext
= 'mp4' ,  m3u8_id
= 'hls' ))  205                      self
._ search
_ dimensions
_ in
_ video
_u rl
( f
,  video_url
)  209              vmap_url 
=  config
. get ( 'vmapUrl' )  or  config
. get ( 'vmap_url' )  212                      self
._ extract
_ formats
_ from
_ vmap
_u rl
( vmap_url
,  video_id
))  216              for  entity 
in  config
. get ( 'status' , {}). get ( 'entities' , []):  217                  if  'mediaInfo'  in  entity
:  218                      media_info 
=  entity
[ 'mediaInfo' ]  221                  formats
. extend ( self
._ parse
_ media
_ info
( media_info
,  video_id
))  222                  duration 
=  float_or_none ( media_info
. get ( 'duration' , {}). get ( 'nanos' ),  scale
= 1 e9
)  224              username 
=  config
. get ( 'user' , {}). get ( 'screen_name' )  226                  formats
. extend ( self
._ extract
_ mobile
_ formats
( username
,  video_id
))  229                  title 
=  self
._ search
_ regex
( r
'<title>([^<]+)</title>' ,  webpage
,  'title' )  230                  thumbnail 
=  config
. get ( 'posterImageUrl' )  or  config
. get ( 'image_src' )  231                  duration 
=  float_or_none ( config
. get ( 'duration' ),  scale
= 1000 )  or  duration
 236                  'Authorization' :  'Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw' ,  239              ct0 
=  self
._ get
_ cookies
( url
). get ( 'ct0' )  241                  headers
[ 'csrf_token' ] =  ct0
. value
 242              guest_token 
=  self
._ download
_ json
(  243                  ' %s /guest/activate.json'  %  self
._ API
_ BASE
,  video_id
,  244                  'Downloading guest token' ,  data
= b
'' ,  245                  headers
= headers
)[ 'guest_token' ]  246              headers
[ 'x-guest-token' ] =  guest_token
 247              self
._ set
_ cookie
( 'api.twitter.com' ,  'gt' ,  guest_token
)  248              config 
=  self
._ download
_ json
(  249                  ' %s /videos/tweet/config/ %s .json'  % ( self
._ API
_ BASE
,  video_id
),  250                  video_id
,  headers
= headers
)  251              track 
=  config
[ 'track' ]  252              vmap_url 
=  track
. get ( 'vmapUrl' )  254                  formats 
=  self
._ extract
_ formats
_ from
_ vmap
_u rl
( vmap_url
,  video_id
)  256                  playback_url 
=  track
[ 'playbackUrl' ]  257                  if  determine_ext ( playback_url
) ==  'm3u8' :  258                      formats 
=  self
._ extract
_ m
3u8_ formats
(  259                          playback_url
,  video_id
,  'mp4' ,  260                          entry_protocol
= 'm3u8_native' ,  m3u8_id
= 'hls' )  265              title 
=  'Twitter web player'  266              thumbnail 
=  config
. get ( 'posterImage' )  267              duration 
=  float_or_none ( track
. get ( 'durationMs' ),  scale
= 1000 )  269          self
._ remove
_ duplicate
_ formats
( formats
)  270          self
._ sort
_ formats
( formats
)  275              'thumbnail' :  thumbnail
,  276              'duration' :  duration
,  281  class  TwitterIE ( InfoExtractor
):  283      _VALID_URL 
=  r
'https?://(?:www\.|m\.|mobile\.)?twitter\.com/(?:i/web|(?P<user_id>[^/]+))/status/(?P<id>\d+)'  284      _TEMPLATE_URL 
=  'https://twitter.com/ %s /status/ %s '  285      _TEMPLATE_STATUSES_URL 
=  'https://twitter.com/statuses/ %s '  288          'url' :  'https://twitter.com/freethenipple/status/643211948184596480' ,  290              'id' :  '643211948184596480' ,  292              'title' :  'FREE THE NIPPLE - FTN supporters on Hollywood Blvd today!' ,  293              'thumbnail' :  r
're:^https?://.*\.jpg' ,  294              'description' :  'FREE THE NIPPLE on Twitter: "FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ"' ,  295              'uploader' :  'FREE THE NIPPLE' ,  296              'uploader_id' :  'freethenipple' ,  300          'url' :  'https://twitter.com/giphz/status/657991469417025536/photo/1' ,  301          'md5' :  'f36dcd5fb92bf7057f155e7d927eeb42' ,  303              'id' :  '657991469417025536' ,  305              'title' :  'Gifs - tu vai cai tu vai cai tu nao eh capaz disso tu vai cai' ,  306              'description' :  'Gifs on Twitter: "tu vai cai tu vai cai tu nao eh capaz disso tu vai cai https://t.co/tM46VHFlO5"' ,  307              'thumbnail' :  r
're:^https?://.*\.png' ,  309              'uploader_id' :  'giphz' ,  311          'expected_warnings' : [ 'height' ,  'width' ],  312          'skip' :  'Account suspended' ,  314          'url' :  'https://twitter.com/starwars/status/665052190608723968' ,  316              'id' :  '665052190608723968' ,  318              'title' :  'Star Wars - A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens.' ,  319              'description' :  'Star Wars on Twitter: "A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens."' ,  320              'uploader_id' :  'starwars' ,  321              'uploader' :  'Star Wars' ,  324          'url' :  'https://twitter.com/BTNBrentYarina/status/705235433198714880' ,  326              'id' :  '705235433198714880' ,  328              'title' :  'Brent Yarina - Khalil Iverson \' s missed highlight dunk. And made highlight dunk. In one highlight.' ,  329              'description' :  'Brent Yarina on Twitter: "Khalil Iverson \' s missed highlight dunk. And made highlight dunk. In one highlight."' ,  330              'uploader_id' :  'BTNBrentYarina' ,  331              'uploader' :  'Brent Yarina' ,  334              # The same video as https://twitter.com/i/videos/tweet/705235433198714880  335              # Test case of TwitterCardIE  336              'skip_download' :  True ,  339          'url' :  'https://twitter.com/jaydingeer/status/700207533655363584' ,  341              'id' :  '700207533655363584' ,  343              'title' :  'JG - BEAT PROD: @suhmeduh #Damndaniel' ,  344              'description' :  'JG on Twitter: "BEAT PROD: @suhmeduh  https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"' ,  345              'thumbnail' :  r
're:^https?://.*\.jpg' ,  347              'uploader_id' :  'jaydingeer' ,  351          'url' :  'https://twitter.com/Filmdrunk/status/713801302971588609' ,  352          'md5' :  '89a15ed345d13b86e9a5a5e051fa308a' ,  356              'title' :  'Vince Mancini - Vine of the day' ,  357              'description' :  'Vince Mancini on Twitter: "Vine of the day https://t.co/xmTvRdqxWf"' ,  358              'uploader' :  'Vince Mancini' ,  359              'uploader_id' :  'Filmdrunk' ,  360              'timestamp' :  1402826626 ,  361              'upload_date' :  '20140615' ,  365          'url' :  'https://twitter.com/captainamerica/status/719944021058060289' ,  367              'id' :  '719944021058060289' ,  369              'title' :  'Captain America - @King0fNerd Are you sure you made the right choice? Find out in theaters.' ,  370              'description' :  'Captain America on Twitter: "@King0fNerd Are you sure you made the right choice? Find out in theaters. https://t.co/GpgYi9xMJI"' ,  371              'uploader_id' :  'captainamerica' ,  372              'uploader' :  'Captain America' ,  376          'url' :  'https://twitter.com/OPP_HSD/status/779210622571536384' ,  378              'id' :  '1zqKVVlkqLaKB' ,  380              'title' :  'Sgt Kerry Schmidt - LIVE on #Periscope: Road rage, mischief, assault, rollover and fire in one occurrence' ,  381              'description' :  'Sgt Kerry Schmidt on Twitter: "LIVE on #Periscope: Road rage, mischief, assault, rollover and fire in one occurrence  https://t.co/EKrVgIXF3s"' ,  382              'upload_date' :  '20160923' ,  383              'uploader_id' :  'OPP_HSD' ,  384              'uploader' :  'Sgt Kerry Schmidt' ,  385              'timestamp' :  1474613214 ,  387          'add_ie' : [ 'Periscope' ],  389          # has mp4 formats via mobile API  390          'url' :  'https://twitter.com/news_al3alm/status/852138619213144067' ,  392              'id' :  '852138619213144067' ,  394              'title' :  'عالم الأخبار - كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة' ,  395              'description' :  'عالم الأخبار on Twitter: "كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة   https://t.co/xg6OhpyKfN"' ,  396              'uploader' :  'عالم الأخبار' ,  397              'uploader_id' :  'news_al3alm' ,  401          'url' :  'https://twitter.com/i/web/status/910031516746514432' ,  403              'id' :  '910031516746514432' ,  405              'title' :  'Préfet de Guadeloupe - [Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre.' ,  406              'thumbnail' :  r
're:^https?://.*\.jpg' ,  407              'description' :  'Préfet de Guadeloupe on Twitter: "[Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre. https://t.co/mwx01Rs4lo"' ,  408              'uploader' :  'Préfet de Guadeloupe' ,  409              'uploader_id' :  'Prefet971' ,  413              'skip_download' :  True ,   # requires ffmpeg  416          # card via api.twitter.com/1.1/videos/tweet/config  417          'url' :  'https://twitter.com/LisPower1/status/1001551623938805763' ,  419              'id' :  '1001551623938805763' ,  421              'title' :  're:.*?Shep is on a roll today.*?' ,  422              'thumbnail' :  r
're:^https?://.*\.jpg' ,  423              'description' :  'md5:63b036c228772523ae1924d5f8e5ed6b' ,  424              'uploader' :  'Lis Power' ,  425              'uploader_id' :  'LisPower1' ,  429              'skip_download' :  True ,   # requires ffmpeg  432          'url' :  'https://twitter.com/foobar/status/1087791357756956680' ,  434              'id' :  '1087791357756956680' ,  436              'title' :  'Twitter - A new is coming.  Some of you got an opt-in to try it now. Check out the emoji button, quick keyboard shortcuts, upgraded trends, advanced search, and more. Let us know your thoughts!' ,  437              'thumbnail' :  r
're:^https?://.*\.jpg' ,  438              'description' :  'md5:66d493500c013e3e2d434195746a7f78' ,  439              'uploader' :  'Twitter' ,  440              'uploader_id' :  'Twitter' ,  445      def  _real_extract ( self
,  url
):  446          mobj 
=  re
. match ( self
._ VALID
_U RL
,  url
)  447          twid 
=  mobj
. group ( 'id' )  449          webpage
,  urlh 
=  self
._ download
_ webpage
_ handle
(  450              self
._ TEMPLATE
_ STATUSES
_U RL 
%  twid
,  twid
)  452          if  'twitter.com/account/suspended'  in  urlh
. geturl ():  453              raise  ExtractorError ( 'Account suspended by Twitter.' ,  expected
= True )  457          redirect_mobj 
=  re
. match ( self
._ VALID
_U RL
,  urlh
. geturl ())  459              user_id 
=  redirect_mobj
. group ( 'user_id' )  462              user_id 
=  mobj
. group ( 'user_id' )  464          username 
=  remove_end ( self
._ og
_ search
_ title
( webpage
),  ' on Twitter' )  466          title 
=  description 
=  self
._ og
_ search
_ description
( webpage
). strip ( '' ). replace ( ' \n ' ,  ' ' ). strip ( '“”' )  468          # strip  'https -_t.co_BJYgOjSeGA' junk from filenames  469          title 
=  re
. sub ( r
'\s+(https?://[^ ]+)' ,  '' ,  title
)  472              'uploader_id' :  user_id
,  473              'uploader' :  username
,  475              'description' :  ' %s  on Twitter: " %s "'  % ( username
,  description
),  476              'title' :  username 
+  ' - '  +  title
,  479          mobj 
=  re
. search ( r
'''(?x)  480              <video[^>]+class="animated-gif"(?P<more_info>[^>]+)>\s*  481                  <source[^>]+video-src="(?P<url>[^"]+)"  485              more_info 
=  mobj
. group ( 'more_info' )  486              height 
=  int_or_none ( self
._ search
_ regex
(  487                  r
'data-height="(\d+)"' ,  more_info
,  'height' ,  fatal
= False ))  488              width 
=  int_or_none ( self
._ search
_ regex
(  489                  r
'data-width="(\d+)"' ,  more_info
,  'width' ,  fatal
= False ))  490              thumbnail 
=  self
._ search
_ regex
(  491                  r
'poster="([^"]+)"' ,  more_info
,  'poster' ,  fatal
= False )  494                  'url' :  mobj
. group ( 'url' ),  497                  'thumbnail' :  thumbnail
,  501          twitter_card_url 
=  None  502          if  'class="PlayableMedia'  in  webpage
:  503              twitter_card_url 
=  ' %s //twitter.com/i/videos/tweet/ %s '  % ( self
. http_scheme (),  twid
)  505              twitter_card_iframe_url 
=  self
._ search
_ regex
(  506                  r
'data-full-card-iframe-url=([\' "])(?P<url>(?:(?!\1).)+)\1',  507                  webpage, 'Twitter card iframe URL', default=None, group='url')  508              if twitter_card_iframe_url:  509                  twitter_card_url = compat_urlparse.urljoin(url, twitter_card_iframe_url)  513                  '_type': 'url_transparent',  514                  'ie_key': 'TwitterCard',  515                  'url': twitter_card_url,  519          raise ExtractorError('There \' s no video in this tweet.')  522  class TwitterAmplifyIE(TwitterBaseIE):  523      IE_NAME = 'twitter:amplify'  524      _VALID_URL = r'https?://amp\.twimg\.com/v/(?P<id>[0-9a-f\-] {36} )'  527          'url': 'https://amp.twimg.com/v/0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951',  528          'md5': '7df102d0b9fd7066b86f3159f8e81bf6',  530              'id': '0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951',  532              'title': 'Twitter Video',  533              'thumbnail': 're:^https?://.*',  537      def _real_extract(self, url):  538          video_id = self._match_id(url)  539          webpage = self._download_webpage(url, video_id)  541          vmap_url = self._html_search_meta(  542              'twitter:amplify:vmap', webpage, 'vmap url')  543          formats = self._extract_formats_from_vmap_url(vmap_url, video_id)  546          thumbnail = self._html_search_meta(  547              'twitter:image:src', webpage, 'thumbnail', fatal=False)  549          def _find_dimension(target):  550              w = int_or_none(self._html_search_meta(  551                  'twitter: %s :width' % target, webpage, fatal=False))  552              h = int_or_none(self._html_search_meta(  553                  'twitter: %s :height' % target, webpage, fatal=False))  557              thumbnail_w, thumbnail_h = _find_dimension('image')  560                  'width': thumbnail_w,  561                  'height': thumbnail_h,  564          video_w, video_h = _find_dimension('player')  572              'title': 'Twitter Video',  574              'thumbnails': thumbnails,