]>
 
 
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/twitter.py 
 
 
 
 
 
 
 
 
   2  from  __future__ 
import  unicode_literals
 
   6  from  . common 
import  InfoExtractor
 
   7  from  .. compat 
import  compat_urlparse
 
  17  from  . periscope 
import  PeriscopeIE
 
  20  class  TwitterBaseIE ( InfoExtractor
):  
  21      def  _get_vmap_video_url ( self
,  vmap_url
,  video_id
):  
  22          vmap_data 
=  self
._ download
_ xml
( vmap_url
,  video_id
)  
  23          return  xpath_text ( vmap_data
,  './/MediaFile' ). strip ()  
  26  class  TwitterCardIE ( TwitterBaseIE
):  
  27      IE_NAME 
=  'twitter:card'  
  28      _VALID_URL 
=  r
'https?://(?:www\.)?twitter\.com/i/(?:cards/tfw/v1|videos(?:/tweet)?)/(?P<id>\d+)'  
  31              'url' :  'https://twitter.com/i/cards/tfw/v1/560070183650213889' ,  
  32              # MD5 checksums are different in different places  
  34                  'id' :  '560070183650213889' ,  
  36                  'title' :  'Twitter Card' ,  
  37                  'thumbnail' :  're:^https?://.*\.jpg$' ,  
  42              'url' :  'https://twitter.com/i/cards/tfw/v1/623160978427936768' ,  
  43              'md5' :  '7ee2a553b63d1bccba97fbed97d9e1c8' ,  
  45                  'id' :  '623160978427936768' ,  
  47                  'title' :  'Twitter Card' ,  
  48                  'thumbnail' :  're:^https?://.*\.jpg' ,  
  53              'url' :  'https://twitter.com/i/cards/tfw/v1/654001591733886977' ,  
  54              'md5' :  'b6d9683dd3f48e340ded81c0e917ad46' ,  
  58                  'title' :  'Ubuntu 11.10 Overview' ,  
  59                  'description' :  'md5:a831e97fa384863d6e26ce48d1c43376' ,  
  60                  'upload_date' :  '20111013' ,  
  61                  'uploader' :  'OMG! Ubuntu!' ,  
  62                  'uploader_id' :  'omgubuntu' ,  
  64              'add_ie' : [ 'Youtube' ],  
  67              'url' :  'https://twitter.com/i/cards/tfw/v1/665289828897005568' ,  
  68              'md5' :  'ab2745d0b0ce53319a534fccaa986439' ,  
  72                  'upload_date' :  '20151113' ,  
  73                  'uploader_id' :  '1189339351084113920' ,  
  74                  'uploader' :  'ArsenalTerje' ,  
  75                  'title' :  'Vine by ArsenalTerje' ,  
  79              'url' :  'https://twitter.com/i/videos/tweet/705235433198714880' ,  
  80              'md5' :  '3846d0a07109b5ab622425449b59049d' ,  
  82                  'id' :  '705235433198714880' ,  
  84                  'title' :  'Twitter web player' ,  
  85                  'thumbnail' :  're:^https?://.*\.jpg' ,  
  88              'url' :  'https://twitter.com/i/videos/752274308186120192' ,  
  89              'only_matching' :  True ,  
  93      def  _real_extract ( self
,  url
):  
  94          video_id 
=  self
._ match
_ id
( url
)  
 100          webpage 
=  self
._ download
_ webpage
( url
,  video_id
)  
 102          iframe_url 
=  self
._ html
_ search
_ regex
(  
 103              r
'<iframe[^>]+src="((?:https?:)?//(?:www.youtube.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"' ,  
 104              webpage
,  'video iframe' ,  default
= None )  
 106              return  self
. url_result ( iframe_url
)  
 108          config 
=  self
._ parse
_ json
( self
._ html
_ search
_ regex
(  
 109              r
'data-(?:player-)?config="([^"]+)"' ,  webpage
,  
 110              'data player config' ,  default
= '{}' ),  
 113          if  config
. get ( 'source_type' ) ==  'vine' :  
 114              return  self
. url_result ( config
[ 'player_url' ],  'Vine' )  
 116          periscope_url 
=  PeriscopeIE
._ extract
_u rl
( webpage
)  
 118              return  self
. url_result ( periscope_url
,  PeriscopeIE
. ie_key ())  
 120          def  _search_dimensions_in_video_url ( a_format
,  video_url
):  
 121              m 
=  re
. search ( r
'/(?P<width>\d+)x(?P<height>\d+)/' ,  video_url
)  
 124                      'width' :  int ( m
. group ( 'width' )),  
 125                      'height' :  int ( m
. group ( 'height' )),  
 128          video_url 
=  config
. get ( 'video_url' )  or  config
. get ( 'playlist' , [{}])[ 0 ]. get ( 'source' )  
 131              if  determine_ext ( video_url
) ==  'm3u8' :  
 132                  formats
. extend ( self
._ extract
_ m
3u8_ formats
( video_url
,  video_id
,  ext
= 'mp4' ,  m3u8_id
= 'hls' ))  
 138                  _search_dimensions_in_video_url ( f
,  video_url
)  
 142          vmap_url 
=  config
. get ( 'vmapUrl' )  or  config
. get ( 'vmap_url' )  
 145                  'url' :  self
._ get
_ vmap
_ video
_u rl
( vmap_url
,  video_id
),  
 150          for  entity 
in  config
. get ( 'status' , {}). get ( 'entities' , []):  
 151              if  'mediaInfo'  in  entity
:  
 152                  media_info 
=  entity
[ 'mediaInfo' ]  
 155              for  media_variant 
in  media_info
[ 'variants' ]:  
 156                  media_url 
=  media_variant
[ 'url' ]  
 157                  if  media_url
. endswith ( '.m3u8' ):  
 158                      formats
. extend ( self
._ extract
_ m
3u8_ formats
( media_url
,  video_id
,  ext
= 'mp4' ,  m3u8_id
= 'hls' ))  
 159                  elif  media_url
. endswith ( '.mpd' ):  
 160                      formats
. extend ( self
._ extract
_ mpd
_ formats
( media_url
,  video_id
,  mpd_id
= 'dash' ))  
 162                      vbr 
=  int_or_none ( media_variant
. get ( 'bitRate' ),  scale
= 1000 )  
 165                          'format_id' :  'http- %d '  %  vbr 
if  vbr 
else  'http' ,  
 168                      # Reported bitRate may be zero  
 169                      if not  a_format
[ 'vbr' ]:  
 172                      _search_dimensions_in_video_url ( a_format
,  media_url
)  
 174                      formats
. append ( a_format
)  
 176              duration 
=  float_or_none ( media_info
. get ( 'duration' , {}). get ( 'nanos' ),  scale
= 1 e9
)  
 178          self
._ sort
_ formats
( formats
)  
 180          title 
=  self
._ search
_ regex
( r
'<title>([^<]+)</title>' ,  webpage
,  'title' )  
 181          thumbnail 
=  config
. get ( 'posterImageUrl' )  or  config
. get ( 'image_src' )  
 182          duration 
=  float_or_none ( config
. get ( 'duration' ))  or  duration
 
 187              'thumbnail' :  thumbnail
,  
 188              'duration' :  duration
,  
 193  class  TwitterIE ( InfoExtractor
):  
 195      _VALID_URL 
=  r
'https?://(?:www\.|m\.|mobile\.)?twitter\.com/(?P<user_id>[^/]+)/status/(?P<id>\d+)'  
 196      _TEMPLATE_URL 
=  'https://twitter.com/ %s /status/ %s '  
 199          'url' :  'https://twitter.com/freethenipple/status/643211948184596480' ,  
 201              'id' :  '643211948184596480' ,  
 203              'title' :  'FREE THE NIPPLE - FTN supporters on Hollywood Blvd today!' ,  
 204              'thumbnail' :  're:^https?://.*\.jpg' ,  
 205              'description' :  'FREE THE NIPPLE on Twitter: "FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ"' ,  
 206              'uploader' :  'FREE THE NIPPLE' ,  
 207              'uploader_id' :  'freethenipple' ,  
 210              'skip_download' :  True ,   # requires ffmpeg  
 213          'url' :  'https://twitter.com/giphz/status/657991469417025536/photo/1' ,  
 214          'md5' :  'f36dcd5fb92bf7057f155e7d927eeb42' ,  
 216              'id' :  '657991469417025536' ,  
 218              'title' :  'Gifs - tu vai cai tu vai cai tu nao eh capaz disso tu vai cai' ,  
 219              'description' :  'Gifs on Twitter: "tu vai cai tu vai cai tu nao eh capaz disso tu vai cai https://t.co/tM46VHFlO5"' ,  
 220              'thumbnail' :  're:^https?://.*\.png' ,  
 222              'uploader_id' :  'giphz' ,  
 224          'expected_warnings' : [ 'height' ,  'width' ],  
 225          'skip' :  'Account suspended' ,  
 227          'url' :  'https://twitter.com/starwars/status/665052190608723968' ,  
 228          'md5' :  '39b7199856dee6cd4432e72c74bc69d4' ,  
 230              'id' :  '665052190608723968' ,  
 232              'title' :  'Star Wars - A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens.' ,  
 233              'description' :  'Star Wars on Twitter: "A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens."' ,  
 234              'uploader_id' :  'starwars' ,  
 235              'uploader' :  'Star Wars' ,  
 238          'url' :  'https://twitter.com/BTNBrentYarina/status/705235433198714880' ,  
 240              'id' :  '705235433198714880' ,  
 242              'title' :  'Brent Yarina - Khalil Iverson \' s missed highlight dunk. And made highlight dunk. In one highlight.' ,  
 243              'description' :  'Brent Yarina on Twitter: "Khalil Iverson \' s missed highlight dunk. And made highlight dunk. In one highlight."' ,  
 244              'uploader_id' :  'BTNBrentYarina' ,  
 245              'uploader' :  'Brent Yarina' ,  
 248              # The same video as https://twitter.com/i/videos/tweet/705235433198714880  
 249              # Test case of TwitterCardIE  
 250              'skip_download' :  True ,  
 253          'url' :  'https://twitter.com/jaydingeer/status/700207533655363584' ,  
 256              'id' :  '700207533655363584' ,  
 258              'title' :  'JG - BEAT PROD: @suhmeduh #Damndaniel' ,  
 259              'description' :  'JG on Twitter: "BEAT PROD: @suhmeduh  https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"' ,  
 260              'thumbnail' :  're:^https?://.*\.jpg' ,  
 262              'uploader_id' :  'jaydingeer' ,  
 265              'skip_download' :  True ,   # requires ffmpeg  
 268          'url' :  'https://twitter.com/Filmdrunk/status/713801302971588609' ,  
 269          'md5' :  '89a15ed345d13b86e9a5a5e051fa308a' ,  
 273              'title' :  'Dr.Pepperの飲み方 #japanese #バカ #ドクペ #電動ガン' ,  
 274              'uploader' :  'TAKUMA' ,  
 275              'uploader_id' :  '1004126642786242560' ,  
 276              'upload_date' :  '20140615' ,  
 280          'url' :  'https://twitter.com/captainamerica/status/719944021058060289' ,  
 282              'id' :  '719944021058060289' ,  
 284              'title' :  'Captain America - @King0fNerd Are you sure you made the right choice? Find out in theaters.' ,  
 285              'description' :  'Captain America on Twitter: "@King0fNerd Are you sure you made the right choice? Find out in theaters. https://t.co/GpgYi9xMJI"' ,  
 286              'uploader_id' :  'captainamerica' ,  
 287              'uploader' :  'Captain America' ,  
 290              'skip_download' :  True ,   # requires ffmpeg  
 293          'url' :  'https://twitter.com/OPP_HSD/status/779210622571536384' ,  
 295              'id' :  '1zqKVVlkqLaKB' ,  
 297              'title' :  'Sgt Kerry Schmidt - Ontario Provincial Police - Road rage, mischief, assault, rollover and fire in one occurrence' ,  
 298              'upload_date' :  '20160923' ,  
 299              'uploader_id' :  'OPP_HSD' ,  
 300              'uploader' :  'Sgt Kerry Schmidt - Ontario Provincial Police' ,  
 301              'timestamp' :  1474613214 ,  
 303          'add_ie' : [ 'Periscope' ],  
 306      def  _real_extract ( self
,  url
):  
 307          mobj 
=  re
. match ( self
._ VALID
_U RL
,  url
)  
 308          user_id 
=  mobj
. group ( 'user_id' )  
 309          twid 
=  mobj
. group ( 'id' )  
 311          webpage
,  urlh 
=  self
._ download
_ webpage
_ handle
(  
 312              self
._ TEMPLATE
_U RL 
% ( user_id
,  twid
),  twid
)  
 314          if  'twitter.com/account/suspended'  in  urlh
. geturl ():  
 315              raise  ExtractorError ( 'Account suspended by Twitter.' ,  expected
= True )  
 317          username 
=  remove_end ( self
._ og
_ search
_ title
( webpage
),  ' on Twitter' )  
 319          title 
=  description 
=  self
._ og
_ search
_ description
( webpage
). strip ( '' ). replace ( ' \n ' ,  ' ' ). strip ( '“”' )  
 321          # strip  'https -_t.co_BJYgOjSeGA' junk from filenames  
 322          title 
=  re
. sub ( r
'\s+(https?://[^ ]+)' ,  '' ,  title
)  
 325              'uploader_id' :  user_id
,  
 326              'uploader' :  username
,  
 328              'description' :  ' %s  on Twitter: " %s "'  % ( username
,  description
),  
 329              'title' :  username 
+  ' - '  +  title
,  
 332          mobj 
=  re
. search ( r
'''(?x)  
 333              <video[^>]+class="animated-gif"(?P<more_info>[^>]+)>\s*  
 334                  <source[^>]+video-src="(?P<url>[^"]+)"  
 338              more_info 
=  mobj
. group ( 'more_info' )  
 339              height 
=  int_or_none ( self
._ search
_ regex
(  
 340                  r
'data-height="(\d+)"' ,  more_info
,  'height' ,  fatal
= False ))  
 341              width 
=  int_or_none ( self
._ search
_ regex
(  
 342                  r
'data-width="(\d+)"' ,  more_info
,  'width' ,  fatal
= False ))  
 343              thumbnail 
=  self
._ search
_ regex
(  
 344                  r
'poster="([^"]+)"' ,  more_info
,  'poster' ,  fatal
= False )  
 347                  'url' :  mobj
. group ( 'url' ),  
 350                  'thumbnail' :  thumbnail
,  
 354          twitter_card_url 
=  None  
 355          if  'class="PlayableMedia'  in  webpage
:  
 356              twitter_card_url 
=  ' %s //twitter.com/i/videos/tweet/ %s '  % ( self
. http_scheme (),  twid
)  
 358              twitter_card_iframe_url 
=  self
._ search
_ regex
(  
 359                  r
'data-full-card-iframe-url=([\' "])(?P<url>(?:(?!\1).)+)\1',  
 360                  webpage, 'Twitter card iframe URL', default=None, group='url')  
 361              if twitter_card_iframe_url:  
 362                  twitter_card_url = compat_urlparse.urljoin(url, twitter_card_iframe_url)  
 366                  '_type': 'url_transparent',  
 367                  'ie_key': 'TwitterCard',  
 368                  'url': twitter_card_url,  
 372          raise ExtractorError('There \' s no video in this tweet.')  
 375  class TwitterAmplifyIE(TwitterBaseIE):  
 376      IE_NAME = 'twitter:amplify'  
 377      _VALID_URL = r'https?://amp\.twimg\.com/v/(?P<id>[0-9a-f\-] {36} )'  
 380          'url': 'https://amp.twimg.com/v/0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951',  
 381          'md5': '7df102d0b9fd7066b86f3159f8e81bf6',  
 383              'id': '0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951',  
 385              'title': 'Twitter Video',  
 386              'thumbnail': 're:^https?://.*',  
 390      def _real_extract(self, url):  
 391          video_id = self._match_id(url)  
 392          webpage = self._download_webpage(url, video_id)  
 394          vmap_url = self._html_search_meta(  
 395              'twitter:amplify:vmap', webpage, 'vmap url')  
 396          video_url = self._get_vmap_video_url(vmap_url, video_id)  
 399          thumbnail = self._html_search_meta(  
 400              'twitter:image:src', webpage, 'thumbnail', fatal=False)  
 402          def _find_dimension(target):  
 403              w = int_or_none(self._html_search_meta(  
 404                  'twitter: %s :width' % target, webpage, fatal=False))  
 405              h = int_or_none(self._html_search_meta(  
 406                  'twitter: %s :height' % target, webpage, fatal=False))  
 410              thumbnail_w, thumbnail_h = _find_dimension('image')  
 413                  'width': thumbnail_w,  
 414                  'height': thumbnail_h,  
 417          video_w, video_h = _find_dimension('player')  
 426              'title': 'Twitter Video',  
 428              'thumbnails': thumbnails,