# Source: Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/twitter.py
2 from __future__
import unicode_literals
6 from . common
import InfoExtractor
class TwitterBaseIE(InfoExtractor):
    """Shared helpers for the Twitter extractors in this module."""

    def _get_vmap_video_url(self, vmap_url, video_id):
        """Download a VMAP document and return the media URL it points to.

        vmap_url -- URL of the VMAP XML document to download.
        video_id -- identifier passed through to the download helper.

        Returns the text of the first ``MediaFile`` element found anywhere
        in the document, with surrounding whitespace removed.

        Raises ExtractorError when the document has no usable ``MediaFile``
        text.
        """
        vmap_data = self._download_xml(vmap_url, video_id)
        media_url = xpath_text(vmap_data, './/MediaFile')
        # NOTE(review): xpath_text appears to return None when the node is
        # missing (non-fatal lookup) -- confirm against the utils helper.
        # Calling .strip() on that result would fail with an opaque
        # AttributeError, so report the problem explicitly instead.
        if media_url is None:
            raise ExtractorError(
                'Unable to find a MediaFile entry in the VMAP document')
        return media_url.strip()
23 class TwitterCardIE ( TwitterBaseIE
):
24 IE_NAME
= 'twitter:card'
25 _VALID_URL
= r
'https?://(?:www\.)?twitter\.com/i/cards/tfw/v1/(?P<id>\d+)'
28 'url' : 'https://twitter.com/i/cards/tfw/v1/560070183650213889' ,
29 # MD5 checksums are different in different places
31 'id' : '560070183650213889' ,
33 'title' : 'TwitterCard' ,
34 'thumbnail' : 're:^https?://.*\.jpg$' ,
39 'url' : 'https://twitter.com/i/cards/tfw/v1/623160978427936768' ,
40 'md5' : '7ee2a553b63d1bccba97fbed97d9e1c8' ,
42 'id' : '623160978427936768' ,
44 'title' : 'TwitterCard' ,
45 'thumbnail' : 're:^https?://.*\.jpg' ,
50 'url' : 'https://twitter.com/i/cards/tfw/v1/654001591733886977' ,
51 'md5' : 'd4724ffe6d2437886d004fa5de1043b3' ,
55 'title' : 'Ubuntu 11.10 Overview' ,
56 'description' : 'Take a quick peek at what \' s new and improved in Ubuntu 11.10. \n\n Once installed take a look at 10 Things to Do After Installing: http://www.omgubuntu.co.uk/2011/10/10-things-to-do-after-installing-ubuntu-11-10/' ,
57 'upload_date' : '20111013' ,
58 'uploader' : 'OMG! Ubuntu!' ,
59 'uploader_id' : 'omgubuntu' ,
61 'add_ie' : [ 'Youtube' ],
64 'url' : 'https://twitter.com/i/cards/tfw/v1/665289828897005568' ,
65 'md5' : 'ab2745d0b0ce53319a534fccaa986439' ,
69 'upload_date' : '20151113' ,
70 'uploader_id' : '1189339351084113920' ,
71 'uploader' : 'ArsenalTerje' ,
72 'title' : 'Vine by ArsenalTerje' ,
78 def _real_extract ( self
, url
):
79 video_id
= self
._ match
_ id
( url
)
81 # Different formats served for different User-Agents
83 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)' , # mp4
84 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0' , # webm
89 for user_agent
in USER_AGENTS
:
90 request
= sanitized_Request ( url
)
91 request
. add_header ( 'User-Agent' , user_agent
)
92 webpage
= self
._ download
_ webpage
( request
, video_id
)
94 iframe_url
= self
._ html
_ search
_ regex
(
95 r
'<iframe[^>]+src="((?:https?:)?//(?:www.youtube.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"' ,
96 webpage
, 'video iframe' , default
= None )
98 return self
. url_result ( iframe_url
)
100 config
= self
._ parse
_ json
( self
._ html
_ search
_ regex
(
101 r
'data-player-config="([^"]+)"' , webpage
, 'data player config' ),
103 if 'playlist' not in config
:
104 if 'vmapUrl' in config
:
106 'url' : self
._ get
_ vmap
_ video
_u rl
( config
[ 'vmapUrl' ], video_id
),
108 break # same video regardless of UA
111 video_url
= config
[ 'playlist' ][ 0 ][ 'source' ]
117 m
= re
. search ( r
'/(?P<width>\d+)x(?P<height>\d+)/' , video_url
)
120 'width' : int ( m
. group ( 'width' )),
121 'height' : int ( m
. group ( 'height' )),
124 self
._ sort
_ formats
( formats
)
126 thumbnail
= config
. get ( 'posterImageUrl' )
127 duration
= float_or_none ( config
. get ( 'duration' ))
131 'title' : 'TwitterCard' ,
132 'thumbnail' : thumbnail
,
133 'duration' : duration
,
138 class TwitterIE ( InfoExtractor
):
140 _VALID_URL
= r
'https?://(?:www\.|m\.|mobile\.)?twitter\.com/(?P<user_id>[^/]+)/status/(?P<id>\d+)'
141 _TEMPLATE_URL
= 'https://twitter.com/ %s /status/ %s '
144 'url' : 'https://twitter.com/freethenipple/status/643211948184596480' ,
145 # MD5 checksums are different in different places
147 'id' : '643211948184596480' ,
149 'title' : 'FREE THE NIPPLE - FTN supporters on Hollywood Blvd today!' ,
150 'thumbnail' : 're:^https?://.*\.jpg' ,
152 'description' : 'FREE THE NIPPLE on Twitter: "FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ"' ,
153 'uploader' : 'FREE THE NIPPLE' ,
154 'uploader_id' : 'freethenipple' ,
157 'url' : 'https://twitter.com/giphz/status/657991469417025536/photo/1' ,
158 'md5' : 'f36dcd5fb92bf7057f155e7d927eeb42' ,
160 'id' : '657991469417025536' ,
162 'title' : 'Gifs - tu vai cai tu vai cai tu nao eh capaz disso tu vai cai' ,
163 'description' : 'Gifs on Twitter: "tu vai cai tu vai cai tu nao eh capaz disso tu vai cai https://t.co/tM46VHFlO5"' ,
164 'thumbnail' : 're:^https?://.*\.png' ,
166 'uploader_id' : 'giphz' ,
168 'expected_warnings' : [ 'height' , 'width' ],
170 'url' : 'https://twitter.com/starwars/status/665052190608723968' ,
171 'md5' : '39b7199856dee6cd4432e72c74bc69d4' ,
173 'id' : '665052190608723968' ,
175 'title' : 'Star Wars - A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens.' ,
176 'description' : 'Star Wars on Twitter: "A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens."' ,
177 'uploader_id' : 'starwars' ,
178 'uploader' : 'Star Wars' ,
182 def _real_extract ( self
, url
):
183 mobj
= re
. match ( self
._ VALID
_U RL
, url
)
184 user_id
= mobj
. group ( 'user_id' )
185 twid
= mobj
. group ( 'id' )
187 webpage
= self
._ download
_ webpage
( self
._ TEMPLATE
_U RL
% ( user_id
, twid
), twid
)
189 username
= remove_end ( self
._ og
_ search
_ title
( webpage
), ' on Twitter' )
191 title
= description
= self
._ og
_ search
_ description
( webpage
). strip ( '' ). replace ( ' \n ' , ' ' ). strip ( '“”' )
193 # strip 'https -_t.co_BJYgOjSeGA' junk from filenames
194 title
= re
. sub ( r
'\s+(https?://[^ ]+)' , '' , title
)
197 'uploader_id' : user_id
,
198 'uploader' : username
,
200 'description' : ' %s on Twitter: " %s "' % ( username
, description
),
201 'title' : username
+ ' - ' + title
,
204 card_id
= self
._ search
_ regex
(
205 r
'["\' ]/ i
/ cards
/ tfw
/ v1
/( \d
+) ', webpage, ' twitter card url
', default=None)
207 card_url = ' https
:// twitter
. com
/ i
/ cards
/ tfw
/ v1
/ ' + card_id
209 ' _type
': ' url_transparent
',
210 ' ie_key
': ' TwitterCard
',
215 mobj = re.search(r'''(?x)
216 <video[^>]+class="animated-gif"(?P<more_info>[^>]+)>\s*
217 <source[^>]+video-src="(?P<url>[^"]+)"
221 more_info = mobj.group(' more_info
')
222 height = int_or_none(self._search_regex(
223 r' data
- height
= "(\d+)" ', more_info, ' height
', fatal=False))
224 width = int_or_none(self._search_regex(
225 r' data
- width
= "(\d+)" ', more_info, ' width
', fatal=False))
226 thumbnail = self._search_regex(
227 r' poster
= "([^" ]+) "', more_info, 'poster', fatal=False)
230 'url': mobj.group('url'),
233 'thumbnail': thumbnail,
237 raise ExtractorError('There \' s no video in this tweet.')
240 class TwitterAmplifyIE(TwitterBaseIE):
241 IE_NAME = 'twitter:amplify'
242 _VALID_URL = 'https?://amp\.twimg\.com/v/(?P<id>[0-9a-f\-] {36} )'
245 'url': 'https://amp.twimg.com/v/0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951',
246 'md5': '7df102d0b9fd7066b86f3159f8e81bf6',
248 'id': '0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951',
250 'title': 'Twitter Video',
251 'thumbnail': 're:^https?://.*',
255 def _real_extract(self, url):
256 video_id = self._match_id(url)
257 webpage = self._download_webpage(url, video_id)
259 vmap_url = self._html_search_meta(
260 'twitter:amplify:vmap', webpage, 'vmap url')
261 video_url = self._get_vmap_video_url(vmap_url, video_id)
264 thumbnail = self._html_search_meta(
265 'twitter:image:src', webpage, 'thumbnail', fatal=False)
267 def _find_dimension(target):
268 w = int_or_none(self._html_search_meta(
269 'twitter: %s :width' % target, webpage, fatal=False))
270 h = int_or_none(self._html_search_meta(
271 'twitter: %s :height' % target, webpage, fatal=False))
275 thumbnail_w, thumbnail_h = _find_dimension('image')
278 'width': thumbnail_w,
279 'height': thumbnail_h,
282 video_w, video_h = _find_dimension('player')
291 'title': 'Twitter Video',
293 'thumbnails': thumbnails,