]>
Raphaƫl G. Git Repositories - youtubedl/blob - youtube_dl/extractor/instagram.py
7e0e838f05a5e4a527cd9cf89e84f884ac1b2498
   1 from __future__ 
import unicode_literals
 
   8 from .common 
import InfoExtractor
 
  15     get_element_by_attribute
, 
  24 class InstagramIE(InfoExtractor
): 
  25     _VALID_URL 
= r
'(?P<url>https?://(?:www\.)?instagram\.com/p/(?P<id>[^/?#&]+))' 
  27         'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc', 
  28         'md5': '0d2da106a9d2631273e192b372806516', 
  32             'title': 'Video by naomipq', 
  33             'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8', 
  34             'thumbnail': r
're:^https?://.*\.jpg', 
  35             'timestamp': 1371748545, 
  36             'upload_date': '20130620', 
  37             'uploader_id': 'naomipq', 
  38             'uploader': 'Naomi Leonor Phan-Quang', 
  45         'url': 'https://www.instagram.com/p/BA-pQFBG8HZ/?taken-by=britneyspears', 
  49             'title': 'Video by britneyspears', 
  50             'thumbnail': r
're:^https?://.*\.jpg', 
  51             'timestamp': 1453760977, 
  52             'upload_date': '20160125', 
  53             'uploader_id': 'britneyspears', 
  54             'uploader': 'Britney Spears', 
  60             'skip_download': True, 
  64         'url': 'https://www.instagram.com/p/BQ0eAlwhDrw/', 
  86             'title': 'Post by instagram', 
  87             'description': 'md5:0f9203fc6a2ce4d228da5754bcf54957', 
  90         'url': 'https://instagram.com/p/-Cmh1cukG2/', 
  91         'only_matching': True, 
  93         'url': 'http://instagram.com/p/9o6LshA7zy/embed/', 
  94         'only_matching': True, 
  98     def _extract_embed_url(webpage
): 
 100             r
'<iframe[^>]+src=(["\'])(?P
<url
>(?
:https?
:)?
//(?
:www\
.)?instagram\
.com
/p
/[^
/]+/embed
.*?
)\
1', 
 103             return mobj.group('url
') 
 105         blockquote_el = get_element_by_attribute( 
 106             'class', 'instagram
-media
', webpage) 
 107         if blockquote_el is None: 
 111             r'<a
[^
>]+href
=([\'"])(?P<link>[^\'"]+)\
1', blockquote_el) 
 113             return mobj.group('link
') 
 115     def _real_extract(self, url): 
 116         mobj = re.match(self._VALID_URL, url) 
 117         video_id = mobj.group('id') 
 118         url = mobj.group('url
') 
 120         webpage = self._download_webpage(url, video_id) 
 122         (video_url, description, thumbnail, timestamp, uploader, 
 123          uploader_id, like_count, comment_count, comments, height, 
 126         shared_data = self._parse_json( 
 128                 r'window\
._sharedData\s
*=\s
*({.+?
});', 
 129                 webpage, 'shared data
', default='{}'), 
 130             video_id, fatal=False) 
 134                 (lambda x: x['entry_data
']['PostPage
'][0]['graphql
']['shortcode_media
'], 
 135                  lambda x: x['entry_data
']['PostPage
'][0]['media
']), 
 138                 video_url = media.get('video_url
') 
 139                 height = int_or_none(media.get('dimensions
', {}).get('height
')) 
 140                 width = int_or_none(media.get('dimensions
', {}).get('width
')) 
 141                 description = try_get( 
 142                     media, lambda x: x['edge_media_to_caption
']['edges
'][0]['node
']['text
'], 
 143                     compat_str) or media.get('caption
') 
 144                 thumbnail = media.get('display_src
') 
 145                 timestamp = int_or_none(media.get('taken_at_timestamp
') or media.get('date
')) 
 146                 uploader = media.get('owner
', {}).get('full_name
') 
 147                 uploader_id = media.get('owner
', {}).get('username
') 
 149                 def get_count(key, kind): 
 150                     return int_or_none(try_get( 
 151                         media, (lambda x: x['edge_media_
%s' % key]['count
'], 
 152                                 lambda x: x['%ss' % kind]['count
']))) 
 153                 like_count = get_count('preview_like
', 'like
') 
 154                 comment_count = get_count('to_comment
', 'comment
') 
 157                     'author
': comment.get('user
', {}).get('username
'), 
 158                     'author_id
': comment.get('user
', {}).get('id'), 
 159                     'id': comment.get('id'), 
 160                     'text
': comment.get('text
'), 
 161                     'timestamp
': int_or_none(comment.get('created_at
')), 
 162                 } for comment in media.get( 
 163                     'comments
', {}).get('nodes
', []) if comment.get('text
')] 
 166                         media, lambda x: x['edge_sidecar_to_children
']['edges
'], 
 170                         for edge_num, edge in enumerate(edges, start=1): 
 171                             node = try_get(edge, lambda x: x['node
'], dict) 
 174                             node_video_url = url_or_none(node.get('video_url
')) 
 175                             if not node_video_url: 
 178                                 'id': node.get('shortcode
') or node['id'], 
 179                                 'title
': 'Video 
%d' % edge_num, 
 180                                 'url
': node_video_url, 
 181                                 'thumbnail
': node.get('display_url
'), 
 182                                 'width
': int_or_none(try_get(node, lambda x: x['dimensions
']['width
'])), 
 183                                 'height
': int_or_none(try_get(node, lambda x: x['dimensions
']['height
'])), 
 184                                 'view_count
': int_or_none(node.get('video_view_count
')), 
 186                         return self.playlist_result( 
 188                             'Post by 
%s' % uploader_id if uploader_id else None, 
 192             video_url = self._og_search_video_url(webpage, secure=False) 
 201             uploader_id = self._search_regex( 
 202                 r'"owner"\s
*:\s
*{\s
*"username"\s
*:\s
*"(.+?)"', 
 203                 webpage, 'uploader 
id', fatal=False) 
 206             description = self._search_regex( 
 207                 r'"caption"\s
*:\s
*"(.+?)"', webpage, 'description
', default=None) 
 208             if description is not None: 
 209                 description = lowercase_escape(description) 
 212             thumbnail = self._og_search_thumbnail(webpage) 
 218             'title
': 'Video by 
%s' % uploader_id, 
 219             'description
': description, 
 220             'thumbnail
': thumbnail, 
 221             'timestamp
': timestamp, 
 222             'uploader_id
': uploader_id, 
 223             'uploader
': uploader, 
 224             'like_count
': like_count, 
 225             'comment_count
': comment_count, 
 226             'comments
': comments, 
 230 class InstagramUserIE(InfoExtractor): 
 231     _VALID_URL = r'https?
://(?
:www\
.)?instagram\
.com
/(?P
<id>[^
/]{2,})/?
(?
:$|
[?
#])' 
 232     IE_DESC 
= 'Instagram user profile' 
 233     IE_NAME 
= 'instagram:user' 
 235         'url': 'https://instagram.com/porsche', 
 242             'extract_flat': True, 
 243             'skip_download': True, 
 250     def _entries(self
, data
): 
 251         def get_count(suffix
): 
 252             return int_or_none(try_get( 
 253                 node
, lambda x
: x
['edge_media_' + suffix
]['count'])) 
 255         uploader_id 
= data
['entry_data']['ProfilePage'][0]['graphql']['user']['id'] 
 256         csrf_token 
= data
['config']['csrf_token'] 
 257         rhx_gis 
= data
.get('rhx_gis') or '3c7ca9dcefcf966d11dacf1f151335e8' 
 259         self
._set
_cookie
('instagram.com', 'ig_pr', '1') 
 262         for page_num 
in itertools
.count(1): 
 263             variables 
= json
.dumps({ 
 270                 gis_tmpls 
= [self
._gis
_tmpl
] 
 275                     '%s:%s' % (rhx_gis
, csrf_token
), 
 276                     '%s:%s:%s' % (rhx_gis
, csrf_token
, std_headers
['User-Agent']), 
 279             for gis_tmpl 
in gis_tmpls
: 
 281                     media 
= self
._download
_json
( 
 282                         'https://www.instagram.com/graphql/query/', uploader_id
, 
 283                         'Downloading JSON page %d' % page_num
, headers
={ 
 284                             'X-Requested-With': 'XMLHttpRequest', 
 285                             'X-Instagram-GIS': hashlib
.md5( 
 286                                 ('%s:%s' % (gis_tmpl
, variables
)).encode('utf-8')).hexdigest(), 
 288                             'query_hash': '42323d64886122307be10013ad2dcc44', 
 289                             'variables': variables
, 
 290                         })['data']['user']['edge_owner_to_timeline_media'] 
 291                     self
._gis
_tmpl 
= gis_tmpl
 
 293                 except ExtractorError 
as e
: 
 294                     if isinstance(e
.cause
, compat_HTTPError
) and e
.cause
.code 
== 403: 
 295                         if gis_tmpl 
!= gis_tmpls
[-1]: 
 299             edges 
= media
.get('edges') 
 300             if not edges 
or not isinstance(edges
, list): 
 304                 node 
= edge
.get('node') 
 305                 if not node 
or not isinstance(node
, dict): 
 307                 if node
.get('__typename') != 'GraphVideo' and node
.get('is_video') is not True: 
 309                 video_id 
= node
.get('shortcode') 
 313                 info 
= self
.url_result( 
 314                     'https://instagram.com/p/%s/' % video_id
, 
 315                     ie
=InstagramIE
.ie_key(), video_id
=video_id
) 
 317                 description 
= try_get( 
 318                     node
, lambda x
: x
['edge_media_to_caption']['edges'][0]['node']['text'], 
 320                 thumbnail 
= node
.get('thumbnail_src') or node
.get('display_src') 
 321                 timestamp 
= int_or_none(node
.get('taken_at_timestamp')) 
 323                 comment_count 
= get_count('to_comment') 
 324                 like_count 
= get_count('preview_like') 
 325                 view_count 
= int_or_none(node
.get('video_view_count')) 
 328                     'description': description
, 
 329                     'thumbnail': thumbnail
, 
 330                     'timestamp': timestamp
, 
 331                     'comment_count': comment_count
, 
 332                     'like_count': like_count
, 
 333                     'view_count': view_count
, 
 338             page_info 
= media
.get('page_info') 
 339             if not page_info 
or not isinstance(page_info
, dict): 
 342             has_next_page 
= page_info
.get('has_next_page') 
 343             if not has_next_page
: 
 346             cursor 
= page_info
.get('end_cursor') 
 347             if not cursor 
or not isinstance(cursor
, compat_str
): 
 350     def _real_extract(self
, url
): 
 351         username 
= self
._match
_id
(url
) 
 353         webpage 
= self
._download
_webpage
(url
, username
) 
 355         data 
= self
._parse
_json
( 
 357                 r
'sharedData\s*=\s*({.+?})\s*;\s*[<\n]', webpage
, 'data'), 
 360         return self
.playlist_result( 
 361             self
._entries
(data
), username
, username
)