]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/instagram.py
a77f619d291ba4e02ecdc2c785795229fd0de2ca
   1 from __future__ 
import unicode_literals
 
   6 from .common 
import InfoExtractor
 
   7 from ..compat 
import compat_str
 
   9     get_element_by_attribute
, 
  16 class InstagramIE(InfoExtractor
): 
  17     _VALID_URL 
= r
'(?P<url>https?://(?:www\.)?instagram\.com/p/(?P<id>[^/?#&]+))' 
  19         'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc', 
  20         'md5': '0d2da106a9d2631273e192b372806516', 
  24             'title': 'Video by naomipq', 
  25             'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8', 
  26             'thumbnail': r
're:^https?://.*\.jpg', 
  27             'timestamp': 1371748545, 
  28             'upload_date': '20130620', 
  29             'uploader_id': 'naomipq', 
  30             'uploader': 'Naomi Leonor Phan-Quang', 
  37         'url': 'https://www.instagram.com/p/BA-pQFBG8HZ/?taken-by=britneyspears', 
  41             'title': 'Video by britneyspears', 
  42             'thumbnail': r
're:^https?://.*\.jpg', 
  43             'timestamp': 1453760977, 
  44             'upload_date': '20160125', 
  45             'uploader_id': 'britneyspears', 
  46             'uploader': 'Britney Spears', 
  52             'skip_download': True, 
  56         'url': 'https://www.instagram.com/p/BQ0eAlwhDrw/', 
  78             'title': 'Post by instagram', 
  79             'description': 'md5:0f9203fc6a2ce4d228da5754bcf54957', 
  82         'url': 'https://instagram.com/p/-Cmh1cukG2/', 
  83         'only_matching': True, 
  85         'url': 'http://instagram.com/p/9o6LshA7zy/embed/', 
  86         'only_matching': True, 
  90     def _extract_embed_url(webpage
): 
  92             r
'<iframe[^>]+src=(["\'])(?P
<url
>(?
:https?
:)?
//(?
:www\
.)?instagram\
.com
/p
/[^
/]+/embed
.*?
)\
1', 
  95             return mobj.group('url
') 
  97         blockquote_el = get_element_by_attribute( 
  98             'class', 'instagram
-media
', webpage) 
  99         if blockquote_el is None: 
 103             r'<a
[^
>]+href
=([\'"])(?P<link>[^\'"]+)\
1', blockquote_el) 
 105             return mobj.group('link
') 
 107     def _real_extract(self, url): 
 108         mobj = re.match(self._VALID_URL, url) 
 109         video_id = mobj.group('id') 
 110         url = mobj.group('url
') 
 112         webpage = self._download_webpage(url, video_id) 
 114         (video_url, description, thumbnail, timestamp, uploader, 
 115          uploader_id, like_count, comment_count, comments, height, 
 118         shared_data = self._parse_json( 
 120                 r'window\
._sharedData\s
*=\s
*({.+?
});', 
 121                 webpage, 'shared data
', default='{}'), 
 122             video_id, fatal=False) 
 126                 (lambda x: x['entry_data
']['PostPage
'][0]['graphql
']['shortcode_media
'], 
 127                  lambda x: x['entry_data
']['PostPage
'][0]['media
']), 
 130                 video_url = media.get('video_url
') 
 131                 height = int_or_none(media.get('dimensions
', {}).get('height
')) 
 132                 width = int_or_none(media.get('dimensions
', {}).get('width
')) 
 133                 description = try_get( 
 134                     media, lambda x: x['edge_media_to_caption
']['edges
'][0]['node
']['text
'], 
 135                     compat_str) or media.get('caption
') 
 136                 thumbnail = media.get('display_src
') 
 137                 timestamp = int_or_none(media.get('taken_at_timestamp
') or media.get('date
')) 
 138                 uploader = media.get('owner
', {}).get('full_name
') 
 139                 uploader_id = media.get('owner
', {}).get('username
') 
 141                 def get_count(key, kind): 
 142                     return int_or_none(try_get( 
 143                         media, (lambda x: x['edge_media_
%s' % key]['count
'], 
 144                                 lambda x: x['%ss' % kind]['count
']))) 
 145                 like_count = get_count('preview_like
', 'like
') 
 146                 comment_count = get_count('to_comment
', 'comment
') 
 149                     'author
': comment.get('user
', {}).get('username
'), 
 150                     'author_id
': comment.get('user
', {}).get('id'), 
 151                     'id': comment.get('id'), 
 152                     'text
': comment.get('text
'), 
 153                     'timestamp
': int_or_none(comment.get('created_at
')), 
 154                 } for comment in media.get( 
 155                     'comments
', {}).get('nodes
', []) if comment.get('text
')] 
 158                         media, lambda x: x['edge_sidecar_to_children
']['edges
'], 
 162                         for edge_num, edge in enumerate(edges, start=1): 
 163                             node = try_get(edge, lambda x: x['node
'], dict) 
 166                             node_video_url = try_get(node, lambda x: x['video_url
'], compat_str) 
 167                             if not node_video_url: 
 170                                 'id': node.get('shortcode
') or node['id'], 
 171                                 'title
': 'Video 
%d' % edge_num, 
 172                                 'url
': node_video_url, 
 173                                 'thumbnail
': node.get('display_url
'), 
 174                                 'width
': int_or_none(try_get(node, lambda x: x['dimensions
']['width
'])), 
 175                                 'height
': int_or_none(try_get(node, lambda x: x['dimensions
']['height
'])), 
 176                                 'view_count
': int_or_none(node.get('video_view_count
')), 
 178                         return self.playlist_result( 
 180                             'Post by 
%s' % uploader_id if uploader_id else None, 
 184             video_url = self._og_search_video_url(webpage, secure=False) 
 193             uploader_id = self._search_regex( 
 194                 r'"owner"\s
*:\s
*{\s
*"username"\s
*:\s
*"(.+?)"', 
 195                 webpage, 'uploader 
id', fatal=False) 
 198             description = self._search_regex( 
 199                 r'"caption"\s
*:\s
*"(.+?)"', webpage, 'description
', default=None) 
 200             if description is not None: 
 201                 description = lowercase_escape(description) 
 204             thumbnail = self._og_search_thumbnail(webpage) 
 210             'title
': 'Video by 
%s' % uploader_id, 
 211             'description
': description, 
 212             'thumbnail
': thumbnail, 
 213             'timestamp
': timestamp, 
 214             'uploader_id
': uploader_id, 
 215             'uploader
': uploader, 
 216             'like_count
': like_count, 
 217             'comment_count
': comment_count, 
 218             'comments
': comments, 
 222 class InstagramUserIE(InfoExtractor): 
 223     _VALID_URL = r'https?
://(?
:www\
.)?instagram\
.com
/(?P
<id>[^
/]{2,})/?
(?
:$|
[?
#])' 
 224     IE_DESC 
= 'Instagram user profile' 
 225     IE_NAME 
= 'instagram:user' 
 227         'url': 'https://instagram.com/porsche', 
 234             'extract_flat': True, 
 235             'skip_download': True, 
 240     def _entries(self
, uploader_id
): 
 246             return int_or_none(try_get( 
 247                 node
, lambda x
: x
['%ss' % kind
]['count'])) 
 249         for page_num 
in itertools
.count(1): 
 250             page 
= self
._download
_json
( 
 251                 'https://instagram.com/%s/' % uploader_id
, uploader_id
, 
 252                 note
='Downloading page %d' % page_num
, 
 253                 fatal
=False, query
=query
) 
 257             nodes 
= try_get(page
, lambda x
: x
['user']['media']['nodes'], list) 
 264                 node_id 
= node
.get('id') 
 268                 if node
.get('__typename') != 'GraphVideo' and node
.get('is_video') is not True: 
 270                 video_id 
= node
.get('code') 
 274                 info 
= self
.url_result( 
 275                     'https://instagram.com/p/%s/' % video_id
, 
 276                     ie
=InstagramIE
.ie_key(), video_id
=video_id
) 
 278                 description 
= try_get( 
 279                     node
, [lambda x
: x
['caption'], lambda x
: x
['text']['id']], 
 281                 thumbnail 
= node
.get('thumbnail_src') or node
.get('display_src') 
 282                 timestamp 
= int_or_none(node
.get('date')) 
 284                 comment_count 
= get_count('comment') 
 285                 like_count 
= get_count('like') 
 286                 view_count 
= int_or_none(node
.get('video_views')) 
 289                     'description': description
, 
 290                     'thumbnail': thumbnail
, 
 291                     'timestamp': timestamp
, 
 292                     'comment_count': comment_count
, 
 293                     'like_count': like_count
, 
 294                     'view_count': view_count
, 
 302             query
['max_id'] = max_id
 
 304     def _real_extract(self
, url
): 
 305         uploader_id 
= self
._match
_id
(url
) 
 306         return self
.playlist_result( 
 307             self
._entries
(uploader_id
), uploader_id
, uploader_id
)