Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/googleplus.py

   1 import datetime
   2 import re
   3
   4 from .common import InfoExtractor
   5 from ..utils import (
   6     ExtractorError,
   7 )
   8
   9
  10 class GooglePlusIE(InfoExtractor):
  11     """Information extractor for plus.google.com."""
  12
  13     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
  14     IE_NAME = u'plus.google'
  15
  16     def _real_extract(self, url):
  17         # Extract id from URL
  18         mobj = re.match(self._VALID_URL, url)
  19         if mobj is None:
  20             raise ExtractorError(u'Invalid URL: %s' % url)
  21
  22         post_url = mobj.group(0)
  23         video_id = mobj.group(1)
  24
  25         video_extension = 'flv'
  26
  27         # Step 1, Retrieve post webpage to extract further information
  28         webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
  29
  30         self.report_extraction(video_id)
  31
  32         # Extract update date
  33         upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
  34             webpage, u'upload date', fatal=False)
  35         if upload_date:
  36             # Convert timestring to a format suitable for filename
  37             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
  38             upload_date = upload_date.strftime('%Y%m%d')
  39
  40         # Extract uploader
  41         uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
  42             webpage, u'uploader', fatal=False)
  43
  44         # Extract title
  45         # Get the first line for title
  46         video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
  47             webpage, 'title', default=u'NA')
  48
  49         # Step 2, Stimulate clicking the image box to launch video
  50         video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
  51             webpage, u'video page URL')
  52         webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
  53
  54         # Extract video links on video page
  55         """Extract video links of all sizes"""
  56         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
  57         mobj = re.findall(pattern, webpage)
  58         if len(mobj) == 0:
  59             raise ExtractorError(u'Unable to extract video links')
  60
  61         # Sort in resolution
  62         links = sorted(mobj)
  63
  64         # Choose the lowest of the sort, i.e. highest resolution
  65         video_url = links[-1]
  66         # Only get the url. The resolution part in the tuple has no use anymore
  67         video_url = video_url[-1]
  68         # Treat escaped \u0026 style hex
  69         try:
  70             video_url = video_url.decode("unicode_escape")
  71         except AttributeError: # Python 3
  72             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
  73
  74
  75         return [{
  76             'id':       video_id,
  77             'url':      video_url,
  78             'uploader': uploader,
  79             'upload_date':  upload_date,
  80             'title':    video_title,
  81             'ext':      video_extension,
  82         }]