- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
- video_id = mobj.group('ID')
- url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
- webpage = self._download_webpage(url, video_id)
- BEFORE = '{swf.addParam(param[0], param[1]);});\n'
- AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
- m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
- if not m:
- raise ExtractorError(u'Cannot parse data')
- data = dict(json.loads(m.group(1)))
- params_raw = compat_urllib_parse.unquote(data['params'])
- params = json.loads(params_raw)
- video_data = params['video_data'][0]
- video_url = video_data.get('hd_src')
- if not video_url:
- video_url = video_data['sd_src']
- if not video_url:
- raise ExtractorError(u'Cannot find video URL')
- video_duration = int(video_data['video_duration'])
- thumbnail = video_data['thumbnail_src']
- video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
- webpage, u'title')
- info = {
+ def _real_initialize(self):
+ self._login()
+ def _extract_from_url(self, url, video_id, fatal_if_no_video=True):
+ req = sanitized_Request(url)
+ req.add_header('User-Agent', self._CHROME_USER_AGENT)
+ webpage = self._download_webpage(req, video_id)
+ video_data = None
def extract_video_data(instances):
    """Return the ``videoData`` payload of the ``VideoConfig`` entry for ``video_id``.

    ``instances`` is scraped from page JavaScript, so its shape is not
    trusted: entries that are too short, not indexable, or whose payload is
    not a dict are skipped instead of raising, so that the caller can fall
    through to its next extraction strategy.

    ``video_id`` is read from the enclosing scope.

    Returns the first non-empty ``videoData`` value of a matching entry, or
    ``None`` when no entry matches (or ``instances`` is not a list/tuple).
    """
    if not isinstance(instances, (list, tuple)):
        return None
    for item in instances:
        try:
            if item[1][0] != 'VideoConfig':
                continue
            video_item = item[2][0]
        except (IndexError, KeyError, TypeError):
            # Malformed entry: ignore it rather than abort the whole lookup.
            continue
        if not isinstance(video_item, dict):
            continue
        if video_item.get('video_id') != video_id:
            continue
        video_data = video_item.get('videoData')
        if video_data:
            return video_data
    return None
+ server_js_data = self._parse_json(self._search_regex(
+ r'handleServerJS\(({.+})(?:\);|,")', webpage,
+ 'server js data', default='{}'), video_id, fatal=False)
+ if server_js_data:
+ video_data = extract_video_data(server_js_data.get('instances', []))
+ if not video_data:
+ server_js_data = self._parse_json(
+ self._search_regex(
+ r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+stream_pagelet',
+ webpage, 'js data', default='{}'),
+ video_id, transform_source=js_to_json, fatal=False)
+ if server_js_data:
+ video_data = extract_video_data(try_get(
+ server_js_data, lambda x: x['jsmods']['instances'],
+ list) or [])
+ if not video_data:
+ if not fatal_if_no_video:
+ return webpage, False
+ m_msg = re.search(r'class="[^"]*uiInterstitialContent[^"]*"><div>(.*?)</div>', webpage)
+ if m_msg is not None:
+ raise ExtractorError(
+ 'The video is not available, Facebook said: "%s"' % m_msg.group(1),
+ expected=True)
+ elif '>You must log in to continue' in webpage:
+ self.raise_login_required()
+ else:
+ raise ExtractorError('Cannot parse data')
+ formats = []
+ for f in video_data:
+ format_id = f['stream_type']
+ if f and isinstance(f, dict):
+ f = [f]
+ if not f or not isinstance(f, list):
+ continue
+ for quality in ('sd', 'hd'):
+ for src_type in ('src', 'src_no_ratelimit'):
+ src = f[0].get('%s_%s' % (quality, src_type))
+ if src:
+ preference = -10 if format_id == 'progressive' else 0
+ if quality == 'hd':
+ preference += 5
+ formats.append({
+ 'format_id': '%s_%s_%s' % (format_id, quality, src_type),
+ 'url': src,
+ 'preference': preference,
+ })
+ dash_manifest = f[0].get('dash_manifest')
+ if dash_manifest:
+ formats.extend(self._parse_mpd_formats(
+ compat_etree_fromstring(compat_urllib_parse_unquote_plus(dash_manifest))))
+ if not formats:
+ raise ExtractorError('Cannot find video formats')
+ self._sort_formats(formats)
+ video_title = self._html_search_regex(
+ r'<h2\s+[^>]*class="uiHeaderTitle"[^>]*>([^<]*)</h2>', webpage, 'title',
+ default=None)
+ if not video_title:
+ video_title = self._html_search_regex(
+ r'(?s)<span class="fbPhotosPhotoCaption".*?id="fbPhotoPageCaption"><span class="hasCaption">(.*?)</span>',
+ webpage, 'alternative title', default=None)
+ if not video_title:
+ video_title = self._html_search_meta(
+ 'description', webpage, 'title')
+ if video_title:
+ video_title = limit_length(video_title, 80)
+ else:
+ video_title = 'Facebook video #%s' % video_id
+ uploader = clean_html(get_element_by_id(
+ 'fbPhotoPageAuthorName', webpage)) or self._search_regex(
+ r'ownerName\s*:\s*"([^"]+)"', webpage, 'uploader', fatal=False)
+ timestamp = int_or_none(self._search_regex(
+ r'<abbr[^>]+data-utime=["\'](\d+)', webpage,
+ 'timestamp', default=None))
+ info_dict = {