X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/9dc487f48b50767cf540fa36c3de2c386fd74c04..ced7488f6d3a519b2c1b1cbd31048743fb8285bd:/youtube_dl/extractor/common.py?ds=sidebyside diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 5a2603b..9427ff4 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -44,6 +44,7 @@ from ..utils import ( sanitized_Request, unescapeHTML, unified_strdate, + unified_timestamp, url_basename, xpath_element, xpath_text, @@ -54,6 +55,8 @@ from ..utils import ( update_Request, update_url_query, parse_m3u8_attributes, + extract_attributes, + parse_codecs, ) @@ -161,6 +164,7 @@ class InfoExtractor(object): * "height" (optional, int) * "resolution" (optional, string "{width}x{height"}, deprecated) + * "filesize" (optional, int) thumbnail: Full URL to a video thumbnail image. description: Full video description. uploader: Full name of the video uploader. @@ -658,6 +662,24 @@ class InfoExtractor(object): else: return res + def _get_netrc_login_info(self, netrc_machine=None): + username = None + password = None + netrc_machine = netrc_machine or self._NETRC_MACHINE + + if self._downloader.params.get('usenetrc', False): + try: + info = netrc.netrc().authenticators(netrc_machine) + if info is not None: + username = info[0] + password = info[2] + else: + raise netrc.NetrcParseError('No authenticators for %s' % netrc_machine) + except (IOError, netrc.NetrcParseError) as err: + self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err)) + + return (username, password) + def _get_login_info(self): """ Get the login info as (username, password) @@ -675,16 +697,8 @@ class InfoExtractor(object): if downloader_params.get('username') is not None: username = downloader_params['username'] password = downloader_params['password'] - elif downloader_params.get('usenetrc', False): - try: - info = netrc.netrc().authenticators(self._NETRC_MACHINE) - if info is not None: - username = info[0] - password = info[2] - else: - raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) - except (IOError, netrc.NetrcParseError) as err: - self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err)) + else: + username, password = self._get_netrc_login_info() return (username, password) @@ -723,9 +737,14 @@ class InfoExtractor(object): [^>]+?content=(["\'])(?P.*?)\2''' % re.escape(prop) def _og_search_property(self, prop, html, name=None, **kargs): + if not isinstance(prop, (list, tuple)): + prop = [prop] if name is None: - name = 'OpenGraph %s' % prop - escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs) + name = 'OpenGraph %s' % prop[0] + og_regexes = [] + for p in prop: + og_regexes.extend(self._og_regexes(p)) + escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs) if escaped is None: return None return unescapeHTML(escaped) @@ -749,10 +768,12 @@ class InfoExtractor(object): return self._og_search_property('url', html, **kargs) def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs): + if not isinstance(name, (list, tuple)): + name = [name] if display_name is None: - display_name = name + display_name = name[0] return self._html_search_regex( - self._meta_regex(name), + [self._meta_regex(n) for n in name], html, display_name, fatal=fatal, group='content', **kwargs) def _dc_search_uploader(self, html): @@ -801,40 +822,66 @@ class InfoExtractor(object): return self._html_search_meta('twitter:player', html, 'twitter card player') - def _search_json_ld(self, html, video_id, **kwargs): + def _search_json_ld(self, html, video_id, expected_type=None, **kwargs): json_ld = self._search_regex( r'(?s)]+type=(["\'])application/ld\+json\1[^>]*>(?P.+?)', html, 'JSON-LD', group='json_ld', **kwargs) + default = kwargs.get('default', NO_DEFAULT) if not json_ld: - return {} - return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True)) - - def _json_ld(self, json_ld, video_id, fatal=True): + return default if default is not NO_DEFAULT else {} + # JSON-LD may be malformed and thus `fatal` should be respected. + # At the same time `default` may be passed that assumes `fatal=False` + # for _search_regex. Let's simulate the same behavior here as well. + fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False + return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type) + + def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None): if isinstance(json_ld, compat_str): json_ld = self._parse_json(json_ld, video_id, fatal=fatal) if not json_ld: return {} info = {} - if json_ld.get('@context') == 'http://schema.org': - item_type = json_ld.get('@type') - if item_type == 'TVEpisode': - info.update({ - 'episode': unescapeHTML(json_ld.get('name')), - 'episode_number': int_or_none(json_ld.get('episodeNumber')), - 'description': unescapeHTML(json_ld.get('description')), - }) - part_of_season = json_ld.get('partOfSeason') - if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason': - info['season_number'] = int_or_none(part_of_season.get('seasonNumber')) - part_of_series = json_ld.get('partOfSeries') - if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries': - info['series'] = unescapeHTML(part_of_series.get('name')) - elif item_type == 'Article': - info.update({ - 'timestamp': parse_iso8601(json_ld.get('datePublished')), - 'title': unescapeHTML(json_ld.get('headline')), - 'description': unescapeHTML(json_ld.get('articleBody')), - }) + if not isinstance(json_ld, (list, tuple, dict)): + return info + if isinstance(json_ld, dict): + json_ld = [json_ld] + for e in json_ld: + if e.get('@context') == 'http://schema.org': + item_type = e.get('@type') + if expected_type is not None and expected_type != item_type: + return info + if item_type == 'TVEpisode': + info.update({ + 'episode': unescapeHTML(e.get('name')), + 'episode_number': int_or_none(e.get('episodeNumber')), + 'description': unescapeHTML(e.get('description')), + }) + part_of_season = e.get('partOfSeason') + if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason': + info['season_number'] = int_or_none(part_of_season.get('seasonNumber')) + part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries') + if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries': + info['series'] = unescapeHTML(part_of_series.get('name')) + elif item_type == 'Article': + info.update({ + 'timestamp': parse_iso8601(e.get('datePublished')), + 'title': unescapeHTML(e.get('headline')), + 'description': unescapeHTML(e.get('articleBody')), + }) + elif item_type == 'VideoObject': + info.update({ + 'url': e.get('contentUrl'), + 'title': unescapeHTML(e.get('name')), + 'description': unescapeHTML(e.get('description')), + 'thumbnail': e.get('thumbnailUrl'), + 'duration': parse_duration(e.get('duration')), + 'timestamp': unified_timestamp(e.get('uploadDate')), + 'filesize': float_or_none(e.get('contentSize')), + 'tbr': int_or_none(e.get('bitrate')), + 'width': int_or_none(e.get('width')), + 'height': int_or_none(e.get('height')), + }) + break return dict((k, v) for k, v in info.items() if v is not None) @staticmethod @@ -876,7 +923,11 @@ class InfoExtractor(object): f['ext'] = determine_ext(f['url']) if isinstance(field_preference, (list, tuple)): - return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference) + return tuple( + f.get(field) + if f.get(field) is not None + else ('' if field == 'format_id' else -1) + for field in field_preference) preference = f.get('preference') if preference is None: @@ -884,7 +935,8 @@ class InfoExtractor(object): if f.get('ext') in ['f4f', 'f4m']: # Not yet supported preference -= 0.5 - proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1 + protocol = f.get('protocol') or determine_protocol(f) + proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1) if f.get('vcodec') == 'none': # audio only preference -= 50 @@ -1101,7 +1153,7 @@ class InfoExtractor(object): 'url': m3u8_url, 'ext': ext, 'protocol': 'm3u8', - 'preference': preference - 1 if preference else -1, + 'preference': preference - 100 if preference else -100, 'resolution': 'multiple', 'format_note': 'Quality selection URL', } @@ -1180,6 +1232,7 @@ class InfoExtractor(object): 'url': format_url(line.strip()), 'tbr': tbr, 'ext': ext, + 'fps': float_or_none(last_info.get('FRAME-RATE')), 'protocol': entry_protocol, 'preference': preference, } @@ -1188,24 +1241,17 @@ class InfoExtractor(object): width_str, height_str = resolution.split('x') f['width'] = int(width_str) f['height'] = int(height_str) - codecs = last_info.get('CODECS') - if codecs: - vcodec, acodec = [None] * 2 - va_codecs = codecs.split(',') - if len(va_codecs) == 1: - # Audio only entries usually come with single codec and - # no resolution. For more robustness we also check it to - # be mp4 audio. - if not resolution and va_codecs[0].startswith('mp4a'): - vcodec, acodec = 'none', va_codecs[0] - else: - vcodec = va_codecs[0] - else: - vcodec, acodec = va_codecs[:2] + # Unified Streaming Platform + mobj = re.search( + r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url']) + if mobj: + abr, vbr = mobj.groups() + abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000) f.update({ - 'acodec': acodec, - 'vcodec': vcodec, + 'vbr': vbr, + 'abr': abr, }) + f.update(parse_codecs(last_info.get('CODECS'))) if last_media is not None: f['m3u8_media'] = last_media last_media = None @@ -1460,6 +1506,13 @@ class InfoExtractor(object): compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict) def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}): + """ + Parse formats from MPD manifest. + References: + 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E), + http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip + 2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP + """ if mpd_doc.get('type') == 'dynamic': return [] @@ -1492,8 +1545,16 @@ class InfoExtractor(object): s_e = segment_timeline.findall(_add_ns('S')) if s_e: ms_info['total_number'] = 0 + ms_info['s'] = [] for s in s_e: - ms_info['total_number'] += 1 + int(s.get('r', '0')) + r = int(s.get('r', 0)) + ms_info['total_number'] += 1 + r + ms_info['s'].append({ + 't': int(s.get('t', 0)), + # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60]) + 'd': int(s.attrib['d']), + 'r': r, + }) else: timescale = segment_template.get('timescale') if timescale: @@ -1530,7 +1591,7 @@ class InfoExtractor(object): continue representation_attrib = adaptation_set.attrib.copy() representation_attrib.update(representation.attrib) - # According to page 41 of ISO/IEC 29001-1:2014, @mimeType is mandatory + # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory mime_type = representation_attrib['mimeType'] content_type = mime_type.split('/')[0] if content_type == 'text': @@ -1574,16 +1635,40 @@ class InfoExtractor(object): representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) media_template = representation_ms_info['media_template'] media_template = media_template.replace('$RepresentationID$', representation_id) - media_template = re.sub(r'\$(Number|Bandwidth)\$', r'%(\1)d', media_template) - media_template = re.sub(r'\$(Number|Bandwidth)%([^$]+)\$', r'%(\1)\2', media_template) + media_template = re.sub(r'\$(Number|Bandwidth|Time)\$', r'%(\1)d', media_template) + media_template = re.sub(r'\$(Number|Bandwidth|Time)%([^$]+)\$', r'%(\1)\2', media_template) media_template.replace('$$', '$') - representation_ms_info['segment_urls'] = [ - media_template % { - 'Number': segment_number, - 'Bandwidth': representation_attrib.get('bandwidth')} - for segment_number in range( - representation_ms_info['start_number'], - representation_ms_info['total_number'] + representation_ms_info['start_number'])] + + # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$ + # can't be used at the same time + if '%(Number' in media_template: + representation_ms_info['segment_urls'] = [ + media_template % { + 'Number': segment_number, + 'Bandwidth': representation_attrib.get('bandwidth'), + } + for segment_number in range( + representation_ms_info['start_number'], + representation_ms_info['total_number'] + representation_ms_info['start_number'])] + else: + representation_ms_info['segment_urls'] = [] + segment_time = 0 + + def add_segment_url(): + representation_ms_info['segment_urls'].append( + media_template % { + 'Time': segment_time, + 'Bandwidth': representation_attrib.get('bandwidth'), + } + ) + + for num, s in enumerate(representation_ms_info['s']): + segment_time = s.get('t') or segment_time + add_segment_url() + for r in range(s.get('r', 0)): + segment_time += s['d'] + add_segment_url() + segment_time += s['d'] if 'segment_urls' in representation_ms_info: f.update({ 'segment_urls': representation_ms_info['segment_urls'], @@ -1610,6 +1695,62 @@ class InfoExtractor(object): self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) return formats + def _parse_html5_media_entries(self, base_url, webpage): + def absolute_url(video_url): + return compat_urlparse.urljoin(base_url, video_url) + + def parse_content_type(content_type): + if not content_type: + return {} + ctr = re.search(r'(?P[^/]+/[^;]+)(?:;\s*codecs="?(?P[^"]+))?', content_type) + if ctr: + mimetype, codecs = ctr.groups() + f = parse_codecs(codecs) + f['ext'] = mimetype2ext(mimetype) + return f + return {} + + entries = [] + for media_tag, media_type, media_content in re.findall(r'(?s)(<(?Pvideo|audio)[^>]*>)(.*?)', webpage): + media_info = { + 'formats': [], + 'subtitles': {}, + } + media_attributes = extract_attributes(media_tag) + src = media_attributes.get('src') + if src: + media_info['formats'].append({ + 'url': absolute_url(src), + 'vcodec': 'none' if media_type == 'audio' else None, + }) + media_info['thumbnail'] = media_attributes.get('poster') + if media_content: + for source_tag in re.findall(r']+>', media_content): + source_attributes = extract_attributes(source_tag) + src = source_attributes.get('src') + if not src: + continue + f = parse_content_type(source_attributes.get('type')) + f.update({ + 'url': absolute_url(src), + 'vcodec': 'none' if media_type == 'audio' else None, + }) + media_info['formats'].append(f) + for track_tag in re.findall(r']+>', media_content): + track_attributes = extract_attributes(track_tag) + kind = track_attributes.get('kind') + if not kind or kind == 'subtitles': + src = track_attributes.get('src') + if not src: + continue + lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label') + media_info['subtitles'].setdefault(lang, []).append({ + 'url': absolute_url(src), + }) + if media_info['formats']: + entries.append(media_info) + return entries + def _live_title(self, name): """ Generate the title for a live video """ now = datetime.datetime.now() @@ -1670,7 +1811,7 @@ class InfoExtractor(object): any_restricted = False for tc in self.get_testcases(include_onlymatching=False): - if 'playlist' in tc: + if tc.get('playlist', []): tc = tc['playlist'][0] is_restricted = age_restricted( tc.get('info_dict', {}).get('age_limit'), age_limit) @@ -1723,6 +1864,13 @@ class InfoExtractor(object): def _mark_watched(self, *args, **kwargs): raise NotImplementedError('This method must be implemented by subclasses') + def geo_verification_headers(self): + headers = {} + geo_verification_proxy = self._downloader.params.get('geo_verification_proxy') + if geo_verification_proxy: + headers['Ytdl-request-proxy'] = geo_verification_proxy + return headers + class SearchInfoExtractor(InfoExtractor): """