Imported Upstream version 2016.08.17

[youtubedl] / youtube_dl / extractor / common.py
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index 5a2603b509244810e5a19d8adb2b124d3a5c2d78..9427ff4499243d5236e5349c7cc98c11b9413fb9 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -44,6 +44,7 @@ from ..utils import (
      sanitized_Request,
      unescapeHTML,
      unified_strdate,
      sanitized_Request,
      unescapeHTML,
      unified_strdate,
+    unified_timestamp,
      url_basename,
      xpath_element,
      xpath_text,
      url_basename,
      xpath_element,
      xpath_text,
@@ -54,6 +55,8 @@ from ..utils import (
      update_Request,
      update_url_query,
      parse_m3u8_attributes,
      update_Request,
      update_url_query,
      parse_m3u8_attributes,
+    extract_attributes,
+    parse_codecs,
  )
  
  
  )
  
  
@@ -161,6 +164,7 @@ class InfoExtractor(object):
                          * "height" (optional, int)
                          * "resolution" (optional, string "{width}x{height"},
                                          deprecated)
                          * "height" (optional, int)
                          * "resolution" (optional, string "{width}x{height"},
                                          deprecated)
+                        * "filesize" (optional, int)
      thumbnail:      Full URL to a video thumbnail image.
      description:    Full video description.
      uploader:       Full name of the video uploader.
      thumbnail:      Full URL to a video thumbnail image.
      description:    Full video description.
      uploader:       Full name of the video uploader.
@@ -658,6 +662,24 @@ class InfoExtractor(object):
          else:
              return res
  
          else:
              return res
  
+    def _get_netrc_login_info(self, netrc_machine=None):
+        username = None
+        password = None
+        netrc_machine = netrc_machine or self._NETRC_MACHINE
+
+        if self._downloader.params.get('usenetrc', False):
+            try:
+                info = netrc.netrc().authenticators(netrc_machine)
+                if info is not None:
+                    username = info[0]
+                    password = info[2]
+                else:
+                    raise netrc.NetrcParseError('No authenticators for %s' % netrc_machine)
+            except (IOError, netrc.NetrcParseError) as err:
+                self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
+
+        return (username, password)
+
      def _get_login_info(self):
          """
          Get the login info as (username, password)
      def _get_login_info(self):
          """
          Get the login info as (username, password)
@@ -675,16 +697,8 @@ class InfoExtractor(object):
          if downloader_params.get('username') is not None:
              username = downloader_params['username']
              password = downloader_params['password']
          if downloader_params.get('username') is not None:
              username = downloader_params['username']
              password = downloader_params['password']
-        elif downloader_params.get('usenetrc', False):
-            try:
-                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
-                if info is not None:
-                    username = info[0]
-                    password = info[2]
-                else:
-                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
-            except (IOError, netrc.NetrcParseError) as err:
-                self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
+        else:
+            username, password = self._get_netrc_login_info()
  
          return (username, password)
  
  
          return (username, password)
  
@@ -723,9 +737,14 @@ class InfoExtractor(object):
                      [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
  
      def _og_search_property(self, prop, html, name=None, **kargs):
                      [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
  
      def _og_search_property(self, prop, html, name=None, **kargs):
+        if not isinstance(prop, (list, tuple)):
+            prop = [prop]
          if name is None:
          if name is None:
-            name = 'OpenGraph %s' % prop
-        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
+            name = 'OpenGraph %s' % prop[0]
+        og_regexes = []
+        for p in prop:
+            og_regexes.extend(self._og_regexes(p))
+        escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
          if escaped is None:
              return None
          return unescapeHTML(escaped)
          if escaped is None:
              return None
          return unescapeHTML(escaped)
@@ -749,10 +768,12 @@ class InfoExtractor(object):
          return self._og_search_property('url', html, **kargs)
  
      def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
          return self._og_search_property('url', html, **kargs)
  
      def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
+        if not isinstance(name, (list, tuple)):
+            name = [name]
          if display_name is None:
          if display_name is None:
-            display_name = name
+            display_name = name[0]
          return self._html_search_regex(
          return self._html_search_regex(
-            self._meta_regex(name),
+            [self._meta_regex(n) for n in name],
              html, display_name, fatal=fatal, group='content', **kwargs)
  
      def _dc_search_uploader(self, html):
              html, display_name, fatal=fatal, group='content', **kwargs)
  
      def _dc_search_uploader(self, html):
@@ -801,40 +822,66 @@ class InfoExtractor(object):
          return self._html_search_meta('twitter:player', html,
                                        'twitter card player')
  
          return self._html_search_meta('twitter:player', html,
                                        'twitter card player')
  
-    def _search_json_ld(self, html, video_id, **kwargs):
+    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
          json_ld = self._search_regex(
              r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
              html, 'JSON-LD', group='json_ld', **kwargs)
          json_ld = self._search_regex(
              r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
              html, 'JSON-LD', group='json_ld', **kwargs)
+        default = kwargs.get('default', NO_DEFAULT)
          if not json_ld:
          if not json_ld:
-            return {}
-        return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
-
-    def _json_ld(self, json_ld, video_id, fatal=True):
+            return default if default is not NO_DEFAULT else {}
+        # JSON-LD may be malformed and thus `fatal` should be respected.
+        # At the same time `default` may be passed that assumes `fatal=False`
+        # for _search_regex. Let's simulate the same behavior here as well.
+        fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
+        return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
+
+    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
          if isinstance(json_ld, compat_str):
              json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
          if not json_ld:
              return {}
          info = {}
          if isinstance(json_ld, compat_str):
              json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
          if not json_ld:
              return {}
          info = {}
-        if json_ld.get('@context') == 'http://schema.org':
-            item_type = json_ld.get('@type')
-            if item_type == 'TVEpisode':
-                info.update({
-                    'episode': unescapeHTML(json_ld.get('name')),
-                    'episode_number': int_or_none(json_ld.get('episodeNumber')),
-                    'description': unescapeHTML(json_ld.get('description')),
-                })
-                part_of_season = json_ld.get('partOfSeason')
-                if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
-                    info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
-                part_of_series = json_ld.get('partOfSeries')
-                if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
-                    info['series'] = unescapeHTML(part_of_series.get('name'))
-            elif item_type == 'Article':
-                info.update({
-                    'timestamp': parse_iso8601(json_ld.get('datePublished')),
-                    'title': unescapeHTML(json_ld.get('headline')),
-                    'description': unescapeHTML(json_ld.get('articleBody')),
-                })
+        if not isinstance(json_ld, (list, tuple, dict)):
+            return info
+        if isinstance(json_ld, dict):
+            json_ld = [json_ld]
+        for e in json_ld:
+            if e.get('@context') == 'http://schema.org':
+                item_type = e.get('@type')
+                if expected_type is not None and expected_type != item_type:
+                    return info
+                if item_type == 'TVEpisode':
+                    info.update({
+                        'episode': unescapeHTML(e.get('name')),
+                        'episode_number': int_or_none(e.get('episodeNumber')),
+                        'description': unescapeHTML(e.get('description')),
+                    })
+                    part_of_season = e.get('partOfSeason')
+                    if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
+                        info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
+                    part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
+                    if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
+                        info['series'] = unescapeHTML(part_of_series.get('name'))
+                elif item_type == 'Article':
+                    info.update({
+                        'timestamp': parse_iso8601(e.get('datePublished')),
+                        'title': unescapeHTML(e.get('headline')),
+                        'description': unescapeHTML(e.get('articleBody')),
+                    })
+                elif item_type == 'VideoObject':
+                    info.update({
+                        'url': e.get('contentUrl'),
+                        'title': unescapeHTML(e.get('name')),
+                        'description': unescapeHTML(e.get('description')),
+                        'thumbnail': e.get('thumbnailUrl'),
+                        'duration': parse_duration(e.get('duration')),
+                        'timestamp': unified_timestamp(e.get('uploadDate')),
+                        'filesize': float_or_none(e.get('contentSize')),
+                        'tbr': int_or_none(e.get('bitrate')),
+                        'width': int_or_none(e.get('width')),
+                        'height': int_or_none(e.get('height')),
+                    })
+                break
          return dict((k, v) for k, v in info.items() if v is not None)
  
      @staticmethod
          return dict((k, v) for k, v in info.items() if v is not None)
  
      @staticmethod
@@ -876,7 +923,11 @@ class InfoExtractor(object):
                  f['ext'] = determine_ext(f['url'])
  
              if isinstance(field_preference, (list, tuple)):
                  f['ext'] = determine_ext(f['url'])
  
              if isinstance(field_preference, (list, tuple)):
-                return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
+                return tuple(
+                    f.get(field)
+                    if f.get(field) is not None
+                    else ('' if field == 'format_id' else -1)
+                    for field in field_preference)
  
              preference = f.get('preference')
              if preference is None:
  
              preference = f.get('preference')
              if preference is None:
@@ -884,7 +935,8 @@ class InfoExtractor(object):
                  if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                      preference -= 0.5
  
                  if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                      preference -= 0.5
  
-            proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
+            protocol = f.get('protocol') or determine_protocol(f)
+            proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)
  
              if f.get('vcodec') == 'none':  # audio only
                  preference -= 50
  
              if f.get('vcodec') == 'none':  # audio only
                  preference -= 50
@@ -1101,7 +1153,7 @@ class InfoExtractor(object):
              'url': m3u8_url,
              'ext': ext,
              'protocol': 'm3u8',
              'url': m3u8_url,
              'ext': ext,
              'protocol': 'm3u8',
-            'preference': preference - 1 if preference else -1,
+            'preference': preference - 100 if preference else -100,
              'resolution': 'multiple',
              'format_note': 'Quality selection URL',
          }
              'resolution': 'multiple',
              'format_note': 'Quality selection URL',
          }
@@ -1180,6 +1232,7 @@ class InfoExtractor(object):
                      'url': format_url(line.strip()),
                      'tbr': tbr,
                      'ext': ext,
                      'url': format_url(line.strip()),
                      'tbr': tbr,
                      'ext': ext,
+                    'fps': float_or_none(last_info.get('FRAME-RATE')),
                      'protocol': entry_protocol,
                      'preference': preference,
                  }
                      'protocol': entry_protocol,
                      'preference': preference,
                  }
@@ -1188,24 +1241,17 @@ class InfoExtractor(object):
                      width_str, height_str = resolution.split('x')
                      f['width'] = int(width_str)
                      f['height'] = int(height_str)
                      width_str, height_str = resolution.split('x')
                      f['width'] = int(width_str)
                      f['height'] = int(height_str)
-                codecs = last_info.get('CODECS')
-                if codecs:
-                    vcodec, acodec = [None] * 2
-                    va_codecs = codecs.split(',')
-                    if len(va_codecs) == 1:
-                        # Audio only entries usually come with single codec and
-                        # no resolution. For more robustness we also check it to
-                        # be mp4 audio.
-                        if not resolution and va_codecs[0].startswith('mp4a'):
-                            vcodec, acodec = 'none', va_codecs[0]
-                        else:
-                            vcodec = va_codecs[0]
-                    else:
-                        vcodec, acodec = va_codecs[:2]
+                # Unified Streaming Platform
+                mobj = re.search(
+                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
+                if mobj:
+                    abr, vbr = mobj.groups()
+                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                      f.update({
                      f.update({
-                        'acodec': acodec,
-                        'vcodec': vcodec,
+                        'vbr': vbr,
+                        'abr': abr,
                      })
                      })
+                f.update(parse_codecs(last_info.get('CODECS')))
                  if last_media is not None:
                      f['m3u8_media'] = last_media
                      last_media = None
                  if last_media is not None:
                      f['m3u8_media'] = last_media
                      last_media = None
@@ -1460,6 +1506,13 @@ class InfoExtractor(object):
              compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
  
      def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
              compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
  
      def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
+        """
+        Parse formats from MPD manifest.
+        References:
+         1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
+            http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
+         2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
+        """
          if mpd_doc.get('type') == 'dynamic':
              return []
  
          if mpd_doc.get('type') == 'dynamic':
              return []
  
@@ -1492,8 +1545,16 @@ class InfoExtractor(object):
                          s_e = segment_timeline.findall(_add_ns('S'))
                          if s_e:
                              ms_info['total_number'] = 0
                          s_e = segment_timeline.findall(_add_ns('S'))
                          if s_e:
                              ms_info['total_number'] = 0
+                            ms_info['s'] = []
                              for s in s_e:
                              for s in s_e:
-                                ms_info['total_number'] += 1 + int(s.get('r', '0'))
+                                r = int(s.get('r', 0))
+                                ms_info['total_number'] += 1 + r
+                                ms_info['s'].append({
+                                    't': int(s.get('t', 0)),
+                                    # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
+                                    'd': int(s.attrib['d']),
+                                    'r': r,
+                                })
                      else:
                          timescale = segment_template.get('timescale')
                          if timescale:
                      else:
                          timescale = segment_template.get('timescale')
                          if timescale:
@@ -1530,7 +1591,7 @@ class InfoExtractor(object):
                          continue
                      representation_attrib = adaptation_set.attrib.copy()
                      representation_attrib.update(representation.attrib)
                          continue
                      representation_attrib = adaptation_set.attrib.copy()
                      representation_attrib.update(representation.attrib)
-                    # According to page 41 of ISO/IEC 29001-1:2014, @mimeType is mandatory
+                    # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
                      mime_type = representation_attrib['mimeType']
                      content_type = mime_type.split('/')[0]
                      if content_type == 'text':
                      mime_type = representation_attrib['mimeType']
                      content_type = mime_type.split('/')[0]
                      if content_type == 'text':
@@ -1574,16 +1635,40 @@ class InfoExtractor(object):
                                  representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                              media_template = representation_ms_info['media_template']
                              media_template = media_template.replace('$RepresentationID$', representation_id)
                                  representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                              media_template = representation_ms_info['media_template']
                              media_template = media_template.replace('$RepresentationID$', representation_id)
-                            media_template = re.sub(r'\$(Number|Bandwidth)\$', r'%(\1)d', media_template)
-                            media_template = re.sub(r'\$(Number|Bandwidth)%([^$]+)\$', r'%(\1)\2', media_template)
+                            media_template = re.sub(r'\$(Number|Bandwidth|Time)\$', r'%(\1)d', media_template)
+                            media_template = re.sub(r'\$(Number|Bandwidth|Time)%([^$]+)\$', r'%(\1)\2', media_template)
                              media_template.replace('$$', '$')
                              media_template.replace('$$', '$')
-                            representation_ms_info['segment_urls'] = [
-                                media_template % {
-                                    'Number': segment_number,
-                                    'Bandwidth': representation_attrib.get('bandwidth')}
-                                for segment_number in range(
-                                    representation_ms_info['start_number'],
-                                    representation_ms_info['total_number'] + representation_ms_info['start_number'])]
+
+                            # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
+                            # can't be used at the same time
+                            if '%(Number' in media_template:
+                                representation_ms_info['segment_urls'] = [
+                                    media_template % {
+                                        'Number': segment_number,
+                                        'Bandwidth': representation_attrib.get('bandwidth'),
+                                    }
+                                    for segment_number in range(
+                                        representation_ms_info['start_number'],
+                                        representation_ms_info['total_number'] + representation_ms_info['start_number'])]
+                            else:
+                                representation_ms_info['segment_urls'] = []
+                                segment_time = 0
+
+                                def add_segment_url():
+                                    representation_ms_info['segment_urls'].append(
+                                        media_template % {
+                                            'Time': segment_time,
+                                            'Bandwidth': representation_attrib.get('bandwidth'),
+                                        }
+                                    )
+
+                                for num, s in enumerate(representation_ms_info['s']):
+                                    segment_time = s.get('t') or segment_time
+                                    add_segment_url()
+                                    for r in range(s.get('r', 0)):
+                                        segment_time += s['d']
+                                        add_segment_url()
+                                    segment_time += s['d']
                          if 'segment_urls' in representation_ms_info:
                              f.update({
                                  'segment_urls': representation_ms_info['segment_urls'],
                          if 'segment_urls' in representation_ms_info:
                              f.update({
                                  'segment_urls': representation_ms_info['segment_urls'],
@@ -1610,6 +1695,62 @@ class InfoExtractor(object):
                          self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
          return formats
  
                          self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
          return formats
  
+    def _parse_html5_media_entries(self, base_url, webpage):
+        def absolute_url(video_url):
+            return compat_urlparse.urljoin(base_url, video_url)
+
+        def parse_content_type(content_type):
+            if not content_type:
+                return {}
+            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
+            if ctr:
+                mimetype, codecs = ctr.groups()
+                f = parse_codecs(codecs)
+                f['ext'] = mimetype2ext(mimetype)
+                return f
+            return {}
+
+        entries = []
+        for media_tag, media_type, media_content in re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage):
+            media_info = {
+                'formats': [],
+                'subtitles': {},
+            }
+            media_attributes = extract_attributes(media_tag)
+            src = media_attributes.get('src')
+            if src:
+                media_info['formats'].append({
+                    'url': absolute_url(src),
+                    'vcodec': 'none' if media_type == 'audio' else None,
+                })
+            media_info['thumbnail'] = media_attributes.get('poster')
+            if media_content:
+                for source_tag in re.findall(r'<source[^>]+>', media_content):
+                    source_attributes = extract_attributes(source_tag)
+                    src = source_attributes.get('src')
+                    if not src:
+                        continue
+                    f = parse_content_type(source_attributes.get('type'))
+                    f.update({
+                        'url': absolute_url(src),
+                        'vcodec': 'none' if media_type == 'audio' else None,
+                    })
+                    media_info['formats'].append(f)
+                for track_tag in re.findall(r'<track[^>]+>', media_content):
+                    track_attributes = extract_attributes(track_tag)
+                    kind = track_attributes.get('kind')
+                    if not kind or kind == 'subtitles':
+                        src = track_attributes.get('src')
+                        if not src:
+                            continue
+                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
+                        media_info['subtitles'].setdefault(lang, []).append({
+                            'url': absolute_url(src),
+                        })
+            if media_info['formats']:
+                entries.append(media_info)
+        return entries
+
      def _live_title(self, name):
          """ Generate the title for a live video """
          now = datetime.datetime.now()
      def _live_title(self, name):
          """ Generate the title for a live video """
          now = datetime.datetime.now()
@@ -1670,7 +1811,7 @@ class InfoExtractor(object):
  
          any_restricted = False
          for tc in self.get_testcases(include_onlymatching=False):
  
          any_restricted = False
          for tc in self.get_testcases(include_onlymatching=False):
-            if 'playlist' in tc:
+            if tc.get('playlist', []):
                  tc = tc['playlist'][0]
              is_restricted = age_restricted(
                  tc.get('info_dict', {}).get('age_limit'), age_limit)
                  tc = tc['playlist'][0]
              is_restricted = age_restricted(
                  tc.get('info_dict', {}).get('age_limit'), age_limit)
@@ -1723,6 +1864,13 @@ class InfoExtractor(object):
      def _mark_watched(self, *args, **kwargs):
          raise NotImplementedError('This method must be implemented by subclasses')
  
      def _mark_watched(self, *args, **kwargs):
          raise NotImplementedError('This method must be implemented by subclasses')
  
+    def geo_verification_headers(self):
+        headers = {}
+        geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
+        if geo_verification_proxy:
+            headers['Ytdl-request-proxy'] = geo_verification_proxy
+        return headers
+
  
  class SearchInfoExtractor(InfoExtractor):
      """
  
  class SearchInfoExtractor(InfoExtractor):
      """