1 from __future__
import unicode_literals
13 import xml
.etree
.ElementTree
15 from ..compat
import (
19 compat_urllib_parse_urlparse
,
# Unique sentinel meaning "caller supplied no default" (used by _search_regex
# and friends), so that None remains a legitimate default value.
_NO_DEFAULT = object()
36 class InfoExtractor(object):
37 """Information Extractor class.
39 Information extractors are the classes that, given a URL, extract
40 information about the video (or videos) the URL refers to. This
41 information includes the real video URL, the video title, author and
42 others. The information is stored in a dictionary which is then
43 passed to the FileDownloader. The FileDownloader processes this
44 information possibly downloading the video to the file system, among
45 other possible outcomes.
The type field determines the type of the result.
48 By far the most common value (and the default if _type is missing) is
49 "video", which indicates a single video.
51 For a video, the dictionaries must include the following fields:
54 title: Video title, unescaped.
56 Additionally, it must contain either a formats entry or a url one:
58 formats: A list of dictionaries for each format available, ordered
59 from worst to best quality.
62 * url Mandatory. The URL of the video file
63 * ext Will be calculated from url if missing
64 * format A human-readable description of the format
65 ("mp4 container with h264/opus").
66 Calculated from the format_id, width, height.
67 and format_note fields if missing.
68 * format_id A short description of the format
69 ("mp4_h264_opus" or "19").
70 Technically optional, but strongly recommended.
71 * format_note Additional info about the format
72 ("3D" or "DASH video")
73 * width Width of the video, if known
74 * height Height of the video, if known
75 * resolution Textual description of width and height
76 * tbr Average bitrate of audio and video in KBit/s
77 * abr Average audio bitrate in KBit/s
78 * acodec Name of the audio codec in use
79 * asr Audio sampling rate in Hertz
80 * vbr Average video bitrate in KBit/s
82 * vcodec Name of the video codec in use
83 * container Name of the container format
84 * filesize The number of bytes, if known in advance
85 * filesize_approx An estimate for the number of bytes
86 * player_url SWF Player URL (used for rtmpdump).
87 * protocol The protocol that will be used for the actual
89 "http", "https", "rtsp", "rtmp", "m3u8" or so.
90 * preference Order number of this format. If this field is
91 present and not None, the formats get sorted
92 by this field, regardless of all other values.
93 -1 for default (order by other properties),
94 -2 or smaller for less than default.
95 * language_preference Is this in the correct requested
97 10 if it's what the URL is about,
98 -1 for default (don't know),
99 -10 otherwise, other values reserved for now.
100 * quality Order number of the video quality of this
101 format, irrespective of the file format.
102 -1 for default (order by other properties),
103 -2 or smaller for less than default.
104 * source_preference Order number for this video source
105 (quality takes higher priority)
106 -1 for default (order by other properties),
107 -2 or smaller for less than default.
108 * http_referer HTTP Referer header value to set.
109 * http_method HTTP method to use for the download.
110 * http_headers A dictionary of additional HTTP headers
111 to add to the request.
112 * http_post_data Additional data to send with a POST
114 url: Final video URL.
115 ext: Video filename extension.
116 format: The video format, defaults to ext (used for --get-format)
117 player_url: SWF Player URL (used for rtmpdump).
119 The following fields are optional:
121 display_id An alternative identifier for the video, not necessarily
122 unique, but available before title. Typically, id is
123 something like "4234987", title "Dancing naked mole rats",
124 and display_id "dancing-naked-mole-rats"
125 thumbnails: A list of dictionaries, with the following entries:
127 * "width" (optional, int)
128 * "height" (optional, int)
129 * "resolution" (optional, string "{width}x{height"},
131 thumbnail: Full URL to a video thumbnail image.
132 description: One-line video description.
133 uploader: Full name of the video uploader.
134 timestamp: UNIX timestamp of the moment the video became available.
135 upload_date: Video upload date (YYYYMMDD).
136 If not explicitly set, calculated from timestamp.
137 uploader_id: Nickname or id of the video uploader.
138 location: Physical location where the video was filmed.
139 subtitles: The subtitle file contents as a dictionary in the format
140 {language: subtitles}.
141 duration: Length of the video in seconds, as an integer.
142 view_count: How many users have watched the video on the platform.
143 like_count: Number of positive ratings of the video
144 dislike_count: Number of negative ratings of the video
145 comment_count: Number of comments on the video
146 age_limit: Age restriction for the video, as an integer (years)
147 webpage_url: The url to the video webpage, if given to youtube-dl it
148 should allow to get the same result again. (It will be set
149 by YoutubeDL if it's missing)
150 categories: A list of categories that the video falls in, for example
152 is_live: True, False, or None (=unknown). Whether this video is a
153 live stream that goes on instead of a fixed-length video.
155 Unless mentioned otherwise, the fields should be Unicode strings.
157 Unless mentioned otherwise, None is equivalent to absence of information.
160 _type "playlist" indicates multiple videos.
161 There must be a key "entries", which is a list or a PagedList object, each
element of which is a valid dictionary under this specification.
164 Additionally, playlists can have "title" and "id" attributes with the same
165 semantics as videos (see above).
168 _type "multi_video" indicates that there are multiple videos that
form a single show, for example multiple acts of an opera or TV episode.
170 It must have an entries key like a playlist and contain all the keys
171 required for a video at the same time.
174 _type "url" indicates that the video must be extracted from another
175 location, possibly by a different extractor. Its only required key is:
176 "url" - the next URL to extract.
178 Additionally, it may have properties believed to be identical to the
179 resolved entity, for example "title" if the title of the referred video is
183 _type "url_transparent" entities have the same specification as "url", but
184 indicate that the given additional information is more precise than the one
185 associated with the resolved URL.
186 This is useful when a site employs a video service that hosts the video and
187 its technical metadata, but that video service does not embed a useful
188 title, description etc.
191 Subclasses of this one should re-define the _real_initialize() and
192 _real_extract() methods and define a _VALID_URL regexp.
193 Probably, they should also be added to the list of extractors.
195 Finally, the _WORKING attribute should be set to False for broken IEs
196 in order to warn the users and skip the tests.
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader."""
    # NOTE(review): reconstructed lazy-init flag consumed by initialize() —
    # confirm against upstream; the line was dropped in this copy.
    self._ready = False
    self.set_downloader(downloader)
@classmethod
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    # This does not use has/getattr intentionally - we want to know whether
    # we have cached the regexp for *this* class, whereas getattr would also
    # match the superclass
    if '_VALID_URL_RE' not in cls.__dict__:
        cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    return cls._VALID_URL_RE.match(url) is not None
@classmethod
def _match_id(cls, url):
    """Match *url* against _VALID_URL and return its 'id' group."""
    # Same per-class regex cache as suitable(); cls.__dict__ avoids hitting
    # a superclass's cached pattern.
    if '_VALID_URL_RE' not in cls.__dict__:
        cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    m = cls._VALID_URL_RE.match(url)
    assert m
    return m.group('id')
229 """Getter method for _WORKING."""
def initialize(self):
    """Initializes an instance (authentication, etc)."""
    # Run the real initialization at most once per instance.
    if not self._ready:
        self._real_initialize()
        self._ready = True
def extract(self, url):
    """Extracts URL information and returns it in list of dicts."""
    # Ensure one-time setup (login etc.) happened before extraction.
    self.initialize()
    return self._real_extract(url)
243 def set_downloader(self
, downloader
):
244 """Sets the downloader for this IE."""
245 self
._downloader
= downloader
247 def _real_initialize(self
):
248 """Real initialization process. Redefine in subclasses."""
251 def _real_extract(self
, url
):
252 """Real extraction process. Redefine in subclasses."""
257 """A string for getting the InfoExtractor with get_info_extractor"""
258 return cls
.__name
__[:-2]
@property
def IE_NAME(self):
    # Human-readable extractor name: class name minus the trailing "IE".
    return type(self).__name__[:-2]
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """ Returns the response handle """
    if note is None:
        self.report_download_webpage(video_id)
    elif note is not False:
        if video_id is None:
            self.to_screen('%s' % (note,))
        else:
            self.to_screen('%s: %s' % (video_id, note))
    try:
        return self._downloader.urlopen(url_or_request)
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        if errnote is None:
            errnote = 'Unable to download webpage'
        errmsg = '%s: %s' % (errnote, compat_str(err))
        if fatal:
            # Keep the original traceback so the network failure is visible.
            raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
        else:
            self._downloader.report_warning(errmsg)
            return False
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """ Returns a tuple (page content as string, URL handle) """
    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, (compat_str, str)):
        url_or_request = url_or_request.partition('#')[0]

    urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
    if urlh is False:
        # _request_webpage only returns False when fatal is off.
        assert not fatal
        return False
    content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal)
    return (content, urlh)
def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None):
    """Read and decode the body of *urlh*, honouring debug dump options."""
    content_type = urlh.headers.get('Content-Type', '')
    webpage_bytes = urlh.read()
    if prefix is not None:
        webpage_bytes = prefix + webpage_bytes
    # Charset from the Content-Type header takes priority ...
    m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
    if m:
        encoding = m.group(1)
    else:
        # ... then a <meta charset=...> in the first KiB ...
        m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                      webpage_bytes[:1024])
        if m:
            encoding = m.group(1).decode('ascii')
        elif webpage_bytes.startswith(b'\xff\xfe'):
            # ... then a UTF-16 little-endian BOM ...
            encoding = 'utf-16'
        else:
            encoding = 'utf-8'
    if self._downloader.params.get('dump_intermediate_pages', False):
        try:
            url = url_or_request.get_full_url()
        except AttributeError:
            url = url_or_request
        self.to_screen('Dumping request to ' + url)
        dump = base64.b64encode(webpage_bytes).decode('ascii')
        self._downloader.to_screen(dump)
    if self._downloader.params.get('write_pages', False):
        try:
            url = url_or_request.get_full_url()
        except AttributeError:
            url = url_or_request
        basen = '%s_%s' % (video_id, url)
        if len(basen) > 240:
            # Keep file names short: truncate and append an md5 marker.
            h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
            basen = basen[:240 - len(h)] + h
        raw_filename = basen + '.dump'
        filename = sanitize_filename(raw_filename, restricted=True)
        self.to_screen('Saving request to ' + filename)
        # Working around MAX_PATH limitation on Windows (see
        # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
        if os.name == 'nt':
            absfilepath = os.path.abspath(filename)
            if len(absfilepath) > 259:
                filename = '\\\\?\\' + absfilepath
        with open(filename, 'wb') as outf:
            outf.write(webpage_bytes)

    try:
        content = webpage_bytes.decode(encoding, 'replace')
    except LookupError:
        # Unknown codec name from the page: fall back to UTF-8.
        content = webpage_bytes.decode('utf-8', 'replace')

    if ('<title>Access to this site is blocked</title>' in content and
            'Websense' in content[:512]):
        msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
        blocked_iframe = self._html_search_regex(
            r'<iframe src="([^"]+)"', content,
            'Websense information URL', default=None)
        if blocked_iframe:
            msg += ' Visit %s for more details.' % blocked_iframe
        raise ExtractorError(msg, expected=True)

    return content
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """ Returns the data of the page as a string """
    res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
    if res is False:
        # Non-fatal download failure: propagate the False marker.
        return res
    else:
        content, _ = res
        return content
def _download_xml(self, url_or_request, video_id,
                  note='Downloading XML', errnote='Unable to download XML',
                  transform_source=None, fatal=True):
    """Return the xml as an xml.etree.ElementTree.Element"""
    xml_string = self._download_webpage(
        url_or_request, video_id, note, errnote, fatal=fatal)
    if xml_string is False:
        return xml_string
    if transform_source:
        xml_string = transform_source(xml_string)
    return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
def _download_json(self, url_or_request, video_id,
                   note='Downloading JSON metadata',
                   errnote='Unable to download JSON metadata',
                   transform_source=None,
                   fatal=True):
    """Download a page and parse it as JSON; warn or raise on parse failure."""
    json_string = self._download_webpage(
        url_or_request, video_id, note, errnote, fatal=fatal)
    if (not fatal) and json_string is False:
        return None
    if transform_source:
        json_string = transform_source(json_string)
    try:
        return json.loads(json_string)
    except ValueError as ve:
        errmsg = '%s: Failed to parse JSON ' % video_id
        if fatal:
            raise ExtractorError(errmsg, cause=ve)
        else:
            self.report_warning(errmsg + str(ve))
def report_warning(self, msg, video_id=None):
    """Forward a warning to the downloader, tagged with IE name and video id."""
    if video_id is None:
        idstr = ''
    else:
        idstr = '%s: ' % video_id
    self._downloader.report_warning(
        '[%s] %s%s' % (self.IE_NAME, idstr, msg))
def to_screen(self, msg):
    """Print msg to screen, prefixing it with '[ie_name]'"""
    prefixed = '[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(prefixed)
def report_extraction(self, id_or_name):
    """Report that information extraction has started."""
    message = '%s: Extracting information' % id_or_name
    self.to_screen(message)
def report_download_webpage(self, video_id):
    """Report that the webpage download has started."""
    message = '%s: Downloading webpage' % video_id
    self.to_screen(message)
def report_age_confirmation(self):
    """Report an attempt to confirm age."""
    self.to_screen('Confirming age')
def report_login(self):
    """Report an attempt to log in."""
    self.to_screen('Logging in')
429 # Methods for following #608
431 def url_result(url, ie=None, video_id=None):
432 """Returns a url that points to a page that should be processed"""
433 # TODO: ie should be the class used for getting the info
434 video_info = {'_type
': 'url
',
437 if video_id is not None:
438 video_info['id'] = video_id
442 def playlist_result(entries, playlist_id=None, playlist_title=None):
443 """Returns a playlist"""
444 video_info = {'_type
': 'playlist
',
447 video_info['id'] = playlist_id
449 video_info['title
'] = playlist_title
def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.
    """
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        mobj = re.search(pattern, string, flags)
    else:
        # A list of patterns: stop at the first one that matches.
        for p in pattern:
            mobj = re.search(p, string, flags)
            if mobj:
                break

    if os.name != 'nt' and sys.stderr.isatty():
        # Colorize the field name on capable terminals.
        _name = '\033[0;34m%s\033[0m' % name
    else:
        _name = name

    if mobj:
        if group is None:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        else:
            return mobj.group(group)
    elif default is not _NO_DEFAULT:
        return default
    elif fatal:
        raise RegexNotFoundError('Unable to extract %s' % _name)
    else:
        self._downloader.report_warning('unable to extract %s; '
                                        'please report this issue on http://yt-dl.org/bug' % _name)
        return None
def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Like _search_regex, but strips HTML tags and unescapes entities.
    """
    res = self._search_regex(pattern, string, name, default, fatal, flags, group)
    if res:
        return clean_html(res).strip()
    else:
        # Pass through None/'' defaults unchanged.
        return res
def _get_login_info(self):
    """
    Get the login info as (username, password)
    It will look in the netrc file using the _NETRC_MACHINE value
    If there's no info available, return (None, None)
    """
    if self._downloader is None:
        return (None, None)

    username = None
    password = None
    downloader_params = self._downloader.params

    # Attempt to use provided username and password or .netrc data
    if downloader_params.get('username', None) is not None:
        username = downloader_params['username']
        password = downloader_params['password']
    elif downloader_params.get('usenetrc', False):
        try:
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                username = info[0]
                password = info[2]
            else:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
        except (IOError, netrc.NetrcParseError) as err:
            # Best-effort: a broken/missing .netrc only warns.
            self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))

    return (username, password)
def _get_tfa_info(self):
    """
    Get the two-factor authentication info
    TODO - asking the user will be required for sms/phone verify
    currently just uses the command line option
    If there's no info available, return None
    """
    if self._downloader is None:
        return None
    downloader_params = self._downloader.params

    if downloader_params.get('twofactor', None) is not None:
        return downloader_params['twofactor']

    return None
543 # Helper functions for extracting OpenGraph info
545 def _og_regexes(prop):
546 content_re = r'content
=(?
:"([^>]+?)"|
\'([^
>]+?
)\')'
547 property_re = r'(?
:name|
property)=[\'"]og:%s[\'"]' % re.escape(prop)
548 template = r'<meta
[^
>]+?
%s[^
>]+?
%s'
550 template % (property_re, content_re),
551 template % (content_re, property_re),
def _og_search_property(self, prop, html, name=None, **kargs):
    """Search *html* for the og:<prop> value and return it HTML-unescaped."""
    if name is None:
        name = 'OpenGraph %s' % prop
    escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
    if escaped is None:
        return None
    return unescapeHTML(escaped)
def _og_search_thumbnail(self, html, **kargs):
    """Look up the og:image property (non-fatal)."""
    return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
def _og_search_description(self, html, **kargs):
    """Look up the og:description property (non-fatal)."""
    return self._og_search_property('description', html, fatal=False, **kargs)
def _og_search_title(self, html, **kargs):
    """Look up the og:title property."""
    return self._og_search_property('title', html, **kargs)
def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
    """Look up og:video / og:video:url (preferring og:video:secure_url
    when *secure* is set)."""
    regexes = self._og_regexes('video') + self._og_regexes('video:url')
    if secure:
        regexes = self._og_regexes('video:secure_url') + regexes
    return self._html_search_regex(regexes, html, name, **kargs)
def _og_search_url(self, html, **kargs):
    """Look up the og:url property."""
    return self._og_search_property('url', html, **kargs)
def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
    """Extract the content attribute of a <meta> tag whose
    itemprop/name/property equals *name*."""
    if display_name is None:
        display_name = name
    return self._html_search_regex(
        # Lookahead ties the name/property match to the same tag as content.
        r'''(?isx)<meta
                (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
                [^>]+content=(["\'])(?P<content>.*?)\1''' % re.escape(name),
        html, display_name, fatal=fatal, group='content', **kwargs)
def _dc_search_uploader(self, html):
    """Extract the uploader from a Dublin Core dc.creator meta tag."""
    return self._html_search_meta('dc.creator', html, 'uploader')
592 def _rta_search(self, html):
593 # See http://www.rtalabel.org/index.php?content=howtofaq#single
594 if re.search(r'(?ix
)<meta\s
+name
="rating"\s
+'
595 r' content
="RTA-5042-1996-1400-1577-RTA"',
def _media_rating_search(self, html):
    """Map a textual <meta name="rating"> value to an age limit, or None."""
    # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
    rating = self._html_search_meta('rating', html)

    if not rating:
        return None

    # NOTE(review): the table rows were dropped from this copy; values
    # reconstructed from upstream youtube-dl — verify.
    RATING_TABLE = {
        'safe for kids': 0,
        'general': 8,
        '14 years': 14,
        'mature': 17,
        'restricted': 19,
    }
    return RATING_TABLE.get(rating.lower(), None)
def _twitter_search_player(self, html):
    """Return the Twitter-card player URL from a twitter:player meta tag."""
    return self._html_search_meta(
        'twitter:player', html, 'twitter card player')
def _sort_formats(self, formats):
    """Sort *formats* in place from worst to best quality."""
    if not formats:
        raise ExtractorError('No video formats found')

    def _formats_key(f):
        # TODO remove the following workaround
        from ..utils import determine_ext
        if not f.get('ext') and 'url' in f:
            f['ext'] = determine_ext(f['url'])

        preference = f.get('preference')
        if preference is None:
            proto = f.get('protocol')
            if proto is None:
                proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

            # Plain HTTP(S) downloads are preferred over streaming protocols.
            preference = 0 if proto in ['http', 'https'] else -0.1
            if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                preference -= 0.5

        if f.get('vcodec') == 'none':  # audio only
            if self._downloader.params.get('prefer_free_formats'):
                ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
            else:
                ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
            ext_preference = 0
            try:
                audio_ext_preference = ORDER.index(f['ext'])
            except ValueError:
                audio_ext_preference = -1
        else:
            if self._downloader.params.get('prefer_free_formats'):
                ORDER = ['flv', 'mp4', 'webm']
            else:
                ORDER = ['webm', 'flv', 'mp4']
            try:
                ext_preference = ORDER.index(f['ext'])
            except ValueError:
                ext_preference = -1
            audio_ext_preference = 0

        # Missing numeric fields sort as -1 (worse than any known value).
        return (
            preference,
            f.get('language_preference') if f.get('language_preference') is not None else -1,
            f.get('quality') if f.get('quality') is not None else -1,
            f.get('height') if f.get('height') is not None else -1,
            f.get('width') if f.get('width') is not None else -1,
            ext_preference,
            f.get('tbr') if f.get('tbr') is not None else -1,
            f.get('vbr') if f.get('vbr') is not None else -1,
            f.get('abr') if f.get('abr') is not None else -1,
            audio_ext_preference,
            f.get('fps') if f.get('fps') is not None else -1,
            f.get('filesize') if f.get('filesize') is not None else -1,
            f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
            f.get('source_preference') if f.get('source_preference') is not None else -1,
            f.get('format_id'),
        )
    formats.sort(key=_formats_key)
def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    return (
        'http:'
        if self._downloader.params.get('prefer_insecure', False)
        else 'https:')
def _proto_relative_url(self, url, scheme=None):
    """Prefix a protocol-relative URL (//...) with *scheme* (default: the
    user's preferred scheme); other URLs pass through unchanged."""
    if url.startswith('//'):
        if scheme is None:
            scheme = self.http_scheme()
        return scheme + url
    else:
        return url
def _sleep(self, timeout, video_id, msg_template=None):
    """Announce and then perform a *timeout*-second wait."""
    if msg_template is None:
        msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
    msg = msg_template % {'video_id': video_id, 'timeout': timeout}
    self.to_screen(msg)
    time.sleep(timeout)
def _extract_f4m_formats(self, manifest_url, video_id):
    """Download an Adobe HDS (f4m) manifest and build its format list."""
    manifest = self._download_xml(
        manifest_url, video_id, 'Downloading f4m manifest',
        'Unable to download f4m manifest')

    formats = []
    media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
    for i, media_el in enumerate(media_nodes):
        tbr = int_or_none(media_el.attrib.get('bitrate'))
        # Prefer the bitrate as format id; fall back to the node index.
        format_id = 'f4m-%d' % (i if tbr is None else tbr)
        formats.append({
            'format_id': format_id,
            'url': manifest_url,
            'ext': 'flv',
            'tbr': tbr,
            'width': int_or_none(media_el.attrib.get('width')),
            'height': int_or_none(media_el.attrib.get('height')),
        })
    self._sort_formats(formats)

    return formats
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                          entry_protocol='m3u8', preference=None):
    """Download an HLS (m3u8) playlist and build its format list."""
    # A meta entry pointing at the variant playlist itself.
    formats = [{
        'format_id': 'm3u8-meta',
        'url': m3u8_url,
        'ext': ext,
        'protocol': 'm3u8',
        'preference': -1,
        'resolution': 'multiple',
        'format_note': 'Quality selection URL',
    }]

    format_url = lambda u: (
        u
        if re.match(r'^https?://', u)
        else compat_urlparse.urljoin(m3u8_url, u))

    m3u8_doc = self._download_webpage(
        m3u8_url, video_id,
        note='Downloading m3u8 information',
        errnote='Failed to download m3u8 information')
    last_info = None
    kv_rex = re.compile(
        r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
    for line in m3u8_doc.splitlines():
        if line.startswith('#EXT-X-STREAM-INF:'):
            last_info = {}
            for m in kv_rex.finditer(line):
                v = m.group('val')
                if v.startswith('"'):
                    v = v[1:-1]
                last_info[m.group('key')] = v
        elif line.startswith('#') or not line.strip():
            # Comments and blank lines are skipped.
            continue
        else:
            if last_info is None:
                formats.append({'url': format_url(line)})
                continue
            tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)

            f = {
                'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
                'url': format_url(line.strip()),
                'tbr': tbr,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }
            codecs = last_info.get('CODECS')
            if codecs:
                # TODO: looks like video codec is not always necessarily goes first
                va_codecs = codecs.split(',')
                if va_codecs[0]:
                    f['vcodec'] = va_codecs[0].partition('.')[0]
                if len(va_codecs) > 1 and va_codecs[1]:
                    f['acodec'] = va_codecs[1].partition('.')[0]
            resolution = last_info.get('RESOLUTION')
            if resolution:
                width_str, height_str = resolution.split('x')
                f['width'] = int(width_str)
                f['height'] = int(height_str)
            formats.append(f)
    self._sort_formats(formats)

    return formats
793 def _live_title(self, name):
794 """ Generate the title
for a live video
"""
795 now = datetime.datetime.now()
796 now_str = now.strftime("%Y-%m-%d %H:%M")
797 return name + ' ' + now_str
def _int(self, v, name, fatal=False, **kwargs):
    """Coerce *v* to int via int_or_none; on failure warn, or raise if
    *fatal*. Extra kwargs are forwarded to int_or_none."""
    res = int_or_none(v, **kwargs)
    # Removed leftover debug print of getattr(v, kwargs['get_attr']) that
    # leaked to stdout on every 'get_attr' call.
    if res is None:
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
        if fatal:
            raise ExtractorError(msg)
        else:
            self._downloader.report_warning(msg)
    return res
def _float(self, v, name, fatal=False, **kwargs):
    """Coerce *v* to float via float_or_none; on failure warn, or raise if
    *fatal*. Extra kwargs are forwarded to float_or_none."""
    res = float_or_none(v, **kwargs)
    if res is None:
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
        if fatal:
            raise ExtractorError(msg)
        else:
            self._downloader.report_warning(msg)
    return res
def _set_cookie(self, domain, name, value, expire_time=None):
    """Store a name/value cookie for *domain* in the downloader's cookiejar."""
    # NOTE(review): positional args follow cookielib.Cookie's signature —
    # verify against the stdlib docs before reordering.
    cookie = compat_cookiejar.Cookie(
        0, name, value, None, None, domain, None,
        None, '/', True, False, expire_time, '', None, None, None)
    self._downloader.cookiejar.set_cookie(cookie)
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            # Bare "<key>:query" means: first result only.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY