]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/common.py
11 compat_urllib_request
,
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    view_count:     How many users have watched the video on the platform.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False        # becomes True once _real_initialize() has run
    _downloader = None    # set via set_downloader()
    _WORKING = True       # set to False in subclasses to mark the IE as broken

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derived from the class name by stripping the trailing "IE" suffix.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            # note=False suppresses screen output entirely
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # Re-raise with the original traceback so the real failure site is kept
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Pick the charset from the Content-Type header, falling back to UTF-8
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # url_or_request is a plain string, not a Request object
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # 'replace' keeps extraction going even on malformed byte sequences
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    #Methods for following #608
    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info
    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        ExtractorError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            # pattern is a list: stop at the first pattern that matches
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        if sys.stderr.isatty() and os.name != 'nt':
            # Highlight the field name in blue on capable terminals
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
            return default
        elif fatal:
            raise ExtractorError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
            return None

    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # Best-effort: a broken .netrc only produces a warning
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regex(prop):
        # Matches content="..." or content='...' of the og:<prop> meta tag
        return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', **kargs):
        # Prefer the HTTPS variant when both og:video:secure_url and og:video exist
        return self._html_search_regex([self._og_regex('video:secure_url'),
                                        self._og_regex('video')],
                                       html, name, **kargs)
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # prefix is empty (default count), a positive integer, or 'all'
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the search query prefix and dispatch to _get_n_results."""
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            # No prefix: return only the first result
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp the request to the extractor's maximum, with a warning
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY