]> Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/common.py
Imported Upstream version 2013.11.11
[youtubedl] / youtube_dl / extractor / common.py
1 import base64
2 import os
3 import re
4 import socket
5 import sys
6 import netrc
7
8 from ..utils import (
9 compat_http_client,
10 compat_urllib_error,
11 compat_urllib_request,
12 compat_str,
13
14 clean_html,
15 compiled_regex_type,
16 ExtractorError,
17 RegexNotFoundError,
18 sanitize_filename,
19 unescapeHTML,
20 )
21
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Instead of url and ext, formats can also be specified.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    view_count:     How many users have watched the video on the platform.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen
    age_limit:      Age restriction for the video, as an integer (years)
    formats:        A list of dictionaries for each format available, it must
                    be ordered from worst to best quality. Potential fields:
                    * url         Mandatory. The URL of the video file
                    * ext         Will be calculated from url if missing
                    * format      A human-readable description of the format
                                  ("mp4 container with h264/opus").
                                  Calculated from the format_id, width, height
                                  and format_note fields if missing.
                    * format_id   A short description of the format
                                  ("mp4_h264_opus" or "19")
                    * format_note Additional info about the format
                                  ("3D" or "DASH video")
                    * width       Width of the video, if known
                    * height      Height of the video, if known
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        # By convention extractor class names end in "IE"; strip that suffix.
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        # Same "strip the IE suffix" convention as ie_key, but per-instance.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            # note=False suppresses any screen output
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        # Charset detection: prefer the Content-Type header, then a
        # <meta ... charset=...> declaration in the first KiB, then UTF-8.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            else:
                encoding = 'utf-8'
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            # base64 so binary/undecodable responses survive the dump
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            raw_filename = ('%s_%s.dump' % (video_id, url))
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen(u'Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    # Methods for following #608
    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            # Fix: initialize mobj so an empty pattern list falls through to
            # the not-found handling below instead of raising UnboundLocalError.
            mobj = None
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Colorize the field name with ANSI escapes when writing to a terminal
        # (not on Windows, whose console does not interpret them).
        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
            return default
        elif fatal:
            raise RegexNotFoundError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
            return None

    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regex(prop):
        return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
        # PEP 8 idiom: "is not None" instead of "not ... is None"
        if escaped is not None:
            return unescapeHTML(escaped)
        return None

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = [self._og_regex('video')]
        if secure:
            # Prefer the HTTPS variant when available
            regexes.insert(0, self._og_regex('video:secure_url'))
        return self._html_search_regex(regexes, html, name, **kargs)

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        # (?x) verbose mode makes the literal whitespace in the pattern inert.
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r' content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0
350
351
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # "<key>:<query>" (one result), "<key>N:<query>", "<key>all:<query>"
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        # Search IEs match the generated search-URL pattern rather than a
        # _VALID_URL attribute, so override the base class check.
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        match = re.match(self._make_valid_url(), query)
        if match is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = match.group('prefix')
        terms = match.group('query')

        # Bare "<key>:" means a single result; "all" means the IE maximum.
        if prefix == '':
            return self._get_n_results(terms, 1)
        if prefix == 'all':
            return self._get_n_results(terms, self._MAX_RESULTS)

        count = int(prefix)
        if count <= 0:
            raise ExtractorError(u'invalid download number %s for query "%s"' % (count, terms))
        if count > self._MAX_RESULTS:
            # Clamp oversized requests to the IE limit, but tell the user.
            self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, count))
            count = self._MAX_RESULTS
        return self._get_n_results(terms, count)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY