]> Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/common.py
Imported Upstream version 2013.12.04
[youtubedl] / youtube_dl / extractor / common.py
1 import base64
2 import os
3 import re
4 import socket
5 import sys
6 import netrc
7 import xml.etree.ElementTree
8
9 from ..utils import (
10 compat_http_client,
11 compat_urllib_error,
12 compat_str,
13
14 clean_html,
15 compiled_regex_type,
16 ExtractorError,
17 RegexNotFoundError,
18 sanitize_filename,
19 unescapeHTML,
20 )
21
22
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Instead of url and ext, formats can also be specified.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    view_count:     How many users have watched the video on the platform.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen
    age_limit:      Age restriction for the video, as an integer (years)
    formats:        A list of dictionaries for each format available, it must
                    be ordered from worst to best quality. Potential fields:
                    * url         Mandatory. The URL of the video file
                    * ext         Will be calculated from url if missing
                    * format      A human-readable description of the format
                                  ("mp4 container with h264/opus").
                                  Calculated from the format_id, width, height
                                  and format_note fields if missing.
                    * format_id   A short description of the format
                                  ("mp4_h264_opus" or "19")
                    * format_note Additional info about the format
                                  ("3D" or "DASH video")
                    * width       Width of the video, if known
                    * height      Height of the video, if known
                    * abr         Average audio bitrate in KBit/s
                    * acodec      Name of the audio codec in use
                    * vbr         Average video bitrate in KBit/s
                    * vcodec      Name of the video codec in use
                    * filesize    The number of bytes, if known in advance
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Whether _real_initialize() has already run for this instance.
    _ready = False
    # The downloader (FileDownloader/YoutubeDL) this IE reports through.
    _downloader = None
    # Set to False in subclasses that are known to be broken.
    _WORKING = True
100
101 def __init__(self, downloader=None):
102 """Constructor. Receives an optional downloader."""
103 self._ready = False
104 self.set_downloader(downloader)
105
106 @classmethod
107 def suitable(cls, url):
108 """Receives a URL and returns True if suitable for this IE."""
109
110 # This does not use has/getattr intentionally - we want to know whether
111 # we have cached the regexp for *this* class, whereas getattr would also
112 # match the superclass
113 if '_VALID_URL_RE' not in cls.__dict__:
114 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
115 return cls._VALID_URL_RE.match(url) is not None
116
117 @classmethod
118 def working(cls):
119 """Getter method for _WORKING."""
120 return cls._WORKING
121
122 def initialize(self):
123 """Initializes an instance (authentication, etc)."""
124 if not self._ready:
125 self._real_initialize()
126 self._ready = True
127
128 def extract(self, url):
129 """Extracts URL information and returns it in list of dicts."""
130 self.initialize()
131 return self._real_extract(url)
132
133 def set_downloader(self, downloader):
134 """Sets the downloader for this IE."""
135 self._downloader = downloader
136
137 def _real_initialize(self):
138 """Real initialization process. Redefine in subclasses."""
139 pass
140
141 def _real_extract(self, url):
142 """Real extraction process. Redefine in subclasses."""
143 pass
144
145 @classmethod
146 def ie_key(cls):
147 """A string for getting the InfoExtractor with get_info_extractor"""
148 return cls.__name__[:-2]
149
150 @property
151 def IE_NAME(self):
152 return type(self).__name__[:-2]
153
154 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
155 """ Returns the response handle """
156 if note is None:
157 self.report_download_webpage(video_id)
158 elif note is not False:
159 self.to_screen(u'%s: %s' % (video_id, note))
160 try:
161 return self._downloader.urlopen(url_or_request)
162 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
163 if errnote is None:
164 errnote = u'Unable to download webpage'
165 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)
166
    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        # Prefer the charset declared in the Content-Type response header...
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            # ...otherwise sniff a <meta ... charset=...> declaration from the
            # first 1024 raw bytes of the body; fall back to UTF-8.
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            else:
                encoding = 'utf-8'
        if self._downloader.params.get('dump_intermediate_pages', False):
            # url_or_request may be a Request object (get_full_url) or a string.
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            # base64 so binary/odd-encoded responses survive the terminal.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            raw_filename = ('%s_%s.dump' % (video_id, url))
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen(u'Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        # 'replace' so that a bogus declared encoding cannot abort extraction.
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)
208
209 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
210 """ Returns the data of the page as a string """
211 return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
212
213 def _download_xml(self, url_or_request, video_id,
214 note=u'Downloading XML', errnote=u'Unable to download XML'):
215 """Return the xml as an xml.etree.ElementTree.Element"""
216 xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
217 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
218
219 def to_screen(self, msg):
220 """Print msg to screen, prefixing it with '[ie_name]'"""
221 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
222
223 def report_extraction(self, id_or_name):
224 """Report information extraction."""
225 self.to_screen(u'%s: Extracting information' % id_or_name)
226
227 def report_download_webpage(self, video_id):
228 """Report webpage download."""
229 self.to_screen(u'%s: Downloading webpage' % video_id)
230
231 def report_age_confirmation(self):
232 """Report attempt to confirm age."""
233 self.to_screen(u'Confirming age')
234
235 def report_login(self):
236 """Report attempt to log in."""
237 self.to_screen(u'Logging in')
238
239 #Methods for following #608
240 def url_result(self, url, ie=None, video_id=None):
241 """Returns a url that points to a page that should be processed"""
242 #TODO: ie should be the class used for getting the info
243 video_info = {'_type': 'url',
244 'url': url,
245 'ie_key': ie}
246 if video_id is not None:
247 video_info['id'] = video_id
248 return video_info
249 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
250 """Returns a playlist"""
251 video_info = {'_type': 'playlist',
252 'entries': entries}
253 if playlist_id:
254 video_info['id'] = playlist_id
255 if playlist_title:
256 video_info['title'] = playlist_title
257 return video_info
258
259 def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
260 """
261 Perform a regex search on the given string, using a single or a list of
262 patterns returning the first matching group.
263 In case of failure return a default value or raise a WARNING or a
264 RegexNotFoundError, depending on fatal, specifying the field name.
265 """
266 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
267 mobj = re.search(pattern, string, flags)
268 else:
269 for p in pattern:
270 mobj = re.search(p, string, flags)
271 if mobj: break
272
273 if sys.stderr.isatty() and os.name != 'nt':
274 _name = u'\033[0;34m%s\033[0m' % name
275 else:
276 _name = name
277
278 if mobj:
279 # return the first matching group
280 return next(g for g in mobj.groups() if g is not None)
281 elif default is not None:
282 return default
283 elif fatal:
284 raise RegexNotFoundError(u'Unable to extract %s' % _name)
285 else:
286 self._downloader.report_warning(u'unable to extract %s; '
287 u'please report this issue on http://yt-dl.org/bug' % _name)
288 return None
289
290 def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
291 """
292 Like _search_regex, but strips HTML tags and unescapes entities.
293 """
294 res = self._search_regex(pattern, string, name, default, fatal, flags)
295 if res:
296 return clean_html(res).strip()
297 else:
298 return res
299
300 def _get_login_info(self):
301 """
302 Get the the login info as (username, password)
303 It will look in the netrc file using the _NETRC_MACHINE value
304 If there's no info available, return (None, None)
305 """
306 if self._downloader is None:
307 return (None, None)
308
309 username = None
310 password = None
311 downloader_params = self._downloader.params
312
313 # Attempt to use provided username and password or .netrc data
314 if downloader_params.get('username', None) is not None:
315 username = downloader_params['username']
316 password = downloader_params['password']
317 elif downloader_params.get('usenetrc', False):
318 try:
319 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
320 if info is not None:
321 username = info[0]
322 password = info[2]
323 else:
324 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
325 except (IOError, netrc.NetrcParseError) as err:
326 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
327
328 return (username, password)
329
330 # Helper functions for extracting OpenGraph info
331 @staticmethod
332 def _og_regexes(prop):
333 content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
334 property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
335 template = r'<meta[^>]+?%s[^>]+?%s'
336 return [
337 template % (property_re, content_re),
338 template % (content_re, property_re),
339 ]
340
341 def _og_search_property(self, prop, html, name=None, **kargs):
342 if name is None:
343 name = 'OpenGraph %s' % prop
344 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
345 if escaped is None:
346 return None
347 return unescapeHTML(escaped)
348
349 def _og_search_thumbnail(self, html, **kargs):
350 return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
351
352 def _og_search_description(self, html, **kargs):
353 return self._og_search_property('description', html, fatal=False, **kargs)
354
355 def _og_search_title(self, html, **kargs):
356 return self._og_search_property('title', html, **kargs)
357
358 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
359 regexes = self._og_regexes('video')
360 if secure: regexes = self._og_regexes('video:secure_url') + regexes
361 return self._html_search_regex(regexes, html, name, **kargs)
362
363 def _html_search_meta(self, name, html, display_name=None):
364 if display_name is None:
365 display_name = name
366 return self._html_search_regex(
367 r'''(?ix)<meta
368 (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
369 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
370 html, display_name, fatal=False)
371
372 def _dc_search_uploader(self, html):
373 return self._html_search_meta('dc.creator', html, 'uploader')
374
375 def _rta_search(self, html):
376 # See http://www.rtalabel.org/index.php?content=howtofaq#single
377 if re.search(r'(?ix)<meta\s+name="rating"\s+'
378 r' content="RTA-5042-1996-1400-1577-RTA"',
379 html):
380 return 18
381 return 0
382
383 def _media_rating_search(self, html):
384 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
385 rating = self._html_search_meta('rating', html)
386
387 if not rating:
388 return None
389
390 RATING_TABLE = {
391 'safe for kids': 0,
392 'general': 8,
393 '14 years': 14,
394 'mature': 17,
395 'restricted': 19,
396 }
397 return RATING_TABLE.get(rating.lower(), None)
398
399
400
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # prefix is empty (1 result), a positive number, or 'all'.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the search prefix and dispatch to _get_n_results."""
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            # Clamp to the provider's limit with a warning rather than failing.
            self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY