Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/common.py
Merge tag 'upstream/2013.08.17'
[youtubedl] / youtube_dl / extractor / common.py
1 import base64
2 import os
3 import re
4 import socket
5 import sys
6 import netrc
7
8 from ..utils import (
9 compat_http_client,
10 compat_urllib_error,
11 compat_urllib_request,
12 compat_str,
13
14 clean_html,
15 compiled_regex_type,
16 ExtractorError,
17 unescapeHTML,
18 )
19
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    view_count:     How many users have watched the video on the platform.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc).

        _real_initialize() is run only on the first call; later calls are
        no-ops because the instance is then marked as ready.
        """
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        """The class name with its last two characters removed."""
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Open url_or_request and return the response handle.

        note:    message printed before the request; None prints the
                 standard "Downloading webpage" line, False prints nothing.
        errnote: prefix of the error message; defaults to
                 "Unable to download webpage".

        Raises ExtractorError when the request fails with a URL, HTTP or
        socket error.
        """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # Keep the original traceback attached to the ExtractorError
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    @staticmethod
    def _decode_webpage(webpage_bytes, content_type):
        """Decode webpage_bytes using the charset named in content_type.

        The charset value may be quoted and may be followed by further
        parameters. When no charset is given, or the named codec is not
        known to Python, UTF-8 is used. Undecodable bytes are replaced
        rather than raising.
        """
        m = re.match(
            r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=\s*["\']?([^\s;"\']+)',
            content_type)
        encoding = m.group(1) if m else 'utf-8'
        try:
            return webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # The server announced a charset Python has no codec for
            return webpage_bytes.decode('utf-8', 'replace')

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            # url_or_request is either a request object or a plain URL string
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        content = self._decode_webpage(webpage_bytes, content_type)
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    #Methods for following #608
    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist

        The 'id' and 'title' keys are only set when a truthy playlist_id /
        playlist_title is given.
        """
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        ExtractorError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            # An empty list of patterns simply means "no match"
            mobj = None
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Highlight the field name on terminals (not on Windows)
        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
            return default
        elif fatal:
            raise ExtractorError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
            return None

    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # A missing or broken .netrc is only worth a warning
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regex(prop):
        """Return a regexp matching the content of the og:<prop> meta tag."""
        return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop)

    def _og_search_property(self, prop, html, name=None, **kargs):
        """Return the unescaped content of the og:<prop> meta tag in html.

        name defaults to 'OpenGraph <prop>'; extra keyword arguments are
        passed on to _search_regex. Returns None when the search itself
        returned None.
        """
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            # Nothing found: there is nothing to unescape
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        """Non-fatal search for the og:image property."""
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        """Non-fatal search for the og:description property."""
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        """Search for the og:title property."""
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', **kargs):
        """Search for og:video:secure_url first, then for og:video."""
        return self._html_search_regex([self._og_regex('video:secure_url'),
                                        self._og_regex('video')],
                                       html, name, **kargs)
290
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for extractors of paged search queries.
    The accepted pseudo-URLs are built by _make_valid_url(): the search key,
    an optional result count (a positive number or "all"), a colon and the
    query itself.
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regexp for this extractor's search pseudo-URLs."""
        template = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return template % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Return True if url is a search pseudo-URL for this extractor."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search pseudo-URL and fetch the requested results.

        An empty count asks for a single result, "all" asks for
        _MAX_RESULTS, and a number above _MAX_RESULTS is lowered to
        _MAX_RESULTS after a warning.
        """
        match = re.match(self._make_valid_url(), query)
        if match is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        count_spec = match.group('prefix')
        terms = match.group('query')

        if count_spec == '':
            return self._get_n_results(terms, 1)
        if count_spec == 'all':
            return self._get_n_results(terms, self._MAX_RESULTS)

        wanted = int(count_spec)
        if wanted <= 0:
            raise ExtractorError(u'invalid download number %s for query "%s"' % (wanted, terms))
        if wanted > self._MAX_RESULTS:
            self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, wanted))
            wanted = self._MAX_RESULTS
        return self._get_n_results(terms, wanted)

    def _get_n_results(self, query, n):
        """Fetch n results for query; must be provided by subclasses."""
        raise NotImplementedError("This method must be implemented by sublclasses")

    @property
    def SEARCH_KEY(self):
        """Read-only access to _SEARCH_KEY."""
        return self._SEARCH_KEY