Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/common.py
Merge tag 'upstream/2013.06.26'
[youtubedl] / youtube_dl / extractor / common.py
1 import base64
2 import os
3 import re
4 import socket
5 import sys
6
7 from ..utils import (
8 compat_http_client,
9 compat_urllib_error,
10 compat_urllib_request,
11 compat_str,
12
13 clean_html,
14 compiled_regex_type,
15 ExtractorError,
16 )
17
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # True once _real_initialize() has run for this instance.
    _ready = False
    # Object used for screen output, warnings and option lookup (params).
    _downloader = None
    # Set to False in subclasses that are known to be broken.
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc).

        _real_initialize() is only run the first time this is called.
        """
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        """Class name with its last two characters removed."""
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Open url_or_request and return the response handle.

        note selects what is printed before the request: None prints the
        default "Downloading webpage" message, False prints nothing and
        any other value is printed after the video id.
        errnote replaces the default error prefix.

        Raises ExtractorError if the request fails with a URL, HTTP or
        socket error.
        """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # Hand over the original traceback together with the message.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """Return a tuple (page content as string, URL handle).

        The body is decoded with the charset announced in the Content-Type
        header. UTF-8 is used when no charset is announced, or when the
        announced one is not a codec known to Python. Bytes that cannot be
        decoded are replaced instead of raising.

        When the downloader's 'dump_intermediate_pages' param is set, the
        raw body is additionally printed base64-encoded.
        """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # A plain URL string was passed instead of a request object.
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # The header named a charset Python has no codec for (bogus
            # name, quoted value, trailing parameters, ...); do not let
            # that abort the extraction.
            content = webpage_bytes.decode('utf-8', 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Return the data of the page as a string."""
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    # Methods for following #608
    # They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Mark video_info as a video (in place) and return it."""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Return a playlist dict wrapping entries.

        'id' and 'title' are only included when a truthy playlist_id /
        playlist_title is given.
        """
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        If the matching pattern has no group that took part in the match,
        the whole matched text is returned instead.
        In case of failure return a default value or raise a WARNING or a
        ExtractorError, depending on fatal, specifying the field name.
        """
        # Stays None when an empty list of patterns is passed.
        mobj = None
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Colour the field name only on terminals that understand ANSI codes.
        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group, or the whole match when no
            # group participated (avoids a stray StopIteration)
            return next((g for g in mobj.groups() if g is not None), mobj.group(0))
        elif default is not None:
            return default
        elif fatal:
            raise ExtractorError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on GitHub.' % _name)
            return None

    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res
226
class SearchInfoExtractor(InfoExtractor):
    """
    Common base for extractors that serve paged search queries.

    Queries look like _SEARCH_KEY<prefix>:<query>, where <prefix> is
    empty, a positive integer or the word "all".
    Subclasses provide _SEARCH_KEY and _MAX_RESULTS and implement
    _get_n_results().
    """

    @classmethod
    def _make_valid_url(cls):
        """Build the regex that recognises this extractor's queries."""
        template = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return template % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query for this extractor."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search query and fetch the requested number of results.

        An empty prefix asks for one result, "all" for _MAX_RESULTS and a
        number for that many results, capped at _MAX_RESULTS with a warning.
        Raises ExtractorError when the query does not have the expected form.
        """
        match = re.match(self._make_valid_url(), query)
        if match is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        count_spec, terms = match.group('prefix', 'query')

        # The two non-numeric prefixes are handled first.
        if count_spec == '':
            return self._get_n_results(terms, 1)
        if count_spec == 'all':
            return self._get_n_results(terms, self._MAX_RESULTS)

        wanted = int(count_spec)
        if wanted <= 0:
            raise ExtractorError(u'invalid download number %s for query "%s"' % (wanted, terms))
        if wanted > self._MAX_RESULTS:
            self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, wanted))
            wanted = self._MAX_RESULTS
        return self._get_n_results(terms, wanted)

    def _get_n_results(self, query, n):
        """Fetch n results for query; concrete extractors must override this."""
        raise NotImplementedError("This method must be implemented by sublclasses")