]>
Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/common.py
10 compat_urllib_request
,
18 class InfoExtractor(object):
19 """Information Extractor class.
21 Information extractors are the classes that, given a URL, extract
22 information about the video (or videos) the URL refers to. This
23 information includes the real video URL, the video title, author and
24 others. The information is stored in a dictionary which is then
25 passed to the FileDownloader. The FileDownloader processes this
26 information possibly downloading the video to the file system, among
27 other possible outcomes.
29 The dictionaries must include the following fields:
33 title: Video title, unescaped.
34 ext: Video filename extension.
36 The following fields are optional:
38 format: The video format, defaults to ext (used for --get-format)
39 thumbnail: Full URL to a video thumbnail image.
40 description: One-line video description.
41 uploader: Full name of the video uploader.
42 upload_date: Video upload date (YYYYMMDD).
43 uploader_id: Nickname or id of the video uploader.
44 location: Physical location of the video.
45 player_url: SWF Player URL (used for rtmpdump).
46 subtitles: The subtitle file contents.
47 urlhandle: [internal] The urlHandle to be used to download the file,
48 like returned by urllib.request.urlopen
50 The fields should all be Unicode strings.
52 Subclasses of this one should re-define the _real_initialize() and
53 _real_extract() methods and define a _VALID_URL regexp.
54 Probably, they should also be added to the list of extractors.
56 _real_extract() must return a *list* of information dictionaries as
59 Finally, the _WORKING attribute should be set to False for broken IEs
60 in order to warn the users and skip the tests.
67 def __init__(self
, downloader
=None):
68 """Constructor. Receives an optional downloader."""
70 self
.set_downloader(downloader
)
def suitable(cls, url):
    """Tell whether this IE can handle the given URL.

    The URL is matched, anchored at its start, against ``cls._VALID_URL``.
    Returns True on a match and False otherwise.
    """
    matched = re.match(cls._VALID_URL, url)
    if matched is None:
        return False
    return True
79 """Getter method for _WORKING."""
83 """Initializes an instance (authentication, etc)."""
85 self
._real
_initialize
()
88 def extract(self
, url
):
89 """Extracts URL information and returns it in list of dicts."""
91 return self
._real
_extract
(url
)
def set_downloader(self, downloader):
    """Attach *downloader* to this IE.

    The object is stored on the instance as ``_downloader``, replacing
    whatever was stored there before.
    """
    self._downloader = downloader
97 def _real_initialize(self
):
98 """Real initialization process. Redefine in subclasses."""
101 def _real_extract(self
, url
):
102 """Real extraction process. Redefine in subclasses."""
107 return type(self
).__name
__[:-2]
109 def _request_webpage(self
, url_or_request
, video_id
, note
=None, errnote
=None):
110 """ Returns the response handle """
112 self
.report_download_webpage(video_id
)
113 elif note
is not False:
114 self
.to_screen(u
'%s: %s' % (video_id
, note
))
116 return compat_urllib_request
.urlopen(url_or_request
)
117 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
119 errnote
= u
'Unable to download webpage'
120 raise ExtractorError(u
'%s: %s' % (errnote
, compat_str(err
)), sys
.exc_info()[2])
122 def _download_webpage_handle(self
, url_or_request
, video_id
, note
=None, errnote
=None):
123 """ Returns a tuple (page content as string, URL handle) """
124 urlh
= self
._request
_webpage
(url_or_request
, video_id
, note
, errnote
)
125 content_type
= urlh
.headers
.get('Content-Type', '')
126 m
= re
.match(r
'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type
)
128 encoding
= m
.group(1)
131 webpage_bytes
= urlh
.read()
132 if self
._downloader
.params
.get('dump_intermediate_pages', False):
134 url
= url_or_request
.get_full_url()
135 except AttributeError:
137 self
.to_screen(u
'Dumping request to ' + url
)
138 dump
= base64
.b64encode(webpage_bytes
).decode('ascii')
139 self
._downloader
.to_screen(dump
)
140 content
= webpage_bytes
.decode(encoding
, 'replace')
141 return (content
, urlh
)
143 def _download_webpage(self
, url_or_request
, video_id
, note
=None, errnote
=None):
144 """ Returns the data of the page as a string """
145 return self
._download
_webpage
_handle
(url_or_request
, video_id
, note
, errnote
)[0]
def to_screen(self, msg):
    """Show *msg* through the downloader, prefixed with '[ie_name]'."""
    emit = self._downloader.to_screen
    emit(u'[%s] %s' % (self.IE_NAME, msg))
def report_extraction(self, id_or_name):
    """Announce on screen that information is being extracted."""
    message = u'%s: Extracting information' % id_or_name
    self.to_screen(message)
def report_download_webpage(self, video_id):
    """Announce on screen that the webpage is being downloaded."""
    message = u'%s: Downloading webpage' % video_id
    self.to_screen(message)
def report_age_confirmation(self):
    """Announce on screen that age confirmation is being attempted."""
    message = u'Confirming age'
    self.to_screen(message)
163 #Methods for following #608
164 #They set the correct value of the '_type' key
def video_result(self, video_info):
    """Mark *video_info* as a single-video result and return it.

    The dictionary is tagged in place by setting its '_type' key to
    'video'. The same object is then returned, so the method does what
    its name and summary promise and can be used directly in a
    ``return`` statement.
    """
    video_info['_type'] = 'video'
    return video_info
169 def url_result(self
, url
, ie
=None):
170 """Returns a url that points to a page that should be processed"""
171 #TODO: ie should be the class used for getting the info
172 video_info
= {'_type': 'url',
176 def playlist_result(self
, entries
, playlist_id
=None, playlist_title
=None):
177 """Returns a playlist"""
178 video_info
= {'_type': 'playlist',
181 video_info
['id'] = playlist_id
183 video_info
['title'] = playlist_title
186 def _search_regex(self
, pattern
, string
, name
, default
=None, fatal
=True, flags
=0):
188 Perform a regex search on the given string, using a single or a list of
189 patterns returning the first matching group.
190 In case of failure return a default value or raise a WARNING or a
191 ExtractorError, depending on fatal, specifying the field name.
193 if isinstance(pattern
, (str, compat_str
, compiled_regex_type
)):
194 mobj
= re
.search(pattern
, string
, flags
)
197 mobj
= re
.search(p
, string
, flags
)
200 if sys
.stderr
.isatty() and os
.name
!= 'nt':
201 _name
= u
'\033[0;34m%s\033[0m' % name
206 # return the first matching group
207 return next(g
for g
in mobj
.groups() if g
is not None)
208 elif default
is not None:
211 raise ExtractorError(u
'Unable to extract %s' % _name
)
213 self
._downloader
.report_warning(u
'unable to extract %s; '
214 u
'please report this issue on GitHub.' % _name
)
217 def _html_search_regex(self
, pattern
, string
, name
, default
=None, fatal
=True, flags
=0):
219 Like _search_regex, but strips HTML tags and unescapes entities.
221 res
= self
._search
_regex
(pattern
, string
, name
, default
, fatal
, flags
)
223 return clean_html(res
).strip()
227 class SearchInfoExtractor(InfoExtractor
):
229 Base class for paged search queries extractors.
230 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
231 Instances should define _SEARCH_KEY and _MAX_RESULTS.
235 def _make_valid_url(cls
):
236 return r
'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls
._SEARCH
_KEY
def suitable(cls, url):
    """Tell whether *url* is a search query this class handles.

    The string is matched, anchored at its start, against the pattern
    produced by ``cls._make_valid_url()``.
    """
    pattern = cls._make_valid_url()
    return re.match(pattern, url) is not None
242 def _real_extract(self
, query
):
243 mobj
= re
.match(self
._make
_valid
_url
(), query
)
245 raise ExtractorError(u
'Invalid search query "%s"' % query
)
247 prefix
= mobj
.group('prefix')
248 query
= mobj
.group('query')
250 return self
._get
_n
_results
(query
, 1)
251 elif prefix
== 'all':
252 return self
._get
_n
_results
(query
, self
._MAX
_RESULTS
)
256 raise ExtractorError(u
'invalid download number %s for query "%s"' % (n
, query
))
257 elif n
> self
._MAX
_RESULTS
:
258 self
._downloader
.report_warning(u
'%s returns max %i results (you requested %i)' % (self
._SEARCH
_KEY
, self
._MAX
_RESULTS
, n
))
259 n
= self
._MAX
_RESULTS
260 return self
._get
_n
_results
(query
, n
)
262 def _get_n_results(self
, query
, n
):
263 """Get a specified number of results for a query"""
264 raise NotImplementedError("This method must be implemented by sublclasses")