Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/InfoExtractors.py
Prepare for upload.
[youtubedl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import datetime
5 import HTMLParser
6 import httplib
7 import netrc
8 import os
9 import re
10 import socket
11 import time
12 import urllib
13 import urllib2
14 import email.utils
15 import xml.etree.ElementTree
16 from urlparse import parse_qs
17
18 try:
19 import cStringIO as StringIO
20 except ImportError:
21 import StringIO
22
23 from utils import *
24
25
class InfoExtractor(object):
    """Base class for information extractors.

    An information extractor is given a URL and extracts information
    about the video (or videos) that URL refers to: the real video URL,
    the video title and simplified title, the author and others. Each
    video is described by a dictionary, and those dictionaries are passed
    to the FileDownloader, which processes them, possibly downloading the
    video to the file system, among other possible outcomes.

    Every dictionary must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    uploader:       Nickname of the video uploader.
    title:          Literal title.
    ext:            Video filename extension.
    format:         Video format.
    player_url:     SWF Player URL (may be None).

    The following fields are optional. They mainly exist so that
    youtube-dl can serve as the backend for a video search function, such
    as the one in youtube2mp3, and are only used when their respective
    forced printing functions are called:

    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.

    Subclasses should re-define _real_initialize() and _real_extract()
    and define a _VALID_URL regexp. They should probably also be added to
    the list of extractors.
    """

    # Class-level defaults; __init__ sets both again on every instance.
    _ready = False
    _downloader = None

    def __init__(self, downloader=None):
        """Create the extractor, optionally attaching a downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Return True when url matches this extractor's _VALID_URL."""
        match = re.match(self._VALID_URL, url)
        return match is not None

    def initialize(self):
        """Run the one-time set-up (authentication, etc) if not done yet."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Initialize if needed, then return what _real_extract(url) returns."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the downloader this extractor reports to."""
        self._downloader = downloader

    def _real_initialize(self):
        """Actual initialization work. Meant to be redefined in subclasses."""
        pass

    def _real_extract(self, url):
        """Actual extraction work. Meant to be redefined in subclasses."""
        pass
93
94
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|tube.majestyc.net/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # Formats missing from this table fall back to 'flv' where it is looked up.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the given format is not available for the video."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert a timedtext XML document into SRT subtitle text.

        Each <text start=".." dur=".."> element becomes one numbered SRT
        entry. A missing duration defaults to 4 seconds. Returns the
        concatenated entries (an empty string when nothing matches).
        """
        def srt_time(seconds):
            # HH:MM:SS,mmm as used by SRT; %i truncates the float parts
            return "%02i:%02i:%02i,%03i" % (seconds/(60*60), seconds/60%60, seconds%60, seconds%1*1000)

        # TODO parse xml instead of regex
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        entries = []
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur:
                dur = '4'
            start = float(start)
            end = start + float(dur)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            entries.append(str(n+1) + '\n' + srt_time(start) + ' --> ' + srt_time(end) + '\n' + caption + '\n\n')
        return ''.join(entries)

    def _print_formats(self, formats):
        """Print each format code with its extension and dimensions to stdout."""
        # Single-argument print(...) outputs the same text with or without
        # the print function being enabled.
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' % (x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the site language, then log in and confirm age if credentials exist.

        Does nothing when no downloader is attached. Credentials come from
        the 'username'/'password' downloader params or, with 'usenetrc',
        from the .netrc entry for _NETRC_MACHINE. Every failure is reported
        through the downloader and ends the initialization early.
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        # Set language
        request = urllib2.Request(self._LANG_URL)
        try:
            self.report_lang()
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        # Log in
        login_form = {
            'current_form': 'loginForm',
            'next': '/',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # Getting the login form back means the credentials were rejected
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

        # Confirm age
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
        try:
            self.report_age_confirmation()
            urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Extract the information dictionaries for the video at url.

        Reads the downloader params 'writesubtitles', 'subtitleslang',
        'format', 'format_limit', 'prefer_free_formats' and 'listformats'.

        Returns a list with one dictionary per selected format. Returns
        None after reporting a problem through the downloader, and also
        after printing the format list when 'listformats' is set.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + urllib.unquote(mobj.group(1)).lstrip('/')

        # Extract video id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(2)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
        try:
            video_webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Drop the backslashes that escape characters in the embedded URL
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info; try each "el" variant until one yields a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = urllib2.Request(video_info_url)
            try:
                video_info_webpage = urllib2.urlopen(request).read()
                video_info = parse_qs(video_info_webpage)
                if 'token' in video_info:
                    break
            except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
                return
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
            else:
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = urllib.unquote_plus(video_info['author'][0])

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = urllib.unquote_plus(video_info['title'][0])
        video_title = video_title.decode('utf-8')

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            # don't panic if we can't find it
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = u'NA'
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Turn separators into single spaces so the formats below can match
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except ValueError:
                    # Text does not fit this format; try the next one. If none
                    # fits, the cleaned-up page text is kept as it is.
                    continue
                break

        # description
        video_description = get_element_by_id("eow-description", video_webpage.decode('utf8'))
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            try:
                self.report_video_subtitles_download(video_id)
                request = urllib2.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
                try:
                    srt_list = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
                # Map language code -> track name
                srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
                if not srt_lang_list:
                    raise Trouble(u'WARNING: video has no closed captions')
                if self._downloader.params.get('subtitleslang', False):
                    srt_lang = self._downloader.params.get('subtitleslang')
                elif 'en' in srt_lang_list:
                    srt_lang = 'en'
                else:
                    srt_lang = next(iter(srt_lang_list))
                if srt_lang not in srt_lang_list:
                    raise Trouble(u'WARNING: no closed captions found in the specified language')
                request = urllib2.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
                try:
                    srt_xml = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                    raise Trouble(u'WARNING: unable to download video subtitles: %s' % str(err))
                if not srt_xml:
                    raise Trouble(u'WARNING: unable to download video subtitles')
                video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
            except Trouble as trouble:
                # The message is the exception's only argument
                self._downloader.trouble(trouble.args[0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            # Map itag -> stream URL, skipping entries without itag or url
            url_map = {}
            for uds in url_data_strs:
                ud = parse_qs(uds)
                if 'itag' not in ud or 'url' not in ud:
                    continue
                stream_url = ud['url'][0]
                # Only append the signature when the entry carries one;
                # indexing ud['sig'] unconditionally raised KeyError otherwise
                if 'sig' in ud:
                    stream_url += '&signature=' + ud['sig'][0]
                url_map[ud['itag'][0]] = stream_url

            format_limit = self._downloader.params.get('format_limit', None)
            if self._downloader.params.get('prefer_free_formats', False):
                available_formats = self._available_formats_prefer_free
            else:
                available_formats = self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Keep the limit itself and everything of lower quality
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if not existing_formats:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
        else:
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            results.append({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': u'NA' if format_param is None else format_param.decode('utf-8'),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles
            })
        return results
461
462
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        """Create the extractor, optionally attaching a downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Tell the user the disclaimer page is being retrieved."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Tell the user the age confirmation is being submitted."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Tell the user the page for video_id is being downloaded."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Tell the user information for video_id is being extracted."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page, then post the family-filter form.

        A network failure in either step is reported through the
        downloader and stops the initialization.
        """
        network_errors = (urllib2.URLError, httplib.HTTPException, socket.error)

        # Step 1: retrieve the disclaimer (the response body is not used)
        disclaimer_request = urllib2.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            urllib2.urlopen(disclaimer_request).read()
        except network_errors as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
            return

        # Step 2: confirm age by posting the filter form
        form_data = urllib.urlencode({
            'filters': '0',
            'submit': "Continue - I'm over 18",
        })
        filter_request = urllib2.Request(self._FILTER_POST, form_data)
        try:
            self.report_age_confirmation()
            urllib2.urlopen(filter_request).read()
        except network_errors as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
            return

    def _real_extract(self, url):
        """Return a one-element list describing the video at url.

        Ids of the form "yt-<id>" are handed to the downloader as a
        YouTube URL and nothing is returned. Any problem is reported
        through the downloader and None is returned.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = url_match.group(1)

        # Ids prefixed with "yt-" point at YouTube: delegate the download
        youtube_match = re.match(r'^yt-(.*)$', video_id)
        if youtube_match is not None:
            youtube_url = 'http://www.youtube.com/watch?v=%s' % youtube_match.group(1)
            self._downloader.download([youtube_url])
            return

        # Download the watch page
        page_request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(page_request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            return

        # Locate the media URL: either directly in the page or in flashvars
        self.report_extraction(video_id)
        direct_match = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if direct_match is None:
            flashvars_match = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if flashvars_match is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            flashvars = parse_qs(flashvars_match.group(1))
            if 'mediaData' not in flashvars:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            media_match = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', flashvars['mediaData'][0])
            if media_match is None:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
                return
            media_url = media_match.group(1).replace('\\/', '/')
            video_extension = media_url[-3:]
            video_url = '%s?__gda__=%s' % (media_url, media_match.group(2))
        else:
            media_url = urllib.unquote(direct_match.group(1))
            video_extension = media_url[-3:]

            # Append the gdaKey when the page provides one
            key_match = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if key_match is None:
                video_url = media_url
            else:
                video_url = '%s?__gda__=%s' % (media_url, key_match.group(1))

        title_match = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = title_match.group(1).decode('utf-8')

        uploader_match = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
        if uploader_match is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = uploader_match.group(1)

        info = {
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'format': u'NA',
            'player_url': None,
        }
        return [info]
590
591
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        """Create the extractor, optionally attaching a downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Tell the user the page for video_id is being downloaded."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Tell the user information for video_id is being extracted."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Return a one-element list describing the video at url.

        Any problem is reported through the downloader and None is
        returned.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = url_match.group(1)
        video_extension = 'flv'

        # Download the video page with the family filter cookie switched off
        page_request = urllib2.Request(url)
        page_request.add_header('Cookie', 'family_filter=off')
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(page_request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
            return

        # The media URL sits inside the URL-quoted "sequence" player variable
        self.report_extraction(video_id)
        sequence_match = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
        if sequence_match is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        sequence = urllib.unquote(sequence_match.group(1))
        sd_match = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
        if sd_match is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return

        # if needed add http://www.dailymotion.com/ if relative URL
        video_url = urllib.unquote(sd_match.group(1)).replace('\\', '')

        title_match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = unescapeHTML(title_match.group('title').decode('utf-8'))

        owner_match = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
        if owner_match is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = owner_match.group(1)

        info = {
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'format': u'NA',
            'player_url': None,
        }
        return [info]
669
670
class GoogleIE(InfoExtractor):
    """Information extractor for video.google.com."""

    _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
    IE_NAME = u'video.google'

    def __init__(self, downloader=None):
        """Create the extractor, optionally attaching a downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Return a one-element list describing the video at url.

        The thumbnail is only looked up when the 'forcethumbnail'
        downloader param is set; otherwise it is an empty string. The
        extracted description and thumbnail are part of the returned
        dictionary. Any problem is reported through the downloader and
        None is returned.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r"download_url:'([^']+)'", webpage)
        if mobj is None:
            # No download URL on the page: fall back to the flv player URL
            video_extension = 'flv'
            mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        mediaURL = urllib.unquote(mobj.group(1))
        # Turn the literal "\x3d" / "\x26" escapes into "=" / "&"
        mediaURL = mediaURL.replace('\\x3d', '\x3d')
        mediaURL = mediaURL.replace('\\x26', '\x26')

        video_url = mediaURL

        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video thumbnail
        if self._downloader.params.get('forcethumbnail', False):
            request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
            try:
                search_page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
                return
            mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', search_page)
            if mobj is None:
                self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
                return
            video_thumbnail = mobj.group(1)
        else: # we need something to pass to process_info
            video_thumbnail = ''

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': u'NA',
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'format': u'NA',
            # Previously extracted but dropped from the result
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
            'player_url': None,
        }]
764
765
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        """Create the extractor, optionally attaching a downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Tell the user the page for video_id is being downloaded."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Tell the user information for video_id is being extracted."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Return a one-element list describing the video at url.

        Any problem is reported through the downloader and None is
        returned.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = url_match.group(1)
        video_extension = 'flv'

        # Download the page the URL points at
        page_request = urllib2.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(page_request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # The media URL is the "file" parameter of the video_src link
        self.report_extraction(video_id)
        source_match = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if source_match is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = urllib.unquote(source_match.group(1))

        # The page title carries both the video title and the uploader
        title_match = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = title_match.group(1).decode('utf-8')
        video_uploader = title_match.group(2).decode('utf-8')

        info = {
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'format': u'NA',
            'player_url': None,
        }
        return [info]
831
832
833 class YahooIE(InfoExtractor):
834 """Information extractor for video.yahoo.com."""
835
836 # _VALID_URL matches all Yahoo! Video URLs
837 # _VPAGE_URL matches only the extractable '/watch/' URLs
838 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
839 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
840 IE_NAME = u'video.yahoo'
841
def __init__(self, downloader=None):
    """Create the extractor, optionally attaching a downloader."""
    InfoExtractor.__init__(self, downloader)
844
def report_download_webpage(self, video_id):
    """Tell the user the page for video_id is being downloaded."""
    message = u'[video.yahoo] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
848
def report_extraction(self, video_id):
    """Tell the user information for video_id is being extracted."""
    message = u'[video.yahoo] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
852
853 def _real_extract(self, url, new_video=True):
854 # Extract ID from URL
855 mobj = re.match(self._VALID_URL, url)
856 if mobj is None:
857 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
858 return
859
860 video_id = mobj.group(2)
861 video_extension = 'flv'
862
863 # Rewrite valid but non-extractable URLs as
864 # extractable English language /watch/ URLs
865 if re.match(self._VPAGE_URL, url) is None:
866 request = urllib2.Request(url)
867 try:
868 webpage = urllib2.urlopen(request).read()
869 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
870 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
871 return
872
873 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
874 if mobj is None:
875 self._downloader.trouble(u'ERROR: Unable to extract id field')
876 return
877 yahoo_id = mobj.group(1)
878
879 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
880 if mobj is None:
881 self._downloader.trouble(u'ERROR: Unable to extract vid field')
882 return
883 yahoo_vid = mobj.group(1)
884
885 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
886 return self._real_extract(url, new_video=False)
887
888 # Retrieve video webpage to extract further information
889 request = urllib2.Request(url)
890 try:
891 self.report_download_webpage(video_id)
892 webpage = urllib2.urlopen(request).read()
893 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
894 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
895 return
896
897 # Extract uploader and title from webpage
898 self.report_extraction(video_id)
899 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
900 if mobj is None:
901 self._downloader.trouble(u'ERROR: unable to extract video title')
902 return
903 video_title = mobj.group(1).decode('utf-8')
904
905 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
906 if mobj is None:
907 self._downloader.trouble(u'ERROR: unable to extract video uploader')
908 return
909 video_uploader = mobj.group(1).decode('utf-8')
910
911 # Extract video thumbnail
912 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
913 if mobj is None:
914 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
915 return
916 video_thumbnail = mobj.group(1).decode('utf-8')
917
918 # Extract video description
919 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
920 if mobj is None:
921 self._downloader.trouble(u'ERROR: unable to extract video description')
922 return
923 video_description = mobj.group(1).decode('utf-8')
924 if not video_description:
925 video_description = 'No description available.'
926
927 # Extract video height and width
928 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
929 if mobj is None:
930 self._downloader.trouble(u'ERROR: unable to extract video height')
931 return
932 yv_video_height = mobj.group(1)
933
934 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
935 if mobj is None:
936 self._downloader.trouble(u'ERROR: unable to extract video width')
937 return
938 yv_video_width = mobj.group(1)
939
940 # Retrieve video playlist to extract media URL
941 # I'm not completely sure what all these options are, but we
942 # seem to need most of them, otherwise the server sends a 401.
943 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
944 yv_bitrate = '700' # according to Wikipedia this is hard-coded
945 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
946 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
947 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
948 try:
949 self.report_download_webpage(video_id)
950 webpage = urllib2.urlopen(request).read()
951 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
952 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
953 return
954
955 # Extract media URL from playlist XML
956 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
957 if mobj is None:
958 self._downloader.trouble(u'ERROR: Unable to extract media URL')
959 return
960 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
961 video_url = unescapeHTML(video_url)
962
963 return [{
964 'id': video_id.decode('utf-8'),
965 'url': video_url,
966 'uploader': video_uploader,
967 'upload_date': u'NA',
968 'title': video_title,
969 'ext': video_extension.decode('utf-8'),
970 'thumbnail': video_thumbnail.decode('utf-8'),
971 'description': video_description,
972 'thumbnail': video_thumbnail,
973 'player_url': None,
974 }]
975
976
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract the play-redirect URL and metadata for a Vimeo URL.

        Most fields come from the JSON "config" object embedded in the
        video page; description and upload date are scraped from the
        HTML. The new_video argument is not read by this method.

        Returns a one-element list holding the information dictionary,
        or None after reporting the problem through the downloader.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and later we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON. Cutting the blob out of the page is
        # done inside the try block as well: a page without the marker
        # makes the [1] lookup raise IndexError, which must be reported
        # like a JSON parse failure rather than escape.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader
        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_id("description", webpage.decode('utf8'))
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date
        video_upload_date = u'NA'
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # TODO bind to format param
        # The first codec of this list that the config offers is used.
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        for codec in codecs:
            if codec[0] in config["video"]["files"]:
                video_codec = codec[0]
                video_extension = codec[1]
                if 'hd' in config["video"]["files"][codec[0]]:
                    quality = 'hd'
                else:
                    quality = 'sd'
                break
        else:
            # Loop finished without break: none of the codecs is offered
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    % (video_id, sig, timestamp, quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'player_url': None,
        }]
1078
1079
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect to new_url is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain.

        Issues a HEAD request (falling back to GET on HTTP 405) and
        compares the final URL with the requested one. When they differ,
        the final URL is handed back to the downloader and True is
        returned. False is returned when the URL does not redirect or
        when the probe itself fails; in the latter case the caller's own
        page download reports the error.
        """
        class HeadRequest(urllib2.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k, v) for k, v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(urllib2.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k, v) for k, v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(urllib2.Request(req.get_full_url(),
                                                        headers=newheaders,
                                                        origin_req_host=req.get_origin_req_host(),
                                                        unverifiable=True))

        # Build our opener
        opener = urllib2.OpenerDirector()
        for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
            opener.add_handler(handler())

        # The probe runs before _real_extract's own error handling, so a
        # failure here (bad URL, HTTP error, network problem) must not
        # escape: treat it as "no redirect" and let the regular download
        # attempt produce the error message.
        try:
            response = opener.open(HeadRequest(url))
        except (urllib2.URLError, httplib.HTTPException, socket.error, ValueError):
            return False
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        """Try to find a media URL on an arbitrary web page.

        Redirects are followed first (see _test_redirect). Otherwise the
        page is searched for a 'file=' or 'source=' parameter pointing to
        an http URL. The id and extension are derived from that URL's
        basename, the title from the page <title>, and the uploader from
        the host part of the page URL.

        Returns a one-element list holding the information dictionary,
        or None when a redirect was followed or an error was reported.
        """
        if self._test_redirect(url):
            return

        video_id = url.split('/')[-1]
        request = urllib2.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = urllib.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # Split the basename into the id (stem) and the extension
        # (without the leading dot)
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader')
            return
        video_uploader = mobj.group(1).decode('utf-8')

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'format': u'NA',
            'player_url': None,
        }]
1225
1226
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a 'ytsearch[N|all]:terms' query and download the results.

        Without a count one result is downloaded, 'all' requests
        _max_youtube_results results, and a numeric count above that
        maximum is clamped to it after a warning.
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first colon only: the search terms themselves may
        # contain ':' and must stay in one piece.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # drop the leading 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return

        # Only the integer conversion is guarded, so that a ValueError
        # raised while downloading is not mistaken for a bad count.
        try:
            n = long(prefix)
        except ValueError:  # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

        if n <= 0:
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            return
        if n > self._max_youtube_results:
            self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
        self._download_n_results(query, n)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # The API is queried 50 results at a time; the limit shrinks to
        # the total reported by the API when fewer than n results exist.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (urllib.quote_plus(query), (50 * pagenum) + 1)
            request = urllib2.Request(result_url)
            try:
                data = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
                return
            api_response = json.loads(data)['data']

            video_ids += [video['id'] for video in api_response['items']]

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may have pushed the list past n entries
        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1301
1302
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a 'gvsearch[N|all]:terms' query and download the results.

        Without a count one result is downloaded, 'all' requests
        _max_google_results results, and a numeric count above that
        maximum is clamped to it after a warning.
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first colon only: the search terms themselves may
        # contain ':' and must stay in one piece.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # drop the leading 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return

        # Only the integer conversion is guarded, so that a ValueError
        # raised while downloading is not mistaken for a bad count.
        try:
            n = long(prefix)
        except ValueError:  # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

        if n <= 0:
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            return
        if n > self._max_google_results:
            self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
        self._download_n_results(query, n)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            # Result pages are addressed by offset, ten results per page
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum * 10)
            request = urllib2.Request(result_url)
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for docid in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % docid])
                        return

            # No "next" link: download whatever was collected so far
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for docid in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % docid])
                return

            pagenum = pagenum + 1
1383
1384
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse a 'yvsearch[N|all]:terms' query and download the results.

        Without a count one result is downloaded, 'all' requests
        _max_yahoo_results results, and a numeric count above that
        maximum is clamped to it after a warning.
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first colon only: the search terms themselves may
        # contain ':' and must stay in one piece.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # drop the leading 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return

        # Only the integer conversion is guarded, so that a ValueError
        # raised while downloading is not mistaken for a bad count.
        try:
            n = long(prefix)
        except ValueError:  # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

        if n <= 0:
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            return
        if n > self._max_yahoo_results:
            self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
        self._download_n_results(query, n)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
            request = urllib2.Request(result_url)
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for watch_id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % watch_id])
                        return

            # No "Next" marker: download whatever was collected so far
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for watch_id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % watch_id])
                return

            pagenum = pagenum + 1
1467
1468
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&amp;list=(PL)?%s&'
    _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect the video ids of a playlist and download each of them.

        The playlist pages are fetched one after another until a page
        lacks the "next" pager marker. The 'playliststart' and
        'playlistend' downloader parameters select the slice of the
        collected ids that is handed to the downloader.
        """
        # Extract playlist id
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case
        single_video = url_match.group(3)
        if single_video is not None:
            self._downloader.download([single_video])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        if url_match.group(1) == 'a':
            playlist_prefix, playlist_access = 'a', 'artist'
        else:
            playlist_prefix, playlist_access = 'p', 'view_play_list'
        playlist_id = url_match.group(2)

        indicator = self._VIDEO_INDICATOR_TEMPLATE % playlist_id
        collected = []
        pagenum = 1
        while True:
            self.report_download_page(playlist_id, pagenum)
            page_url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = urllib2.Request(page_url)
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers, skipping repeats within the page
            page_ids = []
            for found in re.finditer(indicator, page):
                candidate = found.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            collected.extend(page_ids)

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break
            pagenum += 1

        # 'playliststart' is 1-based; a 'playlistend' of -1 means "up to
        # the last video"
        first = self._downloader.params.get('playliststart', 1) - 1
        last = self._downloader.params.get('playlistend', -1)
        if last == -1:
            last = None
        selected = collected[first:last]

        for video_id in selected:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
1540
1541
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect the ids of a user's uploads and download each video.

        The 'playliststart' and 'playlistend' downloader parameters
        select the slice of the collected ids that is downloaded.
        """
        # Extract username
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = url_match.group(1)

        # The upload feed is read in chunks of _GDATA_PAGE_SIZE entries.
        # Paging goes on until a chunk comes back with fewer ids than
        # that, which is taken to be the last one.
        collected = []
        page_size = self._GDATA_PAGE_SIZE
        page_index = 0

        while True:
            first_index = page_index * page_size + 1
            self.report_download_page(username, first_index)

            request = urllib2.Request(self._GDATA_URL % (username, page_size, first_index))

            try:
                feed = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers, skipping repeats within the page
            page_ids = []
            for found in re.finditer(self._VIDEO_INDICATOR, feed):
                candidate = found.group(1)
                if candidate not in page_ids:
                    page_ids.append(candidate)
            collected.extend(page_ids)

            # A page that is not "full" is assumed to be the last one,
            # so there is no need to query again.
            if len(page_ids) < page_size:
                break

            page_index += 1

        all_ids_count = len(collected)

        # 'playliststart' is 1-based; a 'playlistend' of -1 means "up to
        # the last video"
        first = self._downloader.params.get('playliststart', 1) - 1
        last = self._downloader.params.get('playlistend', -1)
        if last == -1:
            last = None
        selected = collected[first:last]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(selected)))

        for video_id in selected:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1623
1624
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Collect the episode paths of a blip.tv user and download them.

        The user page is fetched to read its 'data-users-id' attribute,
        which is then used to page through the episode list. The
        'playliststart' and 'playlistend' downloader parameters select
        the slice of the collected ids that is downloaded.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = urllib2.Request(url)

        try:
            page = urllib2.urlopen(request).read().decode('utf-8')
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
            return

        # Without the numeric user id the episode list cannot be
        # queried, so a page lacking it is reported instead of crashing
        # on a None match.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract blip.tv user id')
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = urllib2.Request(page_base + "&page=" + str(pagenum))

            try:
                page = urllib2.urlopen(request).read().decode('utf-8')
            except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # Unescape before the membership test: the list holds
                # unescaped ids, so comparing the raw text would let
                # duplicates containing HTML entities through.
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/' + video_id])
1715
1716
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Find the direct download URL and title of a DepositFiles file.

        Returns a one-element list holding the information dictionary,
        or None after reporting the problem through the downloader.
        """
        # The file id is the last path component; the page is always
        # requested in the English locale
        file_id = url.split('/')[-1]
        url = 'http://depositfiles.com/en/files/' + file_id

        # Posting this form field is the same as pressing the
        # 'Free download' button
        post_data = urllib.urlencode({'gateway_result': '1'})
        request = urllib2.Request(url, post_data)
        try:
            self.report_download_webpage(file_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
            return

        # Search for the real file URL
        link_match = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if link_match is None or link_match.group(1) is None:
            # No download form: try to figure out the reason of the error
            notice = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if notice is None or notice.group(1) is None:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            else:
                # Collapse whitespace runs of the notice into single spaces
                restriction_message = re.sub('\s+', ' ', notice.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            return

        file_url = link_match.group(1)
        # Extension without the leading dot
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        title_match = re.search(r'<b title="(.*?)">', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = title_match.group(1).decode('utf-8')

        info = {
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': u'NA',
            'upload_date': u'NA',
            'title': file_title,
            'ext': file_extension.decode('utf-8'),
            'format': u'NA',
            'player_url': None,
        }
        return [info]
1781
1782
1783 class FacebookIE(InfoExtractor):
1784 """Information Extractor for Facebook"""
1785
1786 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1787 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1788 _NETRC_MACHINE = 'facebook'
1789 _available_formats = ['video', 'highqual', 'lowqual']
1790 _video_extensions = {
1791 'video': 'mp4',
1792 'highqual': 'mp4',
1793 'lowqual': 'mp4',
1794 }
1795 IE_NAME = u'facebook'
1796
1797 def __init__(self, downloader=None):
1798 InfoExtractor.__init__(self, downloader)
1799
1800 def _reporter(self, message):
1801 """Add header and report message."""
1802 self._downloader.to_screen(u'[facebook] %s' % message)
1803
1804 def report_login(self):
1805 """Report attempt to log in."""
1806 self._reporter(u'Logging in')
1807
1808 def report_video_webpage_download(self, video_id):
1809 """Report attempt to download video webpage."""
1810 self._reporter(u'%s: Downloading video webpage' % video_id)
1811
1812 def report_information_extraction(self, video_id):
1813 """Report attempt to extract video information."""
1814 self._reporter(u'%s: Extracting video information' % video_id)
1815
1816 def _parse_page(self, video_webpage):
1817 """Extract video information from page"""
1818 # General data
1819 data = {'title': r'\("video_title", "(.*?)"\)',
1820 'description': r'<div class="datawrap">(.*?)</div>',
1821 'owner': r'\("video_owner_name", "(.*?)"\)',
1822 'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
1823 }
1824 video_info = {}
1825 for piece in data.keys():
1826 mobj = re.search(data[piece], video_webpage)
1827 if mobj is not None:
1828 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1829
1830 # Video urls
1831 video_urls = {}
1832 for fmt in self._available_formats:
1833 mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
1834 if mobj is not None:
1835 # URL is in a Javascript segment inside an escaped Unicode format within
1836 # the generally utf-8 page
1837 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
1838 video_info['video_urls'] = video_urls
1839
1840 return video_info
1841
1842 def _real_initialize(self):
1843 if self._downloader is None:
1844 return
1845
1846 useremail = None
1847 password = None
1848 downloader_params = self._downloader.params
1849
1850 # Attempt to use provided username and password or .netrc data
1851 if downloader_params.get('username', None) is not None:
1852 useremail = downloader_params['username']
1853 password = downloader_params['password']
1854 elif downloader_params.get('usenetrc', False):
1855 try:
1856 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1857 if info is not None:
1858 useremail = info[0]
1859 password = info[2]
1860 else:
1861 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1862 except (IOError, netrc.NetrcParseError), err:
1863 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
1864 return
1865
1866 if useremail is None:
1867 return
1868
1869 # Log in
1870 login_form = {
1871 'email': useremail,
1872 'pass': password,
1873 'login': 'Log+In'
1874 }
1875 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
1876 try:
1877 self.report_login()
1878 login_results = urllib2.urlopen(request).read()
1879 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
1880 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
1881 return
1882 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1883 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
1884 return
1885
1886 def _real_extract(self, url):
1887 mobj = re.match(self._VALID_URL, url)
1888 if mobj is None:
1889 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
1890 return
1891 video_id = mobj.group('ID')
1892
1893 # Get video webpage
1894 self.report_video_webpage_download(video_id)
1895 request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
1896 try:
1897 page = urllib2.urlopen(request)
1898 video_webpage = page.read()
1899 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1900 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
1901 return
1902
1903 # Start extracting information
1904 self.report_information_extraction(video_id)
1905
1906 # Extract information
1907 video_info = self._parse_page(video_webpage)
1908
1909 # uploader
1910 if 'owner' not in video_info:
1911 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1912 return
1913 video_uploader = video_info['owner']
1914
1915 # title
1916 if 'title' not in video_info:
1917 self._downloader.trouble(u'ERROR: unable to extract video title')
1918 return
1919 video_title = video_info['title']
1920 video_title = video_title.decode('utf-8')
1921
1922 # thumbnail image
1923 if 'thumbnail' not in video_info:
1924 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
1925 video_thumbnail = ''
1926 else:
1927 video_thumbnail = video_info['thumbnail']
1928
1929 # upload date
1930 upload_date = u'NA'
1931 if 'upload_date' in video_info:
1932 upload_time = video_info['upload_date']
1933 timetuple = email.utils.parsedate_tz(upload_time)
1934 if timetuple is not None:
1935 try:
1936 upload_date = time.strftime('%Y%m%d', timetuple[0:9])
1937 except:
1938 pass
1939
1940 # description
1941 video_description = video_info.get('description', 'No description available.')
1942
1943 url_map = video_info['video_urls']
1944 if len(url_map.keys()) > 0:
1945 # Decide which formats to download
1946 req_format = self._downloader.params.get('format', None)
1947 format_limit = self._downloader.params.get('format_limit', None)
1948
1949 if format_limit is not None and format_limit in self._available_formats:
1950 format_list = self._available_formats[self._available_formats.index(format_limit):]
1951 else:
1952 format_list = self._available_formats
1953 existing_formats = [x for x in format_list if x in url_map]
1954 if len(existing_formats) == 0:
1955 self._downloader.trouble(u'ERROR: no known formats available for video')
1956 return
1957 if req_format is None:
1958 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1959 elif req_format == 'worst':
1960 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
1961 elif req_format == '-1':
1962 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1963 else:
1964 # Specific format
1965 if req_format not in url_map:
1966 self._downloader.trouble(u'ERROR: requested format not available')
1967 return
1968 video_url_list = [(req_format, url_map[req_format])] # Specific format
1969
1970 results = []
1971 for format_param, video_real_url in video_url_list:
1972 # Extension
1973 video_extension = self._video_extensions.get(format_param, 'mp4')
1974
1975 results.append({
1976 'id': video_id.decode('utf-8'),
1977 'url': video_real_url.decode('utf-8'),
1978 'uploader': video_uploader.decode('utf-8'),
1979 'upload_date': upload_date,
1980 'title': video_title,
1981 'ext': video_extension.decode('utf-8'),
1982 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1983 'thumbnail': video_thumbnail.decode('utf-8'),
1984 'description': video_description.decode('utf-8'),
1985 'player_url': None,
1986 })
1987 return results
1988
1989 class BlipTVIE(InfoExtractor):
1990 """Information extractor for blip.tv"""
1991
1992 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
1993 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1994 IE_NAME = u'blip.tv'
1995
1996 def report_extraction(self, file_id):
1997 """Report information extraction."""
1998 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
1999
2000 def report_direct_download(self, title):
2001 """Report information extraction."""
2002 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2003
2004 def _real_extract(self, url):
2005 mobj = re.match(self._VALID_URL, url)
2006 if mobj is None:
2007 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2008 return
2009
2010 if '?' in url:
2011 cchar = '&'
2012 else:
2013 cchar = '?'
2014 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2015 request = urllib2.Request(json_url.encode('utf-8'))
2016 self.report_extraction(mobj.group(1))
2017 info = None
2018 try:
2019 urlh = urllib2.urlopen(request)
2020 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2021 basename = url.split('/')[-1]
2022 title,ext = os.path.splitext(basename)
2023 title = title.decode('UTF-8')
2024 ext = ext.replace('.', '')
2025 self.report_direct_download(title)
2026 info = {
2027 'id': title,
2028 'url': url,
2029 'title': title,
2030 'ext': ext,
2031 'urlhandle': urlh
2032 }
2033 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2034 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
2035 return
2036 if info is None: # Regular URL
2037 try:
2038 json_code = urlh.read()
2039 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2040 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % str(err))
2041 return
2042
2043 try:
2044 json_data = json.loads(json_code)
2045 if 'Post' in json_data:
2046 data = json_data['Post']
2047 else:
2048 data = json_data
2049
2050 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2051 video_url = data['media']['url']
2052 umobj = re.match(self._URL_EXT, video_url)
2053 if umobj is None:
2054 raise ValueError('Can not determine filename extension')
2055 ext = umobj.group(1)
2056
2057 info = {
2058 'id': data['item_id'],
2059 'url': video_url,
2060 'uploader': data['display_name'],
2061 'upload_date': upload_date,
2062 'title': data['title'],
2063 'ext': ext,
2064 'format': data['media']['mimeType'],
2065 'thumbnail': data['thumbnailUrl'],
2066 'description': data['description'],
2067 'player_url': data['embedUrl']
2068 }
2069 except (ValueError,KeyError), err:
2070 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2071 return
2072
2073 std_headers['User-Agent'] = 'iTunes/10.6.1'
2074 return [info]
2075
2076
2077 class MyVideoIE(InfoExtractor):
2078 """Information Extractor for myvideo.de."""
2079
2080 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2081 IE_NAME = u'myvideo'
2082
2083 def __init__(self, downloader=None):
2084 InfoExtractor.__init__(self, downloader)
2085
2086 def report_download_webpage(self, video_id):
2087 """Report webpage download."""
2088 self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
2089
2090 def report_extraction(self, video_id):
2091 """Report information extraction."""
2092 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2093
2094 def _real_extract(self,url):
2095 mobj = re.match(self._VALID_URL, url)
2096 if mobj is None:
2097 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2098 return
2099
2100 video_id = mobj.group(1)
2101
2102 # Get video webpage
2103 request = urllib2.Request('http://www.myvideo.de/watch/%s' % video_id)
2104 try:
2105 self.report_download_webpage(video_id)
2106 webpage = urllib2.urlopen(request).read()
2107 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2108 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
2109 return
2110
2111 self.report_extraction(video_id)
2112 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
2113 webpage)
2114 if mobj is None:
2115 self._downloader.trouble(u'ERROR: unable to extract media URL')
2116 return
2117 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2118
2119 mobj = re.search('<title>([^<]+)</title>', webpage)
2120 if mobj is None:
2121 self._downloader.trouble(u'ERROR: unable to extract title')
2122 return
2123
2124 video_title = mobj.group(1)
2125
2126 return [{
2127 'id': video_id,
2128 'url': video_url,
2129 'uploader': u'NA',
2130 'upload_date': u'NA',
2131 'title': video_title,
2132 'ext': u'flv',
2133 'format': u'NA',
2134 'player_url': None,
2135 }]
2136
2137 class ComedyCentralIE(InfoExtractor):
2138 """Information extractor for The Daily Show and Colbert Report """
2139
2140 _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
2141 IE_NAME = u'comedycentral'
2142
2143 def report_extraction(self, episode_id):
2144 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2145
2146 def report_config_download(self, episode_id):
2147 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)
2148
2149 def report_index_download(self, episode_id):
2150 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2151
2152 def report_player_url(self, episode_id):
2153 self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)
2154
2155 def _real_extract(self, url):
2156 mobj = re.match(self._VALID_URL, url)
2157 if mobj is None:
2158 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2159 return
2160
2161 if mobj.group('shortname'):
2162 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2163 url = u'http://www.thedailyshow.com/full-episodes/'
2164 else:
2165 url = u'http://www.colbertnation.com/full-episodes/'
2166 mobj = re.match(self._VALID_URL, url)
2167 assert mobj is not None
2168
2169 dlNewest = not mobj.group('episode')
2170 if dlNewest:
2171 epTitle = mobj.group('showname')
2172 else:
2173 epTitle = mobj.group('episode')
2174
2175 req = urllib2.Request(url)
2176 self.report_extraction(epTitle)
2177 try:
2178 htmlHandle = urllib2.urlopen(req)
2179 html = htmlHandle.read()
2180 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2181 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2182 return
2183 if dlNewest:
2184 url = htmlHandle.geturl()
2185 mobj = re.match(self._VALID_URL, url)
2186 if mobj is None:
2187 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2188 return
2189 if mobj.group('episode') == '':
2190 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2191 return
2192 epTitle = mobj.group('episode')
2193
2194 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)
2195 if len(mMovieParams) == 0:
2196 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2197 return
2198
2199 playerUrl_raw = mMovieParams[0][0]
2200 self.report_player_url(epTitle)
2201 try:
2202 urlHandle = urllib2.urlopen(playerUrl_raw)
2203 playerUrl = urlHandle.geturl()
2204 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2205 self._downloader.trouble(u'ERROR: unable to find out player URL: ' + unicode(err))
2206 return
2207
2208 uri = mMovieParams[0][1]
2209 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + urllib.urlencode({'uri': uri})
2210 self.report_index_download(epTitle)
2211 try:
2212 indexXml = urllib2.urlopen(indexUrl).read()
2213 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2214 self._downloader.trouble(u'ERROR: unable to download episode index: ' + unicode(err))
2215 return
2216
2217 results = []
2218
2219 idoc = xml.etree.ElementTree.fromstring(indexXml)
2220 itemEls = idoc.findall('.//item')
2221 for itemEl in itemEls:
2222 mediaId = itemEl.findall('./guid')[0].text
2223 shortMediaId = mediaId.split(':')[-1]
2224 showId = mediaId.split(':')[-2].replace('.com', '')
2225 officialTitle = itemEl.findall('./title')[0].text
2226 officialDate = itemEl.findall('./pubDate')[0].text
2227
2228 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2229 urllib.urlencode({'uri': mediaId}))
2230 configReq = urllib2.Request(configUrl)
2231 self.report_config_download(epTitle)
2232 try:
2233 configXml = urllib2.urlopen(configReq).read()
2234 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2235 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % unicode(err))
2236 return
2237
2238 cdoc = xml.etree.ElementTree.fromstring(configXml)
2239 turls = []
2240 for rendition in cdoc.findall('.//rendition'):
2241 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2242 turls.append(finfo)
2243
2244 if len(turls) == 0:
2245 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2246 continue
2247
2248 # For now, just pick the highest bitrate
2249 format,video_url = turls[-1]
2250
2251 effTitle = showId + u'-' + epTitle
2252 info = {
2253 'id': shortMediaId,
2254 'url': video_url,
2255 'uploader': showId,
2256 'upload_date': officialDate,
2257 'title': effTitle,
2258 'ext': 'mp4',
2259 'format': format,
2260 'thumbnail': None,
2261 'description': officialTitle,
2262 'player_url': playerUrl
2263 }
2264
2265 results.append(info)
2266
2267 return results
2268
2269
2270 class EscapistIE(InfoExtractor):
2271 """Information extractor for The Escapist """
2272
2273 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2274 IE_NAME = u'escapist'
2275
2276 def report_extraction(self, showName):
2277 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2278
2279 def report_config_download(self, showName):
2280 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2281
2282 def _real_extract(self, url):
2283 mobj = re.match(self._VALID_URL, url)
2284 if mobj is None:
2285 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2286 return
2287 showName = mobj.group('showname')
2288 videoId = mobj.group('episode')
2289
2290 self.report_extraction(showName)
2291 try:
2292 webPage = urllib2.urlopen(url)
2293 webPageBytes = webPage.read()
2294 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2295 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2296 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2297 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2298 return
2299
2300 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2301 description = unescapeHTML(descMatch.group(1))
2302 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2303 imgUrl = unescapeHTML(imgMatch.group(1))
2304 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2305 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2306 configUrlMatch = re.search('config=(.*)$', playerUrl)
2307 configUrl = urllib2.unquote(configUrlMatch.group(1))
2308
2309 self.report_config_download(showName)
2310 try:
2311 configJSON = urllib2.urlopen(configUrl).read()
2312 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2313 self._downloader.trouble(u'ERROR: unable to download configuration: ' + unicode(err))
2314 return
2315
2316 # Technically, it's JavaScript, not JSON
2317 configJSON = configJSON.replace("'", '"')
2318
2319 try:
2320 config = json.loads(configJSON)
2321 except (ValueError,), err:
2322 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + unicode(err))
2323 return
2324
2325 playlist = config['playlist']
2326 videoUrl = playlist[1]['url']
2327
2328 info = {
2329 'id': videoId,
2330 'url': videoUrl,
2331 'uploader': showName,
2332 'upload_date': None,
2333 'title': showName,
2334 'ext': 'flv',
2335 'format': 'flv',
2336 'thumbnail': imgUrl,
2337 'description': description,
2338 'player_url': playerUrl,
2339 }
2340
2341 return [info]
2342
2343
2344 class CollegeHumorIE(InfoExtractor):
2345 """Information extractor for collegehumor.com"""
2346
2347 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2348 IE_NAME = u'collegehumor'
2349
2350 def report_webpage(self, video_id):
2351 """Report information extraction."""
2352 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2353
2354 def report_extraction(self, video_id):
2355 """Report information extraction."""
2356 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2357
2358 def _real_extract(self, url):
2359 mobj = re.match(self._VALID_URL, url)
2360 if mobj is None:
2361 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2362 return
2363 video_id = mobj.group('videoid')
2364
2365 self.report_webpage(video_id)
2366 request = urllib2.Request(url)
2367 try:
2368 webpage = urllib2.urlopen(request).read()
2369 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2370 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2371 return
2372
2373 m = re.search(r'id="video:(?P<internalvideoid>[0-9]+)"', webpage)
2374 if m is None:
2375 self._downloader.trouble(u'ERROR: Cannot extract internal video ID')
2376 return
2377 internal_video_id = m.group('internalvideoid')
2378
2379 info = {
2380 'id': video_id,
2381 'internal_id': internal_video_id,
2382 }
2383
2384 self.report_extraction(video_id)
2385 xmlUrl = 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2386 try:
2387 metaXml = urllib2.urlopen(xmlUrl).read()
2388 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2389 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err))
2390 return
2391
2392 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2393 try:
2394 videoNode = mdoc.findall('./video')[0]
2395 info['description'] = videoNode.findall('./description')[0].text
2396 info['title'] = videoNode.findall('./caption')[0].text
2397 info['url'] = videoNode.findall('./file')[0].text
2398 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2399 info['ext'] = info['url'].rpartition('.')[2]
2400 info['format'] = info['ext']
2401 except IndexError:
2402 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2403 return
2404
2405 return [info]
2406
2407
2408 class XVideosIE(InfoExtractor):
2409 """Information extractor for xvideos.com"""
2410
2411 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2412 IE_NAME = u'xvideos'
2413
2414 def report_webpage(self, video_id):
2415 """Report information extraction."""
2416 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2417
2418 def report_extraction(self, video_id):
2419 """Report information extraction."""
2420 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2421
2422 def _real_extract(self, url):
2423 mobj = re.match(self._VALID_URL, url)
2424 if mobj is None:
2425 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2426 return
2427 video_id = mobj.group(1).decode('utf-8')
2428
2429 self.report_webpage(video_id)
2430
2431 request = urllib2.Request(r'http://www.xvideos.com/video' + video_id)
2432 try:
2433 webpage = urllib2.urlopen(request).read()
2434 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2435 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2436 return
2437
2438 self.report_extraction(video_id)
2439
2440
2441 # Extract video URL
2442 mobj = re.search(r'flv_url=(.+?)&', webpage)
2443 if mobj is None:
2444 self._downloader.trouble(u'ERROR: unable to extract video url')
2445 return
2446 video_url = urllib2.unquote(mobj.group(1).decode('utf-8'))
2447
2448
2449 # Extract title
2450 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2451 if mobj is None:
2452 self._downloader.trouble(u'ERROR: unable to extract video title')
2453 return
2454 video_title = mobj.group(1).decode('utf-8')
2455
2456
2457 # Extract video thumbnail
2458 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2459 if mobj is None:
2460 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2461 return
2462 video_thumbnail = mobj.group(0).decode('utf-8')
2463
2464 info = {
2465 'id': video_id,
2466 'url': video_url,
2467 'uploader': None,
2468 'upload_date': None,
2469 'title': video_title,
2470 'ext': 'flv',
2471 'format': 'flv',
2472 'thumbnail': video_thumbnail,
2473 'description': None,
2474 'player_url': None,
2475 }
2476
2477 return [info]
2478
2479
2480 class SoundcloudIE(InfoExtractor):
2481 """Information extractor for soundcloud.com
2482 To access the media, the uid of the song and a stream token
2483 must be extracted from the page source and the script must make
2484 a request to media.soundcloud.com/crossdomain.xml. Then
2485 the media can be grabbed by requesting from an url composed
2486 of the stream token and uid
2487 """
2488
2489 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2490 IE_NAME = u'soundcloud'
2491
2492 def __init__(self, downloader=None):
2493 InfoExtractor.__init__(self, downloader)
2494
2495 def report_webpage(self, video_id):
2496 """Report information extraction."""
2497 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2498
2499 def report_extraction(self, video_id):
2500 """Report information extraction."""
2501 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2502
2503 def _real_extract(self, url):
2504 mobj = re.match(self._VALID_URL, url)
2505 if mobj is None:
2506 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2507 return
2508
2509 # extract uploader (which is in the url)
2510 uploader = mobj.group(1).decode('utf-8')
2511 # extract simple title (uploader + slug of song title)
2512 slug_title = mobj.group(2).decode('utf-8')
2513 simple_title = uploader + u'-' + slug_title
2514
2515 self.report_webpage('%s/%s' % (uploader, slug_title))
2516
2517 request = urllib2.Request('http://soundcloud.com/%s/%s' % (uploader, slug_title))
2518 try:
2519 webpage = urllib2.urlopen(request).read()
2520 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2521 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2522 return
2523
2524 self.report_extraction('%s/%s' % (uploader, slug_title))
2525
2526 # extract uid and stream token that soundcloud hands out for access
2527 mobj = re.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage)
2528 if mobj:
2529 video_id = mobj.group(1)
2530 stream_token = mobj.group(2)
2531
2532 # extract unsimplified title
2533 mobj = re.search('"title":"(.*?)",', webpage)
2534 if mobj:
2535 title = mobj.group(1).decode('utf-8')
2536 else:
2537 title = simple_title
2538
2539 # construct media url (with uid/token)
2540 mediaURL = "http://media.soundcloud.com/stream/%s?stream_token=%s"
2541 mediaURL = mediaURL % (video_id, stream_token)
2542
2543 # description
2544 description = u'No description available'
2545 mobj = re.search('track-description-value"><p>(.*?)</p>', webpage)
2546 if mobj:
2547 description = mobj.group(1)
2548
2549 # upload date
2550 upload_date = None
2551 mobj = re.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage)
2552 if mobj:
2553 try:
2554 upload_date = datetime.datetime.strptime(mobj.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2555 except Exception, e:
2556 self._downloader.to_stderr(str(e))
2557
2558 # for soundcloud, a request to a cross domain is required for cookies
2559 request = urllib2.Request('http://media.soundcloud.com/crossdomain.xml', std_headers)
2560
2561 return [{
2562 'id': video_id.decode('utf-8'),
2563 'url': mediaURL,
2564 'uploader': uploader.decode('utf-8'),
2565 'upload_date': upload_date,
2566 'title': title,
2567 'ext': u'mp3',
2568 'format': u'NA',
2569 'player_url': None,
2570 'description': description.decode('utf-8')
2571 }]
2572
2573
2574 class InfoQIE(InfoExtractor):
2575 """Information extractor for infoq.com"""
2576
2577 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2578 IE_NAME = u'infoq'
2579
2580 def report_webpage(self, video_id):
2581 """Report information extraction."""
2582 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2583
2584 def report_extraction(self, video_id):
2585 """Report information extraction."""
2586 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2587
2588 def _real_extract(self, url):
2589 mobj = re.match(self._VALID_URL, url)
2590 if mobj is None:
2591 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2592 return
2593
2594 self.report_webpage(url)
2595
2596 request = urllib2.Request(url)
2597 try:
2598 webpage = urllib2.urlopen(request).read()
2599 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2600 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2601 return
2602
2603 self.report_extraction(url)
2604
2605
2606 # Extract video URL
2607 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2608 if mobj is None:
2609 self._downloader.trouble(u'ERROR: unable to extract video url')
2610 return
2611 video_url = 'rtmpe://video.infoq.com/cfx/st/' + urllib2.unquote(mobj.group(1).decode('base64'))
2612
2613
2614 # Extract title
2615 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2616 if mobj is None:
2617 self._downloader.trouble(u'ERROR: unable to extract video title')
2618 return
2619 video_title = mobj.group(1).decode('utf-8')
2620
2621 # Extract description
2622 video_description = u'No description available.'
2623 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2624 if mobj is not None:
2625 video_description = mobj.group(1).decode('utf-8')
2626
2627 video_filename = video_url.split('/')[-1]
2628 video_id, extension = video_filename.split('.')
2629
2630 info = {
2631 'id': video_id,
2632 'url': video_url,
2633 'uploader': None,
2634 'upload_date': None,
2635 'title': video_title,
2636 'ext': extension,
2637 'format': extension, # Extension is always(?) mp4, but seems to be flv
2638 'thumbnail': None,
2639 'description': video_description,
2640 'player_url': None,
2641 }
2642
2643 return [info]
2644
2645 class MixcloudIE(InfoExtractor):
2646 """Information extractor for www.mixcloud.com"""
2647 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2648 IE_NAME = u'mixcloud'
2649
2650 def __init__(self, downloader=None):
2651 InfoExtractor.__init__(self, downloader)
2652
2653 def report_download_json(self, file_id):
2654 """Report JSON download."""
2655 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2656
2657 def report_extraction(self, file_id):
2658 """Report information extraction."""
2659 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2660
2661 def get_urls(self, jsonData, fmt, bitrate='best'):
2662 """Get urls from 'audio_formats' section in json"""
2663 file_url = None
2664 try:
2665 bitrate_list = jsonData[fmt]
2666 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2667 bitrate = max(bitrate_list) # select highest
2668
2669 url_list = jsonData[fmt][bitrate]
2670 except TypeError: # we have no bitrate info.
2671 url_list = jsonData[fmt]
2672 return url_list
2673
2674 def check_urls(self, url_list):
2675 """Returns 1st active url from list"""
2676 for url in url_list:
2677 try:
2678 urllib2.urlopen(url)
2679 return url
2680 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2681 url = None
2682
2683 return None
2684
2685 def _print_formats(self, formats):
2686 print 'Available formats:'
2687 for fmt in formats.keys():
2688 for b in formats[fmt]:
2689 try:
2690 ext = formats[fmt][b][0]
2691 print '%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1])
2692 except TypeError: # we have no bitrate info
2693 ext = formats[fmt][0]
2694 print '%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1])
2695 break
2696
2697 def _real_extract(self, url):
2698 mobj = re.match(self._VALID_URL, url)
2699 if mobj is None:
2700 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2701 return
2702 # extract uploader & filename from url
2703 uploader = mobj.group(1).decode('utf-8')
2704 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2705
2706 # construct API request
2707 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2708 # retrieve .json file with links to files
2709 request = urllib2.Request(file_url)
2710 try:
2711 self.report_download_json(file_url)
2712 jsonData = urllib2.urlopen(request).read()
2713 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2714 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % str(err))
2715 return
2716
2717 # parse JSON
2718 json_data = json.loads(jsonData)
2719 player_url = json_data['player_swf_url']
2720 formats = dict(json_data['audio_formats'])
2721
2722 req_format = self._downloader.params.get('format', None)
2723 bitrate = None
2724
2725 if self._downloader.params.get('listformats', None):
2726 self._print_formats(formats)
2727 return
2728
2729 if req_format is None or req_format == 'best':
2730 for format_param in formats.keys():
2731 url_list = self.get_urls(formats, format_param)
2732 # check urls
2733 file_url = self.check_urls(url_list)
2734 if file_url is not None:
2735 break # got it!
2736 else:
2737 if req_format not in formats.keys():
2738 self._downloader.trouble(u'ERROR: format is not available')
2739 return
2740
2741 url_list = self.get_urls(formats, req_format)
2742 file_url = self.check_urls(url_list)
2743 format_param = req_format
2744
2745 return [{
2746 'id': file_id.decode('utf-8'),
2747 'url': file_url.decode('utf-8'),
2748 'uploader': uploader.decode('utf-8'),
2749 'upload_date': u'NA',
2750 'title': json_data['name'],
2751 'ext': file_url.split('.')[-1].decode('utf-8'),
2752 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2753 'thumbnail': json_data['thumbnail_url'],
2754 'description': json_data['description'],
2755 'player_url': player_url.decode('utf-8'),
2756 }]
2757
2758 class StanfordOpenClassroomIE(InfoExtractor):
2759 """Information extractor for Stanford's Open ClassRoom"""
2760
2761 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2762 IE_NAME = u'stanfordoc'
2763
2764 def report_download_webpage(self, objid):
2765 """Report information extraction."""
2766 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2767
2768 def report_extraction(self, video_id):
2769 """Report information extraction."""
2770 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2771
2772 def _real_extract(self, url):
2773 mobj = re.match(self._VALID_URL, url)
2774 if mobj is None:
2775 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2776 return
2777
2778 if mobj.group('course') and mobj.group('video'): # A specific video
2779 course = mobj.group('course')
2780 video = mobj.group('video')
2781 info = {
2782 'id': course + '_' + video,
2783 }
2784
2785 self.report_extraction(info['id'])
2786 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2787 xmlUrl = baseUrl + video + '.xml'
2788 try:
2789 metaXml = urllib2.urlopen(xmlUrl).read()
2790 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2791 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err))
2792 return
2793 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2794 try:
2795 info['title'] = mdoc.findall('./title')[0].text
2796 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2797 except IndexError:
2798 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2799 return
2800 info['ext'] = info['url'].rpartition('.')[2]
2801 info['format'] = info['ext']
2802 return [info]
2803 elif mobj.group('course'): # A course page
2804 course = mobj.group('course')
2805 info = {
2806 'id': course,
2807 'type': 'playlist',
2808 }
2809
2810 self.report_download_webpage(info['id'])
2811 try:
2812 coursepage = urllib2.urlopen(url).read()
2813 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2814 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2815 return
2816
2817 m = re.search('<h1>([^<]+)</h1>', coursepage)
2818 if m:
2819 info['title'] = unescapeHTML(m.group(1))
2820 else:
2821 info['title'] = info['id']
2822
2823 m = re.search('<description>([^<]+)</description>', coursepage)
2824 if m:
2825 info['description'] = unescapeHTML(m.group(1))
2826
2827 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2828 info['list'] = [
2829 {
2830 'type': 'reference',
2831 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2832 }
2833 for vpage in links]
2834 results = []
2835 for entry in info['list']:
2836 assert entry['type'] == 'reference'
2837 results += self.extract(entry['url'])
2838 return results
2839
2840 else: # Root page
2841 info = {
2842 'id': 'Stanford OpenClassroom',
2843 'type': 'playlist',
2844 }
2845
2846 self.report_download_webpage(info['id'])
2847 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2848 try:
2849 rootpage = urllib2.urlopen(rootURL).read()
2850 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2851 self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err))
2852 return
2853
2854 info['title'] = info['id']
2855
2856 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2857 info['list'] = [
2858 {
2859 'type': 'reference',
2860 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
2861 }
2862 for cpage in links]
2863
2864 results = []
2865 for entry in info['list']:
2866 assert entry['type'] == 'reference'
2867 results += self.extract(entry['url'])
2868 return results
2869
2870 class MTVIE(InfoExtractor):
2871 """Information extractor for MTV.com"""
2872
2873 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2874 IE_NAME = u'mtv'
2875
2876 def report_webpage(self, video_id):
2877 """Report information extraction."""
2878 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
2879
2880 def report_extraction(self, video_id):
2881 """Report information extraction."""
2882 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2883
2884 def _real_extract(self, url):
2885 mobj = re.match(self._VALID_URL, url)
2886 if mobj is None:
2887 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2888 return
2889 if not mobj.group('proto'):
2890 url = 'http://' + url
2891 video_id = mobj.group('videoid')
2892 self.report_webpage(video_id)
2893
2894 request = urllib2.Request(url)
2895 try:
2896 webpage = urllib2.urlopen(request).read()
2897 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2898 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
2899 return
2900
2901 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
2902 if mobj is None:
2903 self._downloader.trouble(u'ERROR: unable to extract song name')
2904 return
2905 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2906 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
2907 if mobj is None:
2908 self._downloader.trouble(u'ERROR: unable to extract performer')
2909 return
2910 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
2911 video_title = performer + ' - ' + song_name
2912
2913 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
2914 if mobj is None:
2915 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
2916 return
2917 mtvn_uri = mobj.group(1)
2918
2919 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
2920 if mobj is None:
2921 self._downloader.trouble(u'ERROR: unable to extract content id')
2922 return
2923 content_id = mobj.group(1)
2924
2925 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2926 self.report_extraction(video_id)
2927 request = urllib2.Request(videogen_url)
2928 try:
2929 metadataXml = urllib2.urlopen(request).read()
2930 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2931 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err))
2932 return
2933
2934 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2935 renditions = mdoc.findall('.//rendition')
2936
2937 # For now, always pick the highest quality.
2938 rendition = renditions[-1]
2939
2940 try:
2941 _,_,ext = rendition.attrib['type'].partition('/')
2942 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2943 video_url = rendition.find('./src').text
2944 except KeyError:
2945 self._downloader.trouble('Invalid rendition field.')
2946 return
2947
2948 info = {
2949 'id': video_id,
2950 'url': video_url,
2951 'uploader': performer,
2952 'title': video_title,
2953 'ext': ext,
2954 'format': format,
2955 }
2956
2957 return [info]