]>
Raphaël G. Git Repositories - youtubedl/blob - youtube-dl
a8e3bd36cd32d6c50bbc4cc686c32496590d2fa9
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # Author: Vasyl' Vavrychuk
7 # License: Public domain code
26 # parse_qs was moved from the cgi module to the urlparse module recently.
28 from urlparse
import parse_qs
30 from cgi
import parse_qs
33 'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101028 Firefox/3.6.12',
34 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
35 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
36 'Accept-Language': 'en-us,en;q=0.5',
39 simple_title_chars
= string
.ascii_letters
.decode('ascii') + string
.digits
.decode('ascii')
41 def preferredencoding():
42 """Get preferred encoding.
44 Returns the best encoding scheme for the system, based on
45 locale.getpreferredencoding() and some further tweaks.
47 def yield_preferredencoding():
49 pref
= locale
.getpreferredencoding()
55 return yield_preferredencoding().next()
def htmlentity_transform(matchobj):
    """Transform an HTML entity into the Unicode character it denotes.

    This function receives a match object and is intended to be used as
    the replacement callback of re.sub(). Group 1 of ``matchobj`` must
    hold the entity body, i.e. the text between ``&`` and ``;``.

    Returns the decoded character for a known named entity or for a valid
    decimal / hexadecimal numeric reference. Anything else is returned
    unchanged in its literal ``&...;`` form.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Numeric character reference. Hexadecimal digits are matched
    # explicitly and the pattern is anchored at the end, so that a
    # reference such as "#xe9" or "#x1F" is decoded as a whole instead of
    # being rejected or truncated at the first letter.
    mobj = re.match(u'(?u)#(?:[xX]([0-9a-fA-F]+)|([0-9]+))$', entity)
    if mobj is not None:
        hexstr, decstr = mobj.groups()
        try:
            if hexstr is not None:
                return unichr(long(hexstr, 16))
            return unichr(long(decstr, 10))
        except (ValueError, OverflowError):
            # Code point outside the range unichr() accepts: fall through
            # and hand the entity back untouched.
            pass

    # Unknown entity in name, return its literal representation
    return u'&%s;' % entity
def sanitize_title(utitle):
    """Return a video title in a form usable as part of a filename.

    HTML entities are decoded through htmlentity_transform() and every
    occurrence of the path separator is replaced by a percent sign.
    """
    # Same pattern text as before; it contains no backslashes, so the raw
    # prefix is not needed to spell it.
    decoded = re.sub(u'(?u)&(.+?);', htmlentity_transform, utitle)
    separator = unicode(os.sep)
    return decoded.replace(separator, u'%')
88 def sanitize_open(filename
, open_mode
):
89 """Try to open the given filename, and slightly tweak it if this fails.
91 Attempts to open the given filename. If this fails, it tries to change
92 the filename slightly, step by step, until it's either able to open it
93 or it fails and raises a final exception, like the standard open()
96 It returns the tuple (stream, definitive_file_name).
100 if sys
.platform
== 'win32':
102 msvcrt
.setmode(sys
.stdout
.fileno(), os
.O_BINARY
)
103 return (sys
.stdout
, filename
)
104 stream
= open(filename
, open_mode
)
105 return (stream
, filename
)
106 except (IOError, OSError), err
:
107 # In case of error, try to remove win32 forbidden chars
108 filename
= re
.sub(ur
'[/<>:"\|\?\*]', u
'#', filename
)
110 # An exception here should be caught in the caller
111 stream
= open(filename
, open_mode
)
112 return (stream
, filename
)
114 class DownloadError(Exception):
115 """Download Error exception.
117 This exception may be thrown by FileDownloader objects if they are not
118 configured to continue on errors. They will contain the appropriate
123 class SameFileError(Exception):
124 """Same File exception.
126 This exception will be thrown by FileDownloader objects if they detect
127 multiple files would have to be downloaded to the same file on disk.
131 class PostProcessingError(Exception):
132 """Post Processing exception.
134 This exception may be raised by PostProcessor's .run() method to
135 indicate an error in the postprocessing task.
139 class UnavailableVideoError(Exception):
140 """Unavailable Format exception.
142 This exception will be thrown when a video is requested
143 in a format that is not available for that video.
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.

    Attributes:
        downloaded: Number of bytes actually received.
        expected: Number of bytes the server announced.
    """

    # Class-level defaults so both attributes always exist.
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Give the base class a message so that str(err) and tracebacks
        # are informative instead of empty.
        Exception.__init__(
            self,
            'content too short (expected %s bytes and served %s)'
            % (expected, downloaded))
        self.downloaded = downloaded
        self.expected = expected
162 class FileDownloader(object):
163 """File Downloader class.
165 File downloader objects are the ones responsible for downloading the
166 actual video file and writing it to disk if the user has requested
167 it, among some other tasks. In most cases there should be one per
168 program. As, given a video URL, the downloader doesn't know how to
169 extract all the needed information, task that InfoExtractors do, it
170 has to pass the URL to one of them.
172 For this, file downloader objects have a method that allows
173 InfoExtractors to be registered in a given order. When it is passed
174 a URL, the file downloader hands it to the first InfoExtractor it
175 finds that reports being able to handle it. The InfoExtractor extracts
176 all the information about the video or videos the URL refers to, and
177 asks the FileDownloader to process the video information, possibly
178 downloading the video.
180 File downloaders accept a lot of parameters. In order not to saturate
181 the object constructor with arguments, it receives a dictionary of
182 options instead. These options are available through the params
183 attribute for the InfoExtractors to use. The FileDownloader also
184 registers itself as the downloader in charge for the InfoExtractors
185 that are added to it, so this is a "mutual registration".
189 username: Username for authentication purposes.
190 password: Password for authentication purposes.
191 usenetrc: Use netrc for authentication instead.
192 quiet: Do not print messages to stdout.
193 forceurl: Force printing final URL.
194 forcetitle: Force printing title.
195 forcethumbnail: Force printing thumbnail URL.
196 forcedescription: Force printing description.
197 simulate: Do not download the video files.
198 format: Video format code.
199 format_limit: Highest quality format to try.
200 outtmpl: Template for output names.
201 ignoreerrors: Do not stop on download errors.
202 ratelimit: Download speed limit, in bytes/sec.
203 nooverwrites: Prevent overwriting files.
204 retries: Number of times to retry for HTTP error 5xx
205 continuedl: Try to continue downloads if possible.
206 noprogress: Do not print the progress bar.
207 playliststart: Playlist item to start at.
208 playlistend: Playlist item to end at.
209 logtostderr: Log messages to stderr instead of stdout.
215 _download_retcode
= None
216 _num_downloads
= None
219 def __init__(self
, params
):
220 """Create a FileDownloader object with the given options."""
223 self
._download
_retcode
= 0
224 self
._num
_downloads
= 0
225 self
._screen
_file
= [sys
.stdout
, sys
.stderr
][params
.get('logtostderr', False)]
229 def pmkdir(filename
):
230 """Create directory components in filename. Similar to Unix "mkdir -p"."""
231 components
= filename
.split(os
.sep
)
232 aggregate
= [os
.sep
.join(components
[0:x
]) for x
in xrange(1, len(components
))]
233 aggregate
= ['%s%s' % (x
, os
.sep
) for x
in aggregate
] # Finish names with separator
234 for dir in aggregate
:
235 if not os
.path
.exists(dir):
def temp_name(filename):
    """Returns a temporary filename for the given filename.

    The name is returned unchanged when it is u'-' or when it refers to an
    existing path that is not a regular file; otherwise u'.part' is
    appended to it.
    """
    # NOTE(review): callers invoke this as self.temp_name(filename), so the
    # definition is presumably preceded by a @staticmethod decorator that
    # is not visible here - confirm.
    if filename == u'-' or (os.path.exists(filename) and not os.path.isfile(filename)):
        return filename
    return filename + u'.part'
246 def format_bytes(bytes):
249 if type(bytes) is str:
254 exponent
= long(math
.log(bytes, 1024.0))
255 suffix
= 'bkMGTPEZY'[exponent
]
256 converted
= float(bytes) / float(1024**exponent
)
257 return '%.2f%s' % (converted
, suffix
)
260 def calc_percent(byte_counter
, data_len
):
263 return '%6s' % ('%3.1f%%' % (float(byte_counter
) / float(data_len
) * 100.0))
266 def calc_eta(start
, now
, total
, current
):
270 if current
== 0 or dif
< 0.001: # One millisecond
272 rate
= float(current
) / dif
273 eta
= long((float(total
) - float(current
)) / rate
)
274 (eta_mins
, eta_secs
) = divmod(eta
, 60)
277 return '%02d:%02d' % (eta_mins
, eta_secs
)
def calc_speed(start, now, bytes):
    """Return the average transfer speed as a string at least 10 wide.

    start and now are timestamps in seconds (callers pass time.time()
    values) and bytes is the amount transferred between them. When nothing
    was transferred, or less than one millisecond elapsed, the placeholder
    '---b/s' is returned instead of a figure.
    """
    # NOTE(review): callers invoke this as self.calc_speed(start, now, n),
    # so the definition is presumably preceded by a @staticmethod decorator
    # that is not visible here - confirm.
    dif = now - start
    if bytes == 0 or dif < 0.001:  # One millisecond
        return '%10s' % '---b/s'
    return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
287 def best_block_size(elapsed_time
, bytes):
288 new_min
= max(bytes / 2.0, 1.0)
289 new_max
= min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
290 if elapsed_time
< 0.001:
292 rate
= bytes / elapsed_time
def parse_bytes(bytestr):
    """Parse a string indicating a byte quantity into a long integer.

    Accepts a decimal number, optionally with a fractional part, followed
    by an optional one-letter binary multiplier (k, M, G, T, P, E, Z, Y;
    case-insensitive), e.g. '500', '50k' or '1.5M'.

    Returns the quantity in bytes, or None when bytestr does not have that
    form.
    """
    # NOTE(review): takes no self, so this is presumably a @staticmethod
    # whose decorator line is not visible here - confirm.
    matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
    if matchobj is None:
        return None
    number = float(matchobj.group(1))
    # An empty suffix is found at index 0 of the table, giving 1024 ** 0.
    multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
    return long(round(number * multiplier))
309 def add_info_extractor(self
, ie
):
310 """Add an InfoExtractor object to the end of the list."""
312 ie
.set_downloader(self
)
314 def add_post_processor(self
, pp
):
315 """Add a PostProcessor object to the end of the chain."""
317 pp
.set_downloader(self
)
def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False):
    """Print message to the screen file if not in quiet mode.

    message: Unicode text to print.
    skip_eol: When true, no newline is appended after the message.
    ignore_encoding_errors: When true, a message that cannot be encoded
        with the preferred encoding is silently dropped; otherwise the
        UnicodeEncodeError propagates to the caller.
    """
    if self.params.get('quiet', False):
        return
    if skip_eol:
        terminator = u''
    else:
        terminator = u'\n'
    try:
        output = (u'%s%s' % (message, terminator)).encode(preferredencoding())
    except UnicodeEncodeError:
        if not ignore_encoding_errors:
            raise
        return
    # write() is used rather than a trailing-comma print statement so that
    # no soft-space separator is injected in front of the next message
    # after an unterminated one (e.g. the progress line).
    self._screen_file.write(output)
    self._screen_file.flush()
def to_stderr(self, message):
    """Write message to stderr, encoded with the preferred encoding."""
    encoded = message.encode(preferredencoding())
    print >>sys.stderr, encoded
def fixed_template(self):
    """Tell whether the output template contains no %(name)s field."""
    template = self.params['outtmpl']
    # Same pattern text as before, spelled with escaped backslashes.
    field = re.search(u'(?u)%\\(.+?\\)s', template)
    return field is None
def trouble(self, message=None):
    """Decide what to do when a download problem shows up.

    The message, if any, is first printed to stderr. Unless the
    'ignoreerrors' parameter is set, a DownloadError carrying the message
    is then raised; otherwise the failure is only recorded by setting the
    download return code to 1.
    """
    if message is not None:
        self.to_stderr(message)
    ignore_errors = self.params.get('ignoreerrors', False)
    if not ignore_errors:
        raise DownloadError(message)
    self._download_retcode = 1
def slow_down(self, start_time, byte_counter):
    """Sleep if the download speed is over the rate limit.

    start_time: time.time() value taken when the transfer started.
    byte_counter: Number of bytes transferred since start_time.

    Does nothing when no 'ratelimit' parameter is set, when nothing has
    been transferred yet, or when no time has elapsed.
    """
    rate_limit = self.params.get('ratelimit', None)
    if rate_limit is None or byte_counter == 0:
        return
    now = time.time()
    elapsed = now - start_time
    if elapsed <= 0.0:
        # Avoid dividing by zero (or by a negative span) below.
        return
    speed = float(byte_counter) / elapsed
    if speed > rate_limit:
        # Sleep for the time still missing for the average speed to drop
        # to the limit: byte_counter / rate_limit - elapsed.
        time.sleep((byte_counter - rate_limit * elapsed) / rate_limit)
def try_rename(self, old_filename, new_filename):
    """Rename old_filename to new_filename, reporting failure via trouble().

    Nothing is done when both names are equal. An IOError or OSError from
    the rename is not propagated directly; it is handed to trouble() as an
    error message.
    """
    if old_filename == new_filename:
        return
    try:
        os.rename(old_filename, new_filename)
    except (IOError, OSError):
        self.trouble(u'ERROR: unable to rename file')
def report_destination(self, filename):
    """Announce the file the download is being written to."""
    line = u'[download] Destination: %s' % filename
    self.to_screen(line, ignore_encoding_errors=True)
def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
    """Report download progress.

    Prints a single status line that starts with a carriage return and has
    no trailing newline. Prints nothing when the 'noprogress' parameter is
    set.
    """
    if self.params.get('noprogress', False):
        return
    self.to_screen(u'\r[download] %s of %s at %s ETA %s' %
                   (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
383 def report_resuming_byte(self
, resume_len
):
384 """Report attempt to resume at given byte."""
385 self
.to_screen(u
'[download] Resuming download at byte %s' % resume_len
)
def report_retry(self, count, retries):
    """Announce a new attempt after the server answered with an HTTP error."""
    template = u'[download] Got server HTTP error. Retrying (attempt %d of %d)...'
    self.to_screen(template % (count, retries))
def report_file_already_downloaded(self, file_name):
    """Report file has already been fully downloaded.

    When printing the message that includes file_name fails with a
    UnicodeEncodeError, a generic message without the name is printed
    instead.
    """
    try:
        self.to_screen(u'[download] %s has already been downloaded' % file_name)
    except UnicodeEncodeError:
        self.to_screen(u'[download] The file has already been downloaded')
398 def report_unable_to_resume(self
):
399 """Report it was impossible to resume download."""
400 self
.to_screen(u
'[download] Unable to resume')
402 def report_finish(self
):
403 """Report download finished."""
404 if self
.params
.get('noprogress', False):
405 self
.to_screen(u
'[download] Download completed')
def increment_downloads(self):
    """Advance by one the ordinal used to number downloaded files."""
    self._num_downloads = self._num_downloads + 1
413 def process_info(self
, info_dict
):
414 """Process a single dictionary returned by an InfoExtractor."""
415 # Do nothing else if in simulate mode
416 if self
.params
.get('simulate', False):
418 if self
.params
.get('forcetitle', False):
419 print info_dict
['title'].encode(preferredencoding(), 'xmlcharrefreplace')
420 if self
.params
.get('forceurl', False):
421 print info_dict
['url'].encode(preferredencoding(), 'xmlcharrefreplace')
422 if self
.params
.get('forcethumbnail', False) and 'thumbnail' in info_dict
:
423 print info_dict
['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
424 if self
.params
.get('forcedescription', False) and 'description' in info_dict
:
425 print info_dict
['description'].encode(preferredencoding(), 'xmlcharrefreplace')
430 template_dict
= dict(info_dict
)
431 template_dict
['epoch'] = unicode(long(time
.time()))
432 template_dict
['autonumber'] = unicode('%05d' % self
._num
_downloads
)
433 filename
= self
.params
['outtmpl'] % template_dict
434 except (ValueError, KeyError), err
:
435 self
.trouble(u
'ERROR: invalid system charset or erroneous output template')
437 if self
.params
.get('nooverwrites', False) and os
.path
.exists(filename
):
438 self
.to_stderr(u
'WARNING: file exists and will be skipped')
442 self
.pmkdir(filename
)
443 except (OSError, IOError), err
:
444 self
.trouble(u
'ERROR: unable to create directories: %s' % str(err
))
448 success
= self
._do
_download
(filename
, info_dict
['url'].encode('utf-8'), info_dict
.get('player_url', None))
449 except (OSError, IOError), err
:
450 raise UnavailableVideoError
451 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
452 self
.trouble(u
'ERROR: unable to download video data: %s' % str(err
))
454 except (ContentTooShortError
, ), err
:
455 self
.trouble(u
'ERROR: content too short (expected %s bytes and served %s)' % (err
.expected
, err
.downloaded
))
460 self
.post_process(filename
, info_dict
)
461 except (PostProcessingError
), err
:
462 self
.trouble(u
'ERROR: postprocessing: %s' % str(err
))
465 def download(self
, url_list
):
466 """Download a given list of URLs."""
467 if len(url_list
) > 1 and self
.fixed_template():
468 raise SameFileError(self
.params
['outtmpl'])
471 suitable_found
= False
473 # Go to next InfoExtractor if not suitable
474 if not ie
.suitable(url
):
477 # Suitable InfoExtractor found
478 suitable_found
= True
480 # Extract information from URL and process it
483 # Suitable InfoExtractor had been found; go to next URL
486 if not suitable_found
:
487 self
.trouble(u
'ERROR: no suitable InfoExtractor: %s' % url
)
489 return self
._download
_retcode
491 def post_process(self
, filename
, ie_info
):
492 """Run the postprocessing chain on the given file."""
494 info
['filepath'] = filename
500 def _download_with_rtmpdump(self
, filename
, url
, player_url
):
501 self
.report_destination(filename
)
502 tmpfilename
= self
.temp_name(filename
)
504 # Check for rtmpdump first
506 subprocess
.call(['rtmpdump', '-h'], stdout
=(file(os
.path
.devnull
, 'w')), stderr
=subprocess
.STDOUT
)
507 except (OSError, IOError):
508 self
.trouble(u
'ERROR: RTMP download detected but "rtmpdump" could not be run')
511 # Download using rtmpdump. rtmpdump returns exit code 2 when
512 # the connection was interrupted and resuming appears to be
513 # possible. This is part of rtmpdump's normal usage, AFAIK.
514 basic_args
= ['rtmpdump', '-q'] + [[], ['-W', player_url
]][player_url
is not None] + ['-r', url
, '-o', tmpfilename
]
515 retval
= subprocess
.call(basic_args
+ [[], ['-e', '-k', '1']][self
.params
.get('continuedl', False)])
516 while retval
== 2 or retval
== 1:
517 prevsize
= os
.path
.getsize(tmpfilename
)
518 self
.to_screen(u
'\r[rtmpdump] %s bytes' % prevsize
, skip_eol
=True)
519 time
.sleep(5.0) # This seems to be needed
520 retval
= subprocess
.call(basic_args
+ ['-e'] + [[], ['-k', '1']][retval
== 1])
521 cursize
= os
.path
.getsize(tmpfilename
)
522 if prevsize
== cursize
and retval
== 1:
525 self
.to_screen(u
'\r[rtmpdump] %s bytes' % os
.path
.getsize(tmpfilename
))
526 self
.try_rename(tmpfilename
, filename
)
529 self
.trouble(u
'\nERROR: rtmpdump exited with code %d' % retval
)
532 def _do_download(self
, filename
, url
, player_url
):
533 # Check file already present
534 if self
.params
.get('continuedl', False) and os
.path
.isfile(filename
):
535 self
.report_file_already_downloaded(filename
)
538 # Attempt to download using rtmpdump
539 if url
.startswith('rtmp'):
540 return self
._download
_with
_rtmpdump
(filename
, url
, player_url
)
542 tmpfilename
= self
.temp_name(filename
)
545 basic_request
= urllib2
.Request(url
, None, std_headers
)
546 request
= urllib2
.Request(url
, None, std_headers
)
548 # Establish possible resume length
549 if os
.path
.isfile(tmpfilename
):
550 resume_len
= os
.path
.getsize(tmpfilename
)
554 # Request parameters in case of being able to resume
555 if self
.params
.get('continuedl', False) and resume_len
!= 0:
556 self
.report_resuming_byte(resume_len
)
557 request
.add_header('Range','bytes=%d-' % resume_len
)
561 retries
= self
.params
.get('retries', 0)
562 while count
<= retries
:
563 # Establish connection
565 data
= urllib2
.urlopen(request
)
567 except (urllib2
.HTTPError
, ), err
:
568 if (err
.code
< 500 or err
.code
>= 600) and err
.code
!= 416:
569 # Unexpected HTTP error
571 elif err
.code
== 416:
572 # Unable to resume (requested range not satisfiable)
574 # Open the connection again without the range header
575 data
= urllib2
.urlopen(basic_request
)
576 content_length
= data
.info()['Content-Length']
577 except (urllib2
.HTTPError
, ), err
:
578 if err
.code
< 500 or err
.code
>= 600:
581 # Examine the reported length
582 if (content_length
is not None and
583 (resume_len
- 100 < long(content_length
) < resume_len
+ 100)):
584 # The file had already been fully downloaded.
585 # Explanation to the above condition: in issue #175 it was revealed that
586 # YouTube sometimes adds or removes a few bytes from the end of the file,
587 # changing the file size slightly and causing problems for some users. So
588 # I decided to implement a suggested change and consider the file
589 # completely downloaded if the file size differs less than 100 bytes from
590 # the one in the hard drive.
591 self
.report_file_already_downloaded(filename
)
592 self
.try_rename(tmpfilename
, filename
)
595 # The length does not match, we start the download over
596 self
.report_unable_to_resume()
602 self
.report_retry(count
, retries
)
605 self
.trouble(u
'ERROR: giving up after %s retries' % retries
)
608 data_len
= data
.info().get('Content-length', None)
609 data_len_str
= self
.format_bytes(data_len
)
616 data_block
= data
.read(block_size
)
618 data_block_len
= len(data_block
)
619 if data_block_len
== 0:
621 byte_counter
+= data_block_len
623 # Open file just in time
626 (stream
, tmpfilename
) = sanitize_open(tmpfilename
, open_mode
)
627 self
.report_destination(filename
)
628 except (OSError, IOError), err
:
629 self
.trouble(u
'ERROR: unable to open for writing: %s' % str(err
))
632 stream
.write(data_block
)
633 except (IOError, OSError), err
:
634 self
.trouble(u
'\nERROR: unable to write data: %s' % str(err
))
636 block_size
= self
.best_block_size(after
- before
, data_block_len
)
639 percent_str
= self
.calc_percent(byte_counter
, data_len
)
640 eta_str
= self
.calc_eta(start
, time
.time(), data_len
, byte_counter
)
641 speed_str
= self
.calc_speed(start
, time
.time(), byte_counter
)
642 self
.report_progress(percent_str
, data_len_str
, speed_str
, eta_str
)
645 self
.slow_down(start
, byte_counter
)
649 if data_len
is not None and str(byte_counter
) != data_len
:
650 raise ContentTooShortError(byte_counter
, long(data_len
))
651 self
.try_rename(tmpfilename
, filename
)
654 class InfoExtractor(object):
655 """Information Extractor class.
657 Information extractors are the classes that, given a URL, extract
658 information from the video (or videos) the URL refers to. This
659 information includes the real video URL, the video title and simplified
660 title, author and others. The information is stored in a dictionary
661 which is then passed to the FileDownloader. The FileDownloader
662 processes this information possibly downloading the video to the file
663 system, among other possible outcomes. The dictionaries must include
664 the following fields:
666 id: Video identifier.
667 url: Final video URL.
668 uploader: Nickname of the video uploader.
669 title: Literal title.
670 stitle: Simplified title.
671 ext: Video filename extension.
672 format: Video format.
673 player_url: SWF Player URL (may be None).
675 The following fields are optional. Their primary purpose is to allow
676 youtube-dl to serve as the backend for a video search function, such
677 as the one in youtube2mp3. They are only used when their respective
678 forced printing functions are called:
680 thumbnail: Full URL to a video thumbnail image.
681 description: One-line video description.
683 Subclasses of this one should re-define the _real_initialize() and
684 _real_extract() methods, as well as the suitable() static method.
685 Probably, they should also be instantiated and added to the main
692 def __init__(self
, downloader
=None):
693 """Constructor. Receives an optional downloader."""
695 self
.set_downloader(downloader
)
699 """Receives a URL and returns True if suitable for this IE."""
702 def initialize(self
):
703 """Initializes an instance (authentication, etc)."""
705 self
._real
_initialize
()
708 def extract(self
, url
):
709 """Extracts URL information and returns it in list of dicts."""
711 return self
._real
_extract
(url
)
713 def set_downloader(self
, downloader
):
714 """Sets the downloader for this IE."""
715 self
._downloader
= downloader
717 def _real_initialize(self
):
718 """Real initialization process. Redefine in subclasses."""
721 def _real_extract(self
, url
):
722 """Real extraction process. Redefine in subclasses."""
725 class YoutubeIE(InfoExtractor
):
726 """Information extractor for youtube.com."""
728 _VALID_URL
= r
'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
729 _LANG_URL
= r
'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
730 _LOGIN_URL
= 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
731 _AGE_URL
= 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
732 _NETRC_MACHINE
= 'youtube'
733 # Listed in order of quality
734 _available_formats
= ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
735 _video_extensions
= {
741 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
748 return (re
.match(YoutubeIE
._VALID
_URL
, url
) is not None)
750 def report_lang(self
):
751 """Report attempt to set language."""
752 self
._downloader
.to_screen(u
'[youtube] Setting language')
754 def report_login(self
):
755 """Report attempt to log in."""
756 self
._downloader
.to_screen(u
'[youtube] Logging in')
758 def report_age_confirmation(self
):
759 """Report attempt to confirm age."""
760 self
._downloader
.to_screen(u
'[youtube] Confirming age')
762 def report_video_webpage_download(self
, video_id
):
763 """Report attempt to download video webpage."""
764 self
._downloader
.to_screen(u
'[youtube] %s: Downloading video webpage' % video_id
)
766 def report_video_info_webpage_download(self
, video_id
):
767 """Report attempt to download video info webpage."""
768 self
._downloader
.to_screen(u
'[youtube] %s: Downloading video info webpage' % video_id
)
770 def report_information_extraction(self
, video_id
):
771 """Report attempt to extract video information."""
772 self
._downloader
.to_screen(u
'[youtube] %s: Extracting video information' % video_id
)
774 def report_unavailable_format(self
, video_id
, format
):
775 """Report extracted video URL."""
776 self
._downloader
.to_screen(u
'[youtube] %s: Format %s not available' % (video_id
, format
))
778 def report_rtmp_download(self
):
779 """Indicate the download will use the RTMP protocol."""
780 self
._downloader
.to_screen(u
'[youtube] RTMP download detected')
782 def _real_initialize(self
):
783 if self
._downloader
is None:
788 downloader_params
= self
._downloader
.params
790 # Attempt to use provided username and password or .netrc data
791 if downloader_params
.get('username', None) is not None:
792 username
= downloader_params
['username']
793 password
= downloader_params
['password']
794 elif downloader_params
.get('usenetrc', False):
796 info
= netrc
.netrc().authenticators(self
._NETRC
_MACHINE
)
801 raise netrc
.NetrcParseError('No authenticators for %s' % self
._NETRC
_MACHINE
)
802 except (IOError, netrc
.NetrcParseError
), err
:
803 self
._downloader
.to_stderr(u
'WARNING: parsing .netrc: %s' % str(err
))
807 request
= urllib2
.Request(self
._LANG
_URL
, None, std_headers
)
810 urllib2
.urlopen(request
).read()
811 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
812 self
._downloader
.to_stderr(u
'WARNING: unable to set language: %s' % str(err
))
815 # No authentication to be performed
821 'current_form': 'loginForm',
823 'action_login': 'Log In',
824 'username': username
,
825 'password': password
,
827 request
= urllib2
.Request(self
._LOGIN
_URL
, urllib
.urlencode(login_form
), std_headers
)
830 login_results
= urllib2
.urlopen(request
).read()
831 if re
.search(r
'(?i)<form[^>]* name="loginForm"', login_results
) is not None:
832 self
._downloader
.to_stderr(u
'WARNING: unable to log in: bad username or password')
834 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
835 self
._downloader
.to_stderr(u
'WARNING: unable to log in: %s' % str(err
))
841 'action_confirm': 'Confirm',
843 request
= urllib2
.Request(self
._AGE
_URL
, urllib
.urlencode(age_form
), std_headers
)
845 self
.report_age_confirmation()
846 age_results
= urllib2
.urlopen(request
).read()
847 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
848 self
._downloader
.trouble(u
'ERROR: unable to confirm age: %s' % str(err
))
851 def _real_extract(self
, url
):
852 # Extract video id from URL
853 mobj
= re
.match(self
._VALID
_URL
, url
)
855 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
857 video_id
= mobj
.group(2)
860 self
.report_video_webpage_download(video_id
)
861 request
= urllib2
.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
, None, std_headers
)
863 video_webpage
= urllib2
.urlopen(request
).read()
864 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
865 self
._downloader
.trouble(u
'ERROR: unable to download video webpage: %s' % str(err
))
868 # Attempt to extract SWF player URL
869 mobj
= re
.search(r
'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage
)
871 player_url
= re
.sub(r
'\\(.)', r
'\1', mobj
.group(1))
876 self
.report_video_info_webpage_download(video_id
)
877 for el_type
in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
878 video_info_url
= ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
879 % (video_id
, el_type
))
880 request
= urllib2
.Request(video_info_url
, None, std_headers
)
882 video_info_webpage
= urllib2
.urlopen(request
).read()
883 video_info
= parse_qs(video_info_webpage
)
884 if 'token' in video_info
:
886 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
887 self
._downloader
.trouble(u
'ERROR: unable to download video info webpage: %s' % str(err
))
889 if 'token' not in video_info
:
890 if 'reason' in video_info
:
891 self
._downloader
.trouble(u
'ERROR: YouTube said: %s' % video_info
['reason'][0].decode('utf-8'))
893 self
._downloader
.trouble(u
'ERROR: "token" parameter not in video info for unknown reason')
896 # Start extracting information
897 self
.report_information_extraction(video_id
)
900 if 'author' not in video_info
:
901 self
._downloader
.trouble(u
'ERROR: unable to extract uploader nickname')
903 video_uploader
= urllib
.unquote_plus(video_info
['author'][0])
906 if 'title' not in video_info
:
907 self
._downloader
.trouble(u
'ERROR: unable to extract video title')
909 video_title
= urllib
.unquote_plus(video_info
['title'][0])
910 video_title
= video_title
.decode('utf-8')
911 video_title
= sanitize_title(video_title
)
914 simple_title
= re
.sub(ur
'(?u)([^%s]+)' % simple_title_chars
, ur
'_', video_title
)
915 simple_title
= simple_title
.strip(ur
'_')
918 if 'thumbnail_url' not in video_info
:
919 self
._downloader
.trouble(u
'WARNING: unable to extract video thumbnail')
921 else: # don't panic if we can't find it
922 video_thumbnail
= urllib
.unquote_plus(video_info
['thumbnail_url'][0])
926 mobj
= re
.search(r
'id="eow-date".*?>(.*?)</span>', video_webpage
, re
.DOTALL
)
928 upload_date
= ' '.join(re
.sub(r
'[/,-]', r
' ', mobj
.group(1)).split())
929 format_expressions
= ['%d %B %Y', '%B %d %Y']
930 for expression
in format_expressions
:
932 upload_date
= datetime
.datetime
.strptime(upload_date
, expression
).strftime('%Y%m%d')
937 video_description
= 'No description available.'
938 if self
._downloader
.params
.get('forcedescription', False):
939 mobj
= re
.search(r
'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage
)
941 video_description
= mobj
.group(1)
944 video_token
= urllib
.unquote_plus(video_info
['token'][0])
946 # Decide which formats to download
947 req_format
= self
._downloader
.params
.get('format', None)
948 get_video_template
= 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=&ps=&asv=&fmt=%%s' % (video_id
, video_token
)
950 if 'fmt_url_map' in video_info
:
951 url_map
= dict(tuple(pair
.split('|')) for pair
in video_info
['fmt_url_map'][0].split(','))
952 format_limit
= self
._downloader
.params
.get('format_limit', None)
953 if format_limit
is not None and format_limit
in self
._available
_formats
:
954 format_list
= self
._available
_formats
[self
._available
_formats
.index(format_limit
):]
956 format_list
= self
._available
_formats
957 existing_formats
= [x
for x
in format_list
if x
in url_map
]
958 if len(existing_formats
) == 0:
959 self
._downloader
.trouble(u
'ERROR: no known formats available for video')
961 if req_format
is None:
962 video_url_list
= [(existing_formats
[0], url_map
[existing_formats
[0]])] # Best quality
963 elif req_format
== '-1':
964 video_url_list
= [(f
, url_map
[f
]) for f
in existing_formats
] # All formats
966 if req_format
in url_map
:
967 video_url_list
= [(req_format
, url_map
[req_format
])] # Specific format
969 video_url_list
= [(req_format
, get_video_template
% req_format
)] # Specific format
971 elif 'conn' in video_info
and video_info
['conn'][0].startswith('rtmp'):
972 self
.report_rtmp_download()
973 video_url_list
= [(None, video_info
['conn'][0])]
976 self
._downloader
.trouble(u
'ERROR: no fmt_url_map or conn information found in video info')
979 for format_param
, video_real_url
in video_url_list
:
980 # At this point we have a new video
981 self
._downloader
.increment_downloads()
984 video_extension
= self
._video
_extensions
.get(format_param
, 'flv')
986 # Find the video URL in fmt_url_map or conn parameters
988 # Process video information
989 self
._downloader
.process_info({
990 'id': video_id
.decode('utf-8'),
991 'url': video_real_url
.decode('utf-8'),
992 'uploader': video_uploader
.decode('utf-8'),
993 'upload_date': upload_date
,
994 'title': video_title
,
995 'stitle': simple_title
,
996 'ext': video_extension
.decode('utf-8'),
997 'format': (format_param
is None and u
'NA' or format_param
.decode('utf-8')),
998 'thumbnail': video_thumbnail
.decode('utf-8'),
999 'description': video_description
.decode('utf-8'),
1000 'player_url': player_url
,
1002 except UnavailableVideoError
, err
:
1003 self
._downloader
.trouble(u
'ERROR: unable to download video (format may not be available)')
1006 class MetacafeIE(InfoExtractor
):
1007 """Information Extractor for metacafe.com."""
1009 _VALID_URL
= r
'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1010 _DISCLAIMER
= 'http://www.metacafe.com/family_filter/'
1011 _FILTER_POST
= 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
1014 def __init__(self
, youtube_ie
, downloader
=None):
1015 InfoExtractor
.__init
__(self
, downloader
)
1016 self
._youtube
_ie
= youtube_ie
1020 return (re
.match(MetacafeIE
._VALID
_URL
, url
) is not None)
1022 def report_disclaimer(self
):
1023 """Report disclaimer retrieval."""
1024 self
._downloader
.to_screen(u
'[metacafe] Retrieving disclaimer')
1026 def report_age_confirmation(self
):
1027 """Report attempt to confirm age."""
1028 self
._downloader
.to_screen(u
'[metacafe] Confirming age')
1030 def report_download_webpage(self
, video_id
):
1031 """Report webpage download."""
1032 self
._downloader
.to_screen(u
'[metacafe] %s: Downloading webpage' % video_id
)
1034 def report_extraction(self
, video_id
):
1035 """Report information extraction."""
1036 self
._downloader
.to_screen(u
'[metacafe] %s: Extracting information' % video_id
)
1038 def _real_initialize(self
):
1039 # Retrieve disclaimer
1040 request
= urllib2
.Request(self
._DISCLAIMER
, None, std_headers
)
1042 self
.report_disclaimer()
1043 disclaimer
= urllib2
.urlopen(request
).read()
1044 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1045 self
._downloader
.trouble(u
'ERROR: unable to retrieve disclaimer: %s' % str(err
))
1051 'submit': "Continue - I'm over 18",
1053 request
= urllib2
.Request(self
._FILTER
_POST
, urllib
.urlencode(disclaimer_form
), std_headers
)
1055 self
.report_age_confirmation()
1056 disclaimer
= urllib2
.urlopen(request
).read()
1057 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1058 self
._downloader
.trouble(u
'ERROR: unable to confirm age: %s' % str(err
))
1061 def _real_extract(self
, url
):
1062 # Extract id and simplified title from URL
1063 mobj
= re
.match(self
._VALID
_URL
, url
)
1065 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
1068 video_id
= mobj
.group(1)
1070 # Check if video comes from YouTube
1071 mobj2
= re
.match(r
'^yt-(.*)$', video_id
)
1072 if mobj2
is not None:
1073 self
._youtube
_ie
.extract('http://www.youtube.com/watch?v=%s' % mobj2
.group(1))
1076 # At this point we have a new video
1077 self
._downloader
.increment_downloads()
1079 simple_title
= mobj
.group(2).decode('utf-8')
1081 # Retrieve video webpage to extract further information
1082 request
= urllib2
.Request('http://www.metacafe.com/watch/%s/' % video_id
)
1084 self
.report_download_webpage(video_id
)
1085 webpage
= urllib2
.urlopen(request
).read()
1086 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1087 self
._downloader
.trouble(u
'ERROR: unable retrieve video webpage: %s' % str(err
))
1090 # Extract URL, uploader and title from webpage
1091 self
.report_extraction(video_id
)
1092 mobj
= re
.search(r
'(?m)&mediaURL=([^&]+)', webpage
)
1093 if mobj
is not None:
1094 mediaURL
= urllib
.unquote(mobj
.group(1))
1095 video_extension
= mediaURL
[-3:]
1097 # Extract gdaKey if available
1098 mobj
= re
.search(r
'(?m)&gdaKey=(.*?)&', webpage
)
1100 video_url
= mediaURL
1102 gdaKey
= mobj
.group(1)
1103 video_url
= '%s?__gda__=%s' % (mediaURL
, gdaKey
)
1105 mobj
= re
.search(r
' name="flashvars" value="(.*?)"', webpage
)
1107 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
1109 vardict
= parse_qs(mobj
.group(1))
1110 if 'mediaData' not in vardict
:
1111 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
1113 mobj
= re
.search(r
'"mediaURL":"(http.*?)","key":"(.*?)"', vardict
['mediaData'][0])
1115 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
1117 mediaURL
= mobj
.group(1).replace('\\/', '/')
1118 video_extension
= mediaURL
[-3:]
1119 video_url
= '%s?__gda__=%s' % (mediaURL
, mobj
.group(2))
1121 mobj
= re
.search(r
'(?im)<title>(.*) - Video</title>', webpage
)
1123 self
._downloader
.trouble(u
'ERROR: unable to extract title')
1125 video_title
= mobj
.group(1).decode('utf-8')
1126 video_title
= sanitize_title(video_title
)
1128 mobj
= re
.search(r
'(?ms)By:\s*<a .*?>(.+?)<', webpage
)
1130 self
._downloader
.trouble(u
'ERROR: unable to extract uploader nickname')
1132 video_uploader
= mobj
.group(1)
1135 # Process video information
1136 self
._downloader
.process_info({
1137 'id': video_id
.decode('utf-8'),
1138 'url': video_url
.decode('utf-8'),
1139 'uploader': video_uploader
.decode('utf-8'),
1140 'upload_date': u
'NA',
1141 'title': video_title
,
1142 'stitle': simple_title
,
1143 'ext': video_extension
.decode('utf-8'),
1147 except UnavailableVideoError
:
1148 self
._downloader
.trouble(u
'ERROR: unable to download video')
1151 class DailymotionIE(InfoExtractor
):
1152 """Information Extractor for Dailymotion"""
1154 _VALID_URL
= r
'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1156 def __init__(self
, downloader
=None):
1157 InfoExtractor
.__init
__(self
, downloader
)
1161 return (re
.match(DailymotionIE
._VALID
_URL
, url
) is not None)
1163 def report_download_webpage(self
, video_id
):
1164 """Report webpage download."""
1165 self
._downloader
.to_screen(u
'[dailymotion] %s: Downloading webpage' % video_id
)
1167 def report_extraction(self
, video_id
):
1168 """Report information extraction."""
1169 self
._downloader
.to_screen(u
'[dailymotion] %s: Extracting information' % video_id
)
1171 def _real_initialize(self
):
1174 def _real_extract(self
, url
):
1175 # Extract id and simplified title from URL
1176 mobj
= re
.match(self
._VALID
_URL
, url
)
1178 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
1181 # At this point we have a new video
1182 self
._downloader
.increment_downloads()
1183 video_id
= mobj
.group(1)
1185 simple_title
= mobj
.group(2).decode('utf-8')
1186 video_extension
= 'flv'
1188 # Retrieve video webpage to extract further information
1189 request
= urllib2
.Request(url
)
1191 self
.report_download_webpage(video_id
)
1192 webpage
= urllib2
.urlopen(request
).read()
1193 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1194 self
._downloader
.trouble(u
'ERROR: unable retrieve video webpage: %s' % str(err
))
1197 # Extract URL, uploader and title from webpage
1198 self
.report_extraction(video_id
)
1199 mobj
= re
.search(r
'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage
)
1201 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
1203 mediaURL
= urllib
.unquote(mobj
.group(1))
1205 # if needed add http://www.dailymotion.com/ if relative URL
1207 video_url
= mediaURL
1209 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1210 mobj
= re
.search(r
'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage
)
1212 self
._downloader
.trouble(u
'ERROR: unable to extract title')
1214 video_title
= mobj
.group(1).decode('utf-8')
1215 video_title
= sanitize_title(video_title
)
1217 mobj
= re
.search(r
'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a>', webpage
)
1219 self
._downloader
.trouble(u
'ERROR: unable to extract uploader nickname')
1221 video_uploader
= mobj
.group(1)
1224 # Process video information
1225 self
._downloader
.process_info({
1226 'id': video_id
.decode('utf-8'),
1227 'url': video_url
.decode('utf-8'),
1228 'uploader': video_uploader
.decode('utf-8'),
1229 'upload_date': u
'NA',
1230 'title': video_title
,
1231 'stitle': simple_title
,
1232 'ext': video_extension
.decode('utf-8'),
1236 except UnavailableVideoError
:
1237 self
._downloader
.trouble(u
'ERROR: unable to download video')
1239 class GoogleIE(InfoExtractor
):
1240 """Information extractor for video.google.com."""
1242 _VALID_URL
= r
'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1244 def __init__(self
, downloader
=None):
1245 InfoExtractor
.__init
__(self
, downloader
)
1249 return (re
.match(GoogleIE
._VALID
_URL
, url
) is not None)
1251 def report_download_webpage(self
, video_id
):
1252 """Report webpage download."""
1253 self
._downloader
.to_screen(u
'[video.google] %s: Downloading webpage' % video_id
)
1255 def report_extraction(self
, video_id
):
1256 """Report information extraction."""
1257 self
._downloader
.to_screen(u
'[video.google] %s: Extracting information' % video_id
)
1259 def _real_initialize(self
):
1262 def _real_extract(self
, url
):
1263 # Extract id from URL
1264 mobj
= re
.match(self
._VALID
_URL
, url
)
1266 self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
)
1269 # At this point we have a new video
1270 self
._downloader
.increment_downloads()
1271 video_id
= mobj
.group(1)
1273 video_extension
= 'mp4'
1275 # Retrieve video webpage to extract further information
1276 request
= urllib2
.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id
)
1278 self
.report_download_webpage(video_id
)
1279 webpage
= urllib2
.urlopen(request
).read()
1280 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1281 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1284 # Extract URL, uploader, and title from webpage
1285 self
.report_extraction(video_id
)
1286 mobj
= re
.search(r
"download_url:'([^']+)'", webpage
)
1288 video_extension
= 'flv'
1289 mobj
= re
.search(r
"(?i)videoUrl\\x3d(.+?)\\x26", webpage
)
1291 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
1293 mediaURL
= urllib
.unquote(mobj
.group(1))
1294 mediaURL
= mediaURL
.replace('\\x3d', '\x3d')
1295 mediaURL
= mediaURL
.replace('\\x26', '\x26')
1297 video_url
= mediaURL
1299 mobj
= re
.search(r
'<title>(.*)</title>', webpage
)
1301 self
._downloader
.trouble(u
'ERROR: unable to extract title')
1303 video_title
= mobj
.group(1).decode('utf-8')
1304 video_title
= sanitize_title(video_title
)
1305 simple_title
= re
.sub(ur
'(?u)([^%s]+)' % simple_title_chars
, ur
'_', video_title
)
1307 # Extract video description
1308 mobj
= re
.search(r
'<span id=short-desc-content>([^<]*)</span>', webpage
)
1310 self
._downloader
.trouble(u
'ERROR: unable to extract video description')
1312 video_description
= mobj
.group(1).decode('utf-8')
1313 if not video_description
:
1314 video_description
= 'No description available.'
1316 # Extract video thumbnail
1317 if self
._downloader
.params
.get('forcethumbnail', False):
1318 request
= urllib2
.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id
)))
1320 webpage
= urllib2
.urlopen(request
).read()
1321 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1322 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1324 mobj
= re
.search(r
'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage
)
1326 self
._downloader
.trouble(u
'ERROR: unable to extract video thumbnail')
1328 video_thumbnail
= mobj
.group(1)
1329 else: # we need something to pass to process_info
1330 video_thumbnail
= ''
1334 # Process video information
1335 self
._downloader
.process_info({
1336 'id': video_id
.decode('utf-8'),
1337 'url': video_url
.decode('utf-8'),
1339 'upload_date': u
'NA',
1340 'title': video_title
,
1341 'stitle': simple_title
,
1342 'ext': video_extension
.decode('utf-8'),
1346 except UnavailableVideoError
:
1347 self
._downloader
.trouble(u
'ERROR: unable to download video')
1350 class PhotobucketIE(InfoExtractor
):
1351 """Information extractor for photobucket.com."""
1353 _VALID_URL
= r
'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1355 def __init__(self
, downloader
=None):
1356 InfoExtractor
.__init
__(self
, downloader
)
1360 return (re
.match(PhotobucketIE
._VALID
_URL
, url
) is not None)
1362 def report_download_webpage(self
, video_id
):
1363 """Report webpage download."""
1364 self
._downloader
.to_screen(u
'[photobucket] %s: Downloading webpage' % video_id
)
1366 def report_extraction(self
, video_id
):
1367 """Report information extraction."""
1368 self
._downloader
.to_screen(u
'[photobucket] %s: Extracting information' % video_id
)
1370 def _real_initialize(self
):
1373 def _real_extract(self
, url
):
1374 # Extract id from URL
1375 mobj
= re
.match(self
._VALID
_URL
, url
)
1377 self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
)
1380 # At this point we have a new video
1381 self
._downloader
.increment_downloads()
1382 video_id
= mobj
.group(1)
1384 video_extension
= 'flv'
1386 # Retrieve video webpage to extract further information
1387 request
= urllib2
.Request(url
)
1389 self
.report_download_webpage(video_id
)
1390 webpage
= urllib2
.urlopen(request
).read()
1391 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1392 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1395 # Extract URL, uploader, and title from webpage
1396 self
.report_extraction(video_id
)
1397 mobj
= re
.search(r
'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage
)
1399 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
1401 mediaURL
= urllib
.unquote(mobj
.group(1))
1403 video_url
= mediaURL
1405 mobj
= re
.search(r
'<title>(.*) video by (.*) - Photobucket</title>', webpage
)
1407 self
._downloader
.trouble(u
'ERROR: unable to extract title')
1409 video_title
= mobj
.group(1).decode('utf-8')
1410 video_title
= sanitize_title(video_title
)
1411 simple_title
= re
.sub(ur
'(?u)([^%s]+)' % simple_title_chars
, ur
'_', video_title
)
1413 video_uploader
= mobj
.group(2).decode('utf-8')
1416 # Process video information
1417 self
._downloader
.process_info({
1418 'id': video_id
.decode('utf-8'),
1419 'url': video_url
.decode('utf-8'),
1420 'uploader': video_uploader
,
1421 'upload_date': u
'NA',
1422 'title': video_title
,
1423 'stitle': simple_title
,
1424 'ext': video_extension
.decode('utf-8'),
1428 except UnavailableVideoError
:
1429 self
._downloader
.trouble(u
'ERROR: unable to download video')
1432 class YahooIE(InfoExtractor
):
1433 """Information extractor for video.yahoo.com."""
1435 # _VALID_URL matches all Yahoo! Video URLs
1436 # _VPAGE_URL matches only the extractable '/watch/' URLs
1437 _VALID_URL
= r
'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1438 _VPAGE_URL
= r
'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1440 def __init__(self
, downloader
=None):
1441 InfoExtractor
.__init
__(self
, downloader
)
1445 return (re
.match(YahooIE
._VALID
_URL
, url
) is not None)
1447 def report_download_webpage(self
, video_id
):
1448 """Report webpage download."""
1449 self
._downloader
.to_screen(u
'[video.yahoo] %s: Downloading webpage' % video_id
)
1451 def report_extraction(self
, video_id
):
1452 """Report information extraction."""
1453 self
._downloader
.to_screen(u
'[video.yahoo] %s: Extracting information' % video_id
)
1455 def _real_initialize(self
):
1458 def _real_extract(self
, url
, new_video
=True):
1459 # Extract ID from URL
1460 mobj
= re
.match(self
._VALID
_URL
, url
)
1462 self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
)
1465 # At this point we have a new video
1466 self
._downloader
.increment_downloads()
1467 video_id
= mobj
.group(2)
1468 video_extension
= 'flv'
1470 # Rewrite valid but non-extractable URLs as
1471 # extractable English language /watch/ URLs
1472 if re
.match(self
._VPAGE
_URL
, url
) is None:
1473 request
= urllib2
.Request(url
)
1475 webpage
= urllib2
.urlopen(request
).read()
1476 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1477 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1480 mobj
= re
.search(r
'\("id", "([0-9]+)"\);', webpage
)
1482 self
._downloader
.trouble(u
'ERROR: Unable to extract id field')
1484 yahoo_id
= mobj
.group(1)
1486 mobj
= re
.search(r
'\("vid", "([0-9]+)"\);', webpage
)
1488 self
._downloader
.trouble(u
'ERROR: Unable to extract vid field')
1490 yahoo_vid
= mobj
.group(1)
1492 url
= 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid
, yahoo_id
)
1493 return self
._real
_extract
(url
, new_video
=False)
1495 # Retrieve video webpage to extract further information
1496 request
= urllib2
.Request(url
)
1498 self
.report_download_webpage(video_id
)
1499 webpage
= urllib2
.urlopen(request
).read()
1500 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1501 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1504 # Extract uploader and title from webpage
1505 self
.report_extraction(video_id
)
1506 mobj
= re
.search(r
'<meta name="title" content="(.*)" />', webpage
)
1508 self
._downloader
.trouble(u
'ERROR: unable to extract video title')
1510 video_title
= mobj
.group(1).decode('utf-8')
1511 simple_title
= re
.sub(ur
'(?u)([^%s]+)' % simple_title_chars
, ur
'_', video_title
)
1513 mobj
= re
.search(r
'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage
)
1515 self
._downloader
.trouble(u
'ERROR: unable to extract video uploader')
1517 video_uploader
= mobj
.group(1).decode('utf-8')
1519 # Extract video thumbnail
1520 mobj
= re
.search(r
'<link rel="image_src" href="(.*)" />', webpage
)
1522 self
._downloader
.trouble(u
'ERROR: unable to extract video thumbnail')
1524 video_thumbnail
= mobj
.group(1).decode('utf-8')
1526 # Extract video description
1527 mobj
= re
.search(r
'<meta name="description" content="(.*)" />', webpage
)
1529 self
._downloader
.trouble(u
'ERROR: unable to extract video description')
1531 video_description
= mobj
.group(1).decode('utf-8')
1532 if not video_description
: video_description
= 'No description available.'
1534 # Extract video height and width
1535 mobj
= re
.search(r
'<meta name="video_height" content="([0-9]+)" />', webpage
)
1537 self
._downloader
.trouble(u
'ERROR: unable to extract video height')
1539 yv_video_height
= mobj
.group(1)
1541 mobj
= re
.search(r
'<meta name="video_width" content="([0-9]+)" />', webpage
)
1543 self
._downloader
.trouble(u
'ERROR: unable to extract video width')
1545 yv_video_width
= mobj
.group(1)
1547 # Retrieve video playlist to extract media URL
1548 # I'm not completely sure what all these options are, but we
1549 # seem to need most of them, otherwise the server sends a 401.
1550 yv_lg
= 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1551 yv_bitrate
= '700' # according to Wikipedia this is hard-coded
1552 request
= urllib2
.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id
+
1553 '&tech=flash&mode=playlist&lg=' + yv_lg
+ '&bitrate=' + yv_bitrate
+ '&vidH=' + yv_video_height
+
1554 '&vidW=' + yv_video_width
+ '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1556 self
.report_download_webpage(video_id
)
1557 webpage
= urllib2
.urlopen(request
).read()
1558 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1559 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1562 # Extract media URL from playlist XML
1563 mobj
= re
.search(r
'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage
)
1565 self
._downloader
.trouble(u
'ERROR: Unable to extract media URL')
1567 video_url
= urllib
.unquote(mobj
.group(1) + mobj
.group(2)).decode('utf-8')
1568 video_url
= re
.sub(r
'(?u)&(.+?);', htmlentity_transform
, video_url
)
1571 # Process video information
1572 self
._downloader
.process_info({
1573 'id': video_id
.decode('utf-8'),
1575 'uploader': video_uploader
,
1576 'upload_date': u
'NA',
1577 'title': video_title
,
1578 'stitle': simple_title
,
1579 'ext': video_extension
.decode('utf-8'),
1580 'thumbnail': video_thumbnail
.decode('utf-8'),
1581 'description': video_description
,
1582 'thumbnail': video_thumbnail
,
1583 'description': video_description
,
1586 except UnavailableVideoError
:
1587 self
._downloader
.trouble(u
'ERROR: unable to download video')
1590 class GenericIE(InfoExtractor
):
1591 """Generic last-resort information extractor."""
1593 def __init__(self
, downloader
=None):
1594 InfoExtractor
.__init
__(self
, downloader
)
1600 def report_download_webpage(self
, video_id
):
1601 """Report webpage download."""
1602 self
._downloader
.to_screen(u
'WARNING: Falling back on generic information extractor.')
1603 self
._downloader
.to_screen(u
'[generic] %s: Downloading webpage' % video_id
)
1605 def report_extraction(self
, video_id
):
1606 """Report information extraction."""
1607 self
._downloader
.to_screen(u
'[generic] %s: Extracting information' % video_id
)
1609 def _real_initialize(self
):
1612 def _real_extract(self
, url
):
1613 # At this point we have a new video
1614 self
._downloader
.increment_downloads()
1616 video_id
= url
.split('/')[-1]
1617 request
= urllib2
.Request(url
)
1619 self
.report_download_webpage(video_id
)
1620 webpage
= urllib2
.urlopen(request
).read()
1621 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1622 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1624 except ValueError, err
:
1625 # since this is the last-resort InfoExtractor, if
1626 # this error is thrown, it'll be thrown here
1627 self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
)
1630 self
.report_extraction(video_id
)
1631 # Start with something easy: JW Player in SWFObject
1632 mobj
= re
.search(r
'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1634 # Broaden the search a little bit
1635 mobj = re.search(r'[^A
-Za
-z0
-9]?
(?
:file|source
)=(http
[^
\'"&]*)', webpage)
1637 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1640 # It's possible that one of the regexes
1641 # matched, but returned an empty group:
1642 if mobj.group(1) is None:
1643 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1646 video_url = urllib.unquote(mobj.group(1))
1647 video_id = os.path.basename(video_url)
1649 # here's a fun little line of code for you:
1650 video_extension = os.path.splitext(video_id)[1][1:]
1651 video_id = os.path.splitext(video_id)[0]
1653 # it's tempting to parse this further, but you would
1654 # have to take into account all the variations like
1655 # Video Title - Site Name
1656 # Site Name | Video Title
1657 # Video Title - Tagline | Site Name
1658 # and so on and so forth; it's just not practical
1659 mobj = re.search(r'<title>(.*)</title>', webpage)
1661 self._downloader.trouble(u'ERROR: unable to extract title')
1663 video_title = mobj.group(1).decode('utf-8')
1664 video_title = sanitize_title(video_title)
1665 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1667 # video uploader is domain name
1668 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1670 self._downloader.trouble(u'ERROR: unable to extract title')
1672 video_uploader = mobj.group(1).decode('utf-8')
1675 # Process video information
1676 self._downloader.process_info({
1677 'id': video_id.decode('utf-8'),
1678 'url': video_url.decode('utf-8'),
1679 'uploader': video_uploader,
1680 'upload_date': u'NA',
1681 'title': video_title,
1682 'stitle': simple_title,
1683 'ext': video_extension.decode('utf-8'),
1687 except UnavailableVideoError, err:
1688 self._downloader.trouble(u'ERROR: unable to download video')
1691 class YoutubeSearchIE(InfoExtractor):
1692 """Information Extractor for YouTube search queries."""
1693 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1694 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1695 _VIDEO_INDICATOR = r'href="/watch
\?v
=.+?
"'
1696 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1698 _max_youtube_results = 1000
1700 def __init__(self, youtube_ie, downloader=None):
1701 InfoExtractor.__init__(self, downloader)
1702 self._youtube_ie = youtube_ie
1706 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1708 def report_download_page(self, query, pagenum):
1709 """Report attempt to download playlist page with given number."""
1710 query = query.decode(preferredencoding())
1711 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1713 def _real_initialize(self):
1714 self._youtube_ie.initialize()
1716 def _real_extract(self, query):
1717 mobj = re.match(self._VALID_QUERY, query)
1719 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1722 prefix, query = query.split(':')
1724 query = query.encode('utf-8')
1726 self._download_n_results(query, 1)
1728 elif prefix == 'all':
1729 self._download_n_results(query, self._max_youtube_results)
1735 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1737 elif n > self._max_youtube_results:
1738 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1739 n = self._max_youtube_results
1740 self._download_n_results(query, n)
1742 except ValueError: # parsing prefix as integer fails
1743 self._download_n_results(query, 1)
1746 def _download_n_results(self, query, n):
1747 """Downloads a specified number of results for a query"""
1750 already_seen = set()
1754 self.report_download_page(query, pagenum)
1755 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1756 request = urllib2.Request(result_url, None, std_headers)
1758 page = urllib2.urlopen(request).read()
1759 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1760 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1763 # Extract video identifiers
1764 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1765 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1766 if video_id not in already_seen:
1767 video_ids.append(video_id)
1768 already_seen.add(video_id)
1769 if len(video_ids) == n:
1770 # Specified n videos reached
1771 for id in video_ids:
1772 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1775 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1776 for id in video_ids:
1777 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1780 pagenum = pagenum + 1
1782 class GoogleSearchIE(InfoExtractor):
1783 """Information Extractor for Google Video search queries."""
1784 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1785 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1786 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1787 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1789 _max_google_results = 1000
1791 def __init__(self, google_ie, downloader=None):
1792 InfoExtractor.__init__(self, downloader)
1793 self._google_ie = google_ie
1797 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1799 def report_download_page(self, query, pagenum):
1800 """Report attempt to download playlist page with given number."""
1801 query = query.decode(preferredencoding())
1802 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1804 def _real_initialize(self):
1805 self._google_ie.initialize()
1807 def _real_extract(self, query):
1808 mobj = re.match(self._VALID_QUERY, query)
1810 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1813 prefix, query = query.split(':')
1815 query = query.encode('utf-8')
1817 self._download_n_results(query, 1)
1819 elif prefix == 'all':
1820 self._download_n_results(query, self._max_google_results)
1826 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1828 elif n > self._max_google_results:
1829 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1830 n = self._max_google_results
1831 self._download_n_results(query, n)
1833 except ValueError: # parsing prefix as integer fails
1834 self._download_n_results(query, 1)
1837 def _download_n_results(self, query, n):
1838 """Downloads a specified number of results for a query"""
1841 already_seen = set()
1845 self.report_download_page(query, pagenum)
1846 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1847 request = urllib2.Request(result_url, None, std_headers)
1849 page = urllib2.urlopen(request).read()
1850 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1851 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1854 # Extract video identifiers
1855 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1856 video_id = mobj.group(1)
1857 if video_id not in already_seen:
1858 video_ids.append(video_id)
1859 already_seen.add(video_id)
1860 if len(video_ids) == n:
1861 # Specified n videos reached
1862 for id in video_ids:
1863 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1866 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1867 for id in video_ids:
1868 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1871 pagenum = pagenum + 1
1873 class YahooSearchIE(InfoExtractor):
1874 """Information Extractor for Yahoo! Video search queries."""
1875 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1876 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1877 _VIDEO_INDICATOR = r'href="http
://video\
.yahoo\
.com
/watch
/([0-9]+/[0-9]+)"'
1878 _MORE_PAGES_INDICATOR = r'\s*Next'
1880 _max_yahoo_results = 1000
1882 def __init__(self, yahoo_ie, downloader=None):
1883 InfoExtractor.__init__(self, downloader)
1884 self._yahoo_ie = yahoo_ie
1888 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1890 def report_download_page(self, query, pagenum):
1891 """Report attempt to download playlist page with given number."""
1892 query = query.decode(preferredencoding())
1893 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1895 def _real_initialize(self):
1896 self._yahoo_ie.initialize()
1898 def _real_extract(self, query):
1899 mobj = re.match(self._VALID_QUERY, query)
1901 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1904 prefix, query = query.split(':')
1906 query = query.encode('utf-8')
1908 self._download_n_results(query, 1)
1910 elif prefix == 'all':
1911 self._download_n_results(query, self._max_yahoo_results)
1917 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1919 elif n > self._max_yahoo_results:
1920 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1921 n = self._max_yahoo_results
1922 self._download_n_results(query, n)
1924 except ValueError: # parsing prefix as integer fails
1925 self._download_n_results(query, 1)
1928 def _download_n_results(self, query, n):
1929 """Downloads a specified number of results for a query"""
1932 already_seen = set()
1936 self.report_download_page(query, pagenum)
1937 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1938 request = urllib2.Request(result_url, None, std_headers)
1940 page = urllib2.urlopen(request).read()
1941 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1942 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1945 # Extract video identifiers
1946 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1947 video_id = mobj.group(1)
1948 if video_id not in already_seen:
1949 video_ids.append(video_id)
1950 already_seen.add(video_id)
1951 if len(video_ids) == n:
1952 # Specified n videos reached
1953 for id in video_ids:
1954 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1957 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1958 for id in video_ids:
1959 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1962 pagenum = pagenum + 1
1964 class YoutubePlaylistIE(InfoExtractor):
1965 """Information Extractor for YouTube playlists."""
1967 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1968 _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1969 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1970 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1973 def __init__(self, youtube_ie, downloader=None):
1974 InfoExtractor.__init__(self, downloader)
1975 self._youtube_ie = youtube_ie
1979 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1981 def report_download_page(self, playlist_id, pagenum):
1982 """Report attempt to download playlist page with given number."""
1983 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1985 def _real_initialize(self):
1986 self._youtube_ie.initialize()
1988 def _real_extract(self, url):
1989 # Extract playlist id
1990 mobj = re.match(self._VALID_URL, url)
1992 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1995 # Download playlist pages
1996 playlist_id = mobj.group(1)
2001 self.report_download_page(playlist_id, pagenum)
2002 request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
2004 page = urllib2.urlopen(request).read()
2005 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2006 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2009 # Extract video identifiers
2011 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2012 if mobj.group(1) not in ids_in_page:
2013 ids_in_page.append(mobj.group(1))
2014 video_ids.extend(ids_in_page)
2016 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2018 pagenum = pagenum + 1
2020 playliststart = self._downloader.params.get('playliststart', 1) - 1
2021 playlistend = self._downloader.params.get('playlistend', -1)
2022 video_ids = video_ids[playliststart:playlistend]
2024 for id in video_ids:
2025 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2028 class YoutubeUserIE(InfoExtractor):
2029 """Information Extractor for YouTube users."""
2031 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
2032 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2033 _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
2036 def __init__(self, youtube_ie, downloader=None):
2037 InfoExtractor.__init__(self, downloader)
2038 self._youtube_ie = youtube_ie
2042 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2044 def report_download_page(self, username):
2045 """Report attempt to download user page."""
2046 self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username))
2048 def _real_initialize(self):
2049 self._youtube_ie.initialize()
2051 def _real_extract(self, url):
2053 mobj = re.match(self._VALID_URL, url)
2055 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2058 # Download user page
2059 username = mobj.group(1)
2063 self.report_download_page(username)
2064 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
2066 page = urllib2.urlopen(request).read()
2067 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2068 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2071 # Extract video identifiers
2074 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2075 if mobj.group(1) not in ids_in_page:
2076 ids_in_page.append(mobj.group(1))
2077 video_ids.extend(ids_in_page)
2079 playliststart = self._downloader.params.get('playliststart', 1) - 1
2080 playlistend = self._downloader.params.get('playlistend', -1)
2081 video_ids = video_ids[playliststart:playlistend]
2083 for id in video_ids:
2084 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2087 class DepositFilesIE(InfoExtractor):
2088 """Information extractor for depositfiles.com"""
2090 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2092 def __init__(self, downloader=None):
2093 InfoExtractor.__init__(self, downloader)
2097 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2099 def report_download_webpage(self, file_id):
2100 """Report webpage download."""
2101 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2103 def report_extraction(self, file_id):
2104 """Report information extraction."""
2105 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2107 def _real_initialize(self):
2110 def _real_extract(self, url):
2111 # At this point we have a new file
2112 self._downloader.increment_downloads()
2114 file_id = url.split('/')[-1]
2115 # Rebuild url in english locale
2116 url = 'http://depositfiles.com/en/files/' + file_id
2118 # Retrieve file webpage with 'Free download' button pressed
2119 free_download_indication = { 'gateway_result' : '1' }
2120 request = urllib2.Request(url, urllib.urlencode(free_download_indication), std_headers)
2122 self.report_download_webpage(file_id)
2123 webpage = urllib2.urlopen(request).read()
2124 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2125 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2128 # Search for the real file URL
2129 mobj = re.search(r'<form action="(http
://fileshare
.+?
)"', webpage)
2130 if (mobj is None) or (mobj.group(1) is None):
2131 # Try to figure out reason of the error.
2132 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2133 if (mobj is not None) and (mobj.group(1) is not None):
2134 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2135 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2137 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2140 file_url = mobj.group(1)
2141 file_extension = os.path.splitext(file_url)[1][1:]
2143 # Search for file title
2144 mobj = re.search(r'<b title="(.*?
)">', webpage)
2146 self._downloader.trouble(u'ERROR: unable to extract title')
2148 file_title = mobj.group(1).decode('utf-8')
2151 # Process file information
2152 self._downloader.process_info({
2153 'id': file_id.decode('utf-8'),
2154 'url': file_url.decode('utf-8'),
2156 'upload_date': u'NA',
2157 'title': file_title,
2158 'stitle': file_title,
2159 'ext': file_extension.decode('utf-8'),
2163 except UnavailableVideoError, err:
2164 self._downloader.trouble(u'ERROR: unable to download file')
class PostProcessor(object):
    """Base class for post-download processing steps.

    A PostProcessor is attached to a downloader through the downloader's
    add_post_processor() method. After a successful download the
    downloader walks its chain of PostProcessors, calling run() on each:
    the first receives an initial argument and every following one
    receives the value returned by its predecessor.

    The walk ends when a run() call returns None or when the chain is
    exhausted.

    PostProcessor objects follow a "mutual registration" process similar
    to InfoExtractor objects.
    """

    _downloader = None

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Attach this PP to the given downloader."""
        self._downloader = downloader

    def run(self, information):
        """Run the PostProcessor.

        "information" is a dictionary like the ones composed by
        InfoExtractors, with one extra field, "filepath", that points to
        the downloaded file.

        Returning None stops the postprocessing chain. Returning an
        information dictionary (possibly the received one with some
        fields changed) passes it to the next postprocessing object in
        the chain.

        This method may also raise a PostProcessingError exception, which
        will be taken into account by the downloader it was called from.
        """
        # The base implementation passes the information through untouched
        return information
### MAIN PROGRAM ###
if __name__ == '__main__':
    try:
        # Modules needed only when running the main program
        import getpass
        import optparse

        def update_self(downloader, filename):
            """Replace the program file with the latest stable version from GitHub.

            downloader is only used to print messages. The program exits
            if filename is not writable.
            """
            if not os.access(filename, os.W_OK):
                sys.exit('ERROR: no write permissions on %s' % filename)

            downloader.to_screen('Updating to latest stable version...')
            latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
            latest_version = urllib.urlopen(latest_url).read().strip()
            prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
            newcontent = urllib.urlopen(prog_url).read()
            # Binary mode keeps the downloaded bytes untouched (text mode
            # rewrites line endings on some platforms); the finally clause
            # closes the file even when the write fails.
            stream = open(filename, 'wb')
            try:
                stream.write(newcontent)
            finally:
                stream.close()
            downloader.to_screen('Updated to version %s' % latest_version)

        # Parse command line
        parser = optparse.OptionParser(
            usage='Usage: %prog [options] url...',
            version='2010.12.09',
            conflict_handler='resolve',
        )

        parser.add_option('-h', '--help',
                action='help', help='print this help text and exit')
        parser.add_option('-v', '--version',
                action='version', help='print program version and exit')
        parser.add_option('-U', '--update',
                action='store_true', dest='update_self', help='update this program to latest stable version')
        parser.add_option('-i', '--ignore-errors',
                action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
        parser.add_option('-r', '--rate-limit',
                dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
        parser.add_option('-R', '--retries',
                dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
        parser.add_option('--playlist-start',
                dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
        parser.add_option('--playlist-end',
                dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)

        authentication = optparse.OptionGroup(parser, 'Authentication Options')
        authentication.add_option('-u', '--username',
                dest='username', metavar='USERNAME', help='account username')
        authentication.add_option('-p', '--password',
                dest='password', metavar='PASSWORD', help='account password')
        authentication.add_option('-n', '--netrc',
                action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
        parser.add_option_group(authentication)

        video_format = optparse.OptionGroup(parser, 'Video Format Options')
        video_format.add_option('-f', '--format',
                action='store', dest='format', metavar='FORMAT', help='video format code')
        video_format.add_option('-m', '--mobile-version',
                action='store_const', dest='format', help='alias for -f 17', const='17')
        video_format.add_option('--all-formats',
                action='store_const', dest='format', help='download all available video formats', const='-1')
        video_format.add_option('--max-quality',
                action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
        video_format.add_option('-b', '--best-quality',
                action='store_true', dest='bestquality', help='download the best video quality (DEPRECATED)')
        parser.add_option_group(video_format)

        verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
        verbosity.add_option('-q', '--quiet',
                action='store_true', dest='quiet', help='activates quiet mode', default=False)
        verbosity.add_option('-s', '--simulate',
                action='store_true', dest='simulate', help='do not download video', default=False)
        verbosity.add_option('-g', '--get-url',
                action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
        verbosity.add_option('-e', '--get-title',
                action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
        verbosity.add_option('--get-thumbnail',
                action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
        verbosity.add_option('--get-description',
                action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
        verbosity.add_option('--no-progress',
                action='store_true', dest='noprogress', help='do not print progress bar', default=False)
        parser.add_option_group(verbosity)

        filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
        filesystem.add_option('-t', '--title',
                action='store_true', dest='usetitle', help='use title in file name', default=False)
        filesystem.add_option('-l', '--literal',
                action='store_true', dest='useliteral', help='use literal title in file name', default=False)
        filesystem.add_option('-A', '--auto-number',
                action='store_true', dest='autonumber', help='number downloaded files starting from 00000', default=False)
        filesystem.add_option('-o', '--output',
                dest='outtmpl', metavar='TEMPLATE', help='output filename template')
        filesystem.add_option('-a', '--batch-file',
                dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
        filesystem.add_option('-w', '--no-overwrites',
                action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
        filesystem.add_option('-c', '--continue',
                action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
        filesystem.add_option('--cookies',
                dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
        parser.add_option_group(filesystem)

        (opts, args) = parser.parse_args()

        # Open appropriate CookieJar: in-memory unless a cookie file was
        # requested, in which case an existing readable file is loaded
        if opts.cookiefile is None:
            jar = cookielib.CookieJar()
        else:
            try:
                jar = cookielib.MozillaCookieJar(opts.cookiefile)
                if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
                    jar.load()
            except (IOError, OSError):
                sys.exit(u'ERROR: unable to open cookie file')

        # General configuration
        cookie_processor = urllib2.HTTPCookieProcessor(jar)
        # A single opener carries both handlers; installing two openers in a
        # row would only keep the second one.
        urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor))
        socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

        # Batch file verification: one URL per line, blank lines and lines
        # starting with '#', '/' or ';' are ignored
        batchurls = []
        if opts.batchfile is not None:
            try:
                if opts.batchfile == '-':
                    batchfd = sys.stdin
                else:
                    batchfd = open(opts.batchfile, 'r')
                try:
                    batchurls = batchfd.readlines()
                finally:
                    # Never close stdin, only a file opened here
                    if batchfd is not sys.stdin:
                        batchfd.close()
                batchurls = [x.strip() for x in batchurls]
                batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
            except IOError:
                sys.exit(u'ERROR: batch file could not be read')
        all_urls = batchurls + args

        # Conflicting, missing and erroneous options
        if opts.bestquality:
            print >>sys.stderr, u'\nWARNING: -b/--best-quality IS DEPRECATED AS IT IS THE DEFAULT BEHAVIOR NOW\n'
        if opts.usenetrc and (opts.username is not None or opts.password is not None):
            parser.error(u'using .netrc conflicts with giving username/password')
        if opts.password is not None and opts.username is None:
            parser.error(u'account username missing')
        if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
            parser.error(u'using output template conflicts with using title, literal title or auto number')
        if opts.usetitle and opts.useliteral:
            parser.error(u'using title conflicts with using literal title')
        if opts.username is not None and opts.password is None:
            opts.password = getpass.getpass(u'Type account password and press return:')
        if opts.ratelimit is not None:
            numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
            if numeric_limit is None:
                parser.error(u'invalid rate limit specified')
            opts.ratelimit = numeric_limit
        if opts.retries is not None:
            try:
                opts.retries = long(opts.retries)
            except (TypeError, ValueError):
                parser.error(u'invalid retry count specified')
        try:
            opts.playliststart = long(opts.playliststart)
            if opts.playliststart <= 0:
                raise ValueError
        except (TypeError, ValueError):
            parser.error(u'invalid playlist start number specified')
        try:
            # -1 means "no end given"; anything else must be positive and
            # not smaller than the start
            opts.playlistend = long(opts.playlistend)
            if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
                raise ValueError
        except (TypeError, ValueError):
            parser.error(u'invalid playlist end number specified')

        # Information extractors
        youtube_ie = YoutubeIE()
        metacafe_ie = MetacafeIE(youtube_ie)
        dailymotion_ie = DailymotionIE()
        youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
        youtube_user_ie = YoutubeUserIE(youtube_ie)
        youtube_search_ie = YoutubeSearchIE(youtube_ie)
        google_ie = GoogleIE()
        google_search_ie = GoogleSearchIE(google_ie)
        photobucket_ie = PhotobucketIE()
        yahoo_ie = YahooIE()
        yahoo_search_ie = YahooSearchIE(yahoo_ie)
        deposit_files_ie = DepositFilesIE()
        generic_ie = GenericIE()

        # File downloader
        fd = FileDownloader({
            'usenetrc': opts.usenetrc,
            'username': opts.username,
            'password': opts.password,
            'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
            'forceurl': opts.geturl,
            'forcetitle': opts.gettitle,
            'forcethumbnail': opts.getthumbnail,
            'forcedescription': opts.getdescription,
            'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
            'format': opts.format,
            'format_limit': opts.format_limit,
            # The first alternative that applies wins: an explicit template,
            # then the all-formats variants, then title/literal/autonumber
            # combinations, and finally the plain id-based name.
            'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
                or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
                or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
                or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
                or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
                or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
                or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
                or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
                or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
                or u'%(id)s.%(ext)s'),
            'ignoreerrors': opts.ignoreerrors,
            'ratelimit': opts.ratelimit,
            'nooverwrites': opts.nooverwrites,
            'retries': opts.retries,
            'continuedl': opts.continue_dl,
            'noprogress': opts.noprogress,
            'playliststart': opts.playliststart,
            'playlistend': opts.playlistend,
            'logtostderr': opts.outtmpl == '-',
            })
        fd.add_info_extractor(youtube_search_ie)
        fd.add_info_extractor(youtube_pl_ie)
        fd.add_info_extractor(youtube_user_ie)
        fd.add_info_extractor(metacafe_ie)
        fd.add_info_extractor(dailymotion_ie)
        fd.add_info_extractor(youtube_ie)
        fd.add_info_extractor(google_ie)
        fd.add_info_extractor(google_search_ie)
        fd.add_info_extractor(photobucket_ie)
        fd.add_info_extractor(yahoo_ie)
        fd.add_info_extractor(yahoo_search_ie)
        fd.add_info_extractor(deposit_files_ie)

        # This must come last since it's the
        # fallback if none of the others work
        fd.add_info_extractor(generic_ie)

        # Update the program file if requested
        if opts.update_self:
            update_self(fd, sys.argv[0])

        # Without URLs there is nothing to download: that is an error
        # unless the run was only meant to update the program
        if len(all_urls) < 1:
            if not opts.update_self:
                parser.error(u'you must provide at least one URL')
            else:
                sys.exit()
        retcode = fd.download(all_urls)

        # Dump cookie jar if requested
        if opts.cookiefile is not None:
            try:
                jar.save()
            except (IOError, OSError):
                sys.exit(u'ERROR: unable to save cookie jar')

        sys.exit(retcode)

    except DownloadError:
        sys.exit(1)
    except SameFileError:
        sys.exit(u'ERROR: fixed output name but more than one file to download')
    except KeyboardInterrupt:
        sys.exit(u'\nERROR: Interrupted by user')