]>
Raphaƫl G. Git Repositories - youtubedl/blob - youtube-dl
d5469495404ca1ca9640a0a619628c511edf4f59
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
23 # parse_qs was moved from the cgi module to the urlparse module recently.
25 from urlparse
import parse_qs
27 from cgi
import parse_qs
30 'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.8) Gecko/20100723 Firefox/3.6.8',
31 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
32 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
33 'Accept-Language': 'en-us,en;q=0.5',
36 simple_title_chars
= string
.ascii_letters
.decode('ascii') + string
.digits
.decode('ascii')
38 def preferredencoding():
39 """Get preferred encoding.
41 Returns the best encoding scheme for the system, based on
42 locale.getpreferredencoding() and some further tweaks.
44 def yield_preferredencoding():
46 pref
= locale
.getpreferredencoding()
52 return yield_preferredencoding().next()
54 def htmlentity_transform(matchobj
):
55 """Transforms an HTML entity to a Unicode character.
57 This function receives a match object and is intended to be used with
58 the re.sub() function.
60 entity
= matchobj
.group(1)
62 # Known non-numeric HTML entity
63 if entity
in htmlentitydefs
.name2codepoint
:
64 return unichr(htmlentitydefs
.name2codepoint
[entity
])
67 mobj
= re
.match(ur
'(?u)#(x?\d+)', entity
)
69 numstr
= mobj
.group(1)
70 if numstr
.startswith(u
'x'):
72 numstr
= u
'0%s' % numstr
75 return unichr(long(numstr
, base
))
77 # Unknown entity in name, return its literal representation
78 return (u
'&%s;' % entity
)
def sanitize_title(utitle):
    """Make a video title usable as part of a filename.

    Every ``&...;`` sequence in utitle is handed to htmlentity_transform()
    through re.sub(), and afterwards each occurrence of the platform path
    separator (os.sep) is replaced by a percent sign.

    Returns the resulting unicode string.
    """
    entity_pattern = u'(?u)&(.+?);'
    decoded = re.sub(entity_pattern, htmlentity_transform, utitle)
    separator = unicode(os.sep)
    return decoded.replace(separator, u'%')
85 def sanitize_open(filename
, open_mode
):
86 """Try to open the given filename, and slightly tweak it if this fails.
88 Attempts to open the given filename. If this fails, it tries to change
89 the filename slightly, step by step, until it's either able to open it
90 or it fails and raises a final exception, like the standard open()
93 It returns the tuple (stream, definitive_file_name).
97 return (sys
.stdout
, filename
)
98 stream
= open(filename
, open_mode
)
99 return (stream
, filename
)
100 except (IOError, OSError), err
:
101 # In case of error, try to remove win32 forbidden chars
102 filename
= re
.sub(ur
'[/<>:"\|\?\*]', u
'#', filename
)
104 # An exception here should be caught in the caller
105 stream
= open(filename
, open_mode
)
106 return (stream
, filename
)
109 class DownloadError(Exception):
110 """Download Error exception.
112 This exception may be thrown by FileDownloader objects if they are not
113 configured to continue on errors. They will contain the appropriate
118 class SameFileError(Exception):
119 """Same File exception.
121 This exception will be thrown by FileDownloader objects if they detect
122 multiple files would have to be downloaded to the same file on disk.
126 class PostProcessingError(Exception):
127 """Post Processing exception.
129 This exception may be raised by PostProcessor's .run() method to
130 indicate an error in the postprocessing task.
134 class UnavailableVideoError(Exception):
135 """Unavailable Format exception.
137 This exception will be thrown when a video is requested
138 in a format that is not available for that video.
142 class ContentTooShortError(Exception):
143 """Content Too Short exception.
145 This exception may be raised by FileDownloader objects when a file they
146 download is too small for what the server announced first, indicating
147 the connection was probably interrupted.
153 def __init__(self
, downloaded
, expected
):
154 self
.downloaded
= downloaded
155 self
.expected
= expected
157 class FileDownloader(object):
158 """File Downloader class.
160 File downloader objects are the ones responsible for downloading the
161 actual video file and writing it to disk if the user has requested
162 it, among some other tasks. In most cases there should be one per
163 program. As, given a video URL, the downloader doesn't know how to
164 extract all the needed information, task that InfoExtractors do, it
165 has to pass the URL to one of them.
167 For this, file downloader objects have a method that allows
168 InfoExtractors to be registered in a given order. When it is passed
169 a URL, the file downloader hands it to the first InfoExtractor it
170 finds that reports being able to handle it. The InfoExtractor extracts
171 all the information about the video or videos the URL refers to, and
172 asks the FileDownloader to process the video information, possibly
173 downloading the video.
175 File downloaders accept a lot of parameters. In order not to saturate
176 the object constructor with arguments, it receives a dictionary of
177 options instead. These options are available through the params
178 attribute for the InfoExtractors to use. The FileDownloader also
179 registers itself as the downloader in charge for the InfoExtractors
180 that are added to it, so this is a "mutual registration".
184 username: Username for authentication purposes.
185 password: Password for authentication purposes.
186 usenetrc: Use netrc for authentication instead.
187 quiet: Do not print messages to stdout.
188 forceurl: Force printing final URL.
189 forcetitle: Force printing title.
190 simulate: Do not download the video files.
191 format: Video format code.
192 format_limit: Highest quality format to try.
193 outtmpl: Template for output names.
194 ignoreerrors: Do not stop on download errors.
195 ratelimit: Download speed limit, in bytes/sec.
196 nooverwrites: Prevent overwriting files.
197 retries: Number of times to retry for HTTP error 503
198 continuedl: Try to continue downloads if possible.
199 noprogress: Do not print the progress bar.
205 _download_retcode
= None
206 _num_downloads
= None
208 def __init__(self
, params
):
209 """Create a FileDownloader object with the given options."""
212 self
._download
_retcode
= 0
213 self
._num
_downloads
= 0
217 def pmkdir(filename
):
218 """Create directory components in filename. Similar to Unix "mkdir -p"."""
219 components
= filename
.split(os
.sep
)
220 aggregate
= [os
.sep
.join(components
[0:x
]) for x
in xrange(1, len(components
))]
221 aggregate
= ['%s%s' % (x
, os
.sep
) for x
in aggregate
] # Finish names with separator
222 for dir in aggregate
:
223 if not os
.path
.exists(dir):
227 def format_bytes(bytes):
230 if type(bytes) is str:
235 exponent
= long(math
.log(bytes, 1024.0))
236 suffix
= 'bkMGTPEZY'[exponent
]
237 converted
= float(bytes) / float(1024**exponent
)
238 return '%.2f%s' % (converted
, suffix
)
241 def calc_percent(byte_counter
, data_len
):
244 return '%6s' % ('%3.1f%%' % (float(byte_counter
) / float(data_len
) * 100.0))
247 def calc_eta(start
, now
, total
, current
):
251 if current
== 0 or dif
< 0.001: # One millisecond
253 rate
= float(current
) / dif
254 eta
= long((float(total
) - float(current
)) / rate
)
255 (eta_mins
, eta_secs
) = divmod(eta
, 60)
258 return '%02d:%02d' % (eta_mins
, eta_secs
)
261 def calc_speed(start
, now
, bytes):
263 if bytes == 0 or dif
< 0.001: # One millisecond
264 return '%10s' % '---b/s'
265 return '%10s' % ('%s/s' % FileDownloader
.format_bytes(float(bytes) / dif
))
268 def best_block_size(elapsed_time
, bytes):
269 new_min
= max(bytes / 2.0, 1.0)
270 new_max
= min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
271 if elapsed_time
< 0.001:
273 rate
= bytes / elapsed_time
281 def parse_bytes(bytestr
):
282 """Parse a string indicating a byte quantity into a long integer."""
283 matchobj
= re
.match(r
'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr
)
286 number
= float(matchobj
.group(1))
287 multiplier
= 1024.0 ** 'bkmgtpezy'.index(matchobj
.group(2).lower())
288 return long(round(number
* multiplier
))
290 def add_info_extractor(self
, ie
):
291 """Add an InfoExtractor object to the end of the list."""
293 ie
.set_downloader(self
)
295 def add_post_processor(self
, pp
):
296 """Add a PostProcessor object to the end of the chain."""
298 pp
.set_downloader(self
)
300 def to_stdout(self
, message
, skip_eol
=False, ignore_encoding_errors
=False):
301 """Print message to stdout if not in quiet mode."""
303 if not self
.params
.get('quiet', False):
304 print (u
'%s%s' % (message
, [u
'\n', u
''][skip_eol
])).encode(preferredencoding()),
306 except (UnicodeEncodeError), err
:
307 if not ignore_encoding_errors
:
def to_stderr(self, message):
    """Print message to stderr.

    message may be a unicode object or an already-encoded byte string.
    Unicode text is encoded with preferredencoding() before printing.
    Byte strings are written unchanged: calling .encode() on a Python 2
    byte string first decodes it implicitly as ASCII, which raises
    UnicodeDecodeError for messages built with str(err) that contain
    non-ASCII bytes.
    """
    if isinstance(message, unicode):
        message = message.encode(preferredencoding())
    print >>sys.stderr, message
314 def fixed_template(self
):
315 """Checks if the output template is fixed."""
316 return (re
.search(ur
'(?u)%\(.+?\)s', self
.params
['outtmpl']) is None)
318 def trouble(self
, message
=None):
319 """Determine action to take when a download problem appears.
321 Depending on if the downloader has been configured to ignore
322 download errors or not, this method may throw an exception or
323 not when errors are found, after printing the message.
325 if message
is not None:
326 self
.to_stderr(message
)
327 if not self
.params
.get('ignoreerrors', False):
328 raise DownloadError(message
)
329 self
._download
_retcode
= 1
331 def slow_down(self
, start_time
, byte_counter
):
332 """Sleep if the download speed is over the rate limit."""
333 rate_limit
= self
.params
.get('ratelimit', None)
334 if rate_limit
is None or byte_counter
== 0:
337 elapsed
= now
- start_time
340 speed
= float(byte_counter
) / elapsed
341 if speed
> rate_limit
:
342 time
.sleep((byte_counter
- rate_limit
* (now
- start_time
)) / rate_limit
)
def report_destination(self, filename):
    """Announce the name of the file the download is written to.

    Encoding errors are ignored when printing (to_stdout() is called
    with ignore_encoding_errors=True).
    """
    line = u'[download] Destination: %s' % filename
    self.to_stdout(line, ignore_encoding_errors=True)
348 def report_progress(self
, percent_str
, data_len_str
, speed_str
, eta_str
):
349 """Report download progress."""
350 if self
.params
.get('noprogress', False):
352 self
.to_stdout(u
'\r[download] %s of %s at %s ETA %s' %
353 (percent_str
, data_len_str
, speed_str
, eta_str
), skip_eol
=True)
def report_resuming_byte(self, resume_len):
    """Report an attempt to resume the download at the given byte."""
    line = u'[download] Resuming download at byte %s' % resume_len
    self.to_stdout(line)
def report_retry(self, count, retries):
    """Report a new attempt after an HTTP 503 error.

    count:   number of the attempt about to be made.
    retries: total number of attempts allowed.
    """
    attempt_info = (count, retries)
    line = u'[download] Got HTTP error 503. Retrying (attempt %d of %d)...' % attempt_info
    self.to_stdout(line)
363 def report_file_already_downloaded(self
, file_name
):
364 """Report file has already been fully downloaded."""
366 self
.to_stdout(u
'[download] %s has already been downloaded' % file_name
)
367 except (UnicodeEncodeError), err
:
368 self
.to_stdout(u
'[download] The file has already been downloaded')
def report_unable_to_resume(self):
    """Tell the user that the download could not be resumed."""
    line = u'[download] Unable to resume'
    self.to_stdout(line)
374 def report_finish(self
):
375 """Report download finished."""
376 if self
.params
.get('noprogress', False):
377 self
.to_stdout(u
'[download] Download completed')
381 def increment_downloads(self
):
382 """Increment the ordinal that assigns a number to each file."""
383 self
._num
_downloads
+= 1
385 def process_info(self
, info_dict
):
386 """Process a single dictionary returned by an InfoExtractor."""
387 # Do nothing else if in simulate mode
388 if self
.params
.get('simulate', False):
390 if self
.params
.get('forcetitle', False):
391 print info_dict
['title'].encode(preferredencoding(), 'xmlcharrefreplace')
392 if self
.params
.get('forceurl', False):
393 print info_dict
['url'].encode(preferredencoding(), 'xmlcharrefreplace')
394 if self
.params
.get('forcethumbnail', False) and 'thumbnail' in info_dict
:
395 print info_dict
['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
396 if self
.params
.get('forcedescription', False) and 'description' in info_dict
:
397 print info_dict
['description'].encode(preferredencoding(), 'xmlcharrefreplace')
402 template_dict
= dict(info_dict
)
403 template_dict
['epoch'] = unicode(long(time
.time()))
404 template_dict
['ord'] = unicode('%05d' % self
._num
_downloads
)
405 filename
= self
.params
['outtmpl'] % template_dict
406 except (ValueError, KeyError), err
:
407 self
.trouble('ERROR: invalid output template or system charset: %s' % str(err
))
408 if self
.params
.get('nooverwrites', False) and os
.path
.exists(filename
):
409 self
.to_stderr(u
'WARNING: file exists: %s; skipping' % filename
)
413 self
.pmkdir(filename
)
414 except (OSError, IOError), err
:
415 self
.trouble('ERROR: unable to create directories: %s' % str(err
))
419 success
= self
._do
_download
(filename
, info_dict
['url'].encode('utf-8'), info_dict
.get('player_url', None))
420 except (OSError, IOError), err
:
421 raise UnavailableVideoError
422 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
423 self
.trouble('ERROR: unable to download video data: %s' % str(err
))
425 except (ContentTooShortError
, ), err
:
426 self
.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err
.expected
, err
.downloaded
))
431 self
.post_process(filename
, info_dict
)
432 except (PostProcessingError
), err
:
433 self
.trouble('ERROR: postprocessing: %s' % str(err
))
436 def download(self
, url_list
):
437 """Download a given list of URLs."""
438 if len(url_list
) > 1 and self
.fixed_template():
439 raise SameFileError(self
.params
['outtmpl'])
442 suitable_found
= False
444 # Go to next InfoExtractor if not suitable
445 if not ie
.suitable(url
):
448 # Suitable InfoExtractor found
449 suitable_found
= True
451 # Extract information from URL and process it
454 # Suitable InfoExtractor had been found; go to next URL
457 if not suitable_found
:
458 self
.trouble('ERROR: no suitable InfoExtractor: %s' % url
)
460 return self
._download
_retcode
462 def post_process(self
, filename
, ie_info
):
463 """Run the postprocessing chain on the given file."""
465 info
['filepath'] = filename
471 def _download_with_rtmpdump(self
, filename
, url
, player_url
):
472 self
.report_destination(filename
)
474 # Check for rtmpdump first
476 subprocess
.call(['rtmpdump', '-h'], stdout
=(file(os
.path
.devnull
, 'w')), stderr
=subprocess
.STDOUT
)
477 except (OSError, IOError):
478 self
.trouble(u
'ERROR: RTMP download detected but "rtmpdump" could not be run')
481 # Download using rtmpdump. rtmpdump returns exit code 2 when
482 # the connection was interrupted and resuming appears to be
483 # possible. This is part of rtmpdump's normal usage, AFAIK.
484 basic_args
= ['rtmpdump', '-q'] + [[], ['-W', player_url
]][player_url
is not None] + ['-r', url
, '-o', filename
]
485 retval
= subprocess
.call(basic_args
+ [[], ['-e', '-k', '1']][self
.params
.get('continuedl', False)])
486 while retval
== 2 or retval
== 1:
487 prevsize
= os
.path
.getsize(filename
)
488 self
.to_stdout(u
'\r[rtmpdump] %s bytes' % prevsize
, skip_eol
=True)
489 time
.sleep(5.0) # This seems to be needed
490 retval
= subprocess
.call(basic_args
+ ['-e'] + [[], ['-k', '1']][retval
== 1])
491 cursize
= os
.path
.getsize(filename
)
492 if prevsize
== cursize
and retval
== 1:
495 self
.to_stdout(u
'\r[rtmpdump] %s bytes' % os
.path
.getsize(filename
))
498 self
.trouble('\nERROR: rtmpdump exited with code %d' % retval
)
501 def _do_download(self
, filename
, url
, player_url
):
502 # Attempt to download using rtmpdump
503 if url
.startswith('rtmp'):
504 return self
._download
_with
_rtmpdump
(filename
, url
, player_url
)
508 basic_request
= urllib2
.Request(url
, None, std_headers
)
509 request
= urllib2
.Request(url
, None, std_headers
)
511 # Establish possible resume length
512 if os
.path
.isfile(filename
):
513 resume_len
= os
.path
.getsize(filename
)
517 # Request parameters in case of being able to resume
518 if self
.params
.get('continuedl', False) and resume_len
!= 0:
519 self
.report_resuming_byte(resume_len
)
520 request
.add_header('Range','bytes=%d-' % resume_len
)
524 retries
= self
.params
.get('retries', 0)
525 while count
<= retries
:
526 # Establish connection
528 data
= urllib2
.urlopen(request
)
530 except (urllib2
.HTTPError
, ), err
:
531 if err
.code
!= 503 and err
.code
!= 416:
532 # Unexpected HTTP error
534 elif err
.code
== 416:
535 # Unable to resume (requested range not satisfiable)
537 # Open the connection again without the range header
538 data
= urllib2
.urlopen(basic_request
)
539 content_length
= data
.info()['Content-Length']
540 except (urllib2
.HTTPError
, ), err
:
544 # Examine the reported length
545 if (content_length
is not None and
546 (resume_len
- 100 < long(content_length
) < resume_len
+ 100)):
547 # The file had already been fully downloaded.
548 # Explanation to the above condition: in issue #175 it was revealed that
549 # YouTube sometimes adds or removes a few bytes from the end of the file,
550 # changing the file size slightly and causing problems for some users. So
551 # I decided to implement a suggested change and consider the file
552 # completely downloaded if the file size differs less than 100 bytes from
553 # the one in the hard drive.
554 self
.report_file_already_downloaded(filename
)
557 # The length does not match, we start the download over
558 self
.report_unable_to_resume()
564 self
.report_retry(count
, retries
)
567 self
.trouble(u
'ERROR: giving up after %s retries' % retries
)
570 data_len
= data
.info().get('Content-length', None)
571 data_len_str
= self
.format_bytes(data_len
)
578 data_block
= data
.read(block_size
)
580 data_block_len
= len(data_block
)
581 if data_block_len
== 0:
583 byte_counter
+= data_block_len
585 # Open file just in time
588 (stream
, filename
) = sanitize_open(filename
, open_mode
)
589 self
.report_destination(filename
)
590 except (OSError, IOError), err
:
591 self
.trouble('ERROR: unable to open for writing: %s' % str(err
))
594 stream
.write(data_block
)
595 except (IOError, OSError), err
:
596 self
.trouble('\nERROR: unable to write data: %s' % str(err
))
597 block_size
= self
.best_block_size(after
- before
, data_block_len
)
600 percent_str
= self
.calc_percent(byte_counter
, data_len
)
601 eta_str
= self
.calc_eta(start
, time
.time(), data_len
, byte_counter
)
602 speed_str
= self
.calc_speed(start
, time
.time(), byte_counter
)
603 self
.report_progress(percent_str
, data_len_str
, speed_str
, eta_str
)
606 self
.slow_down(start
, byte_counter
)
609 if data_len
is not None and str(byte_counter
) != data_len
:
610 raise ContentTooShortError(byte_counter
, long(data_len
))
613 class InfoExtractor(object):
614 """Information Extractor class.
616 Information extractors are the classes that, given a URL, extract
617 information from the video (or videos) the URL refers to. This
618 information includes the real video URL, the video title and simplified
619 title, author and others. The information is stored in a dictionary
620 which is then passed to the FileDownloader. The FileDownloader
621 processes this information possibly downloading the video to the file
622 system, among other possible outcomes. The dictionaries must include
623 the following fields:
625 id: Video identifier.
626 url: Final video URL.
627 uploader: Nickname of the video uploader.
628 title: Literal title.
629 stitle: Simplified title.
630 ext: Video filename extension.
631 format: Video format.
632 player_url: SWF Player URL (may be None).
634 The following fields are optional. Their primary purpose is to allow
635 youtube-dl to serve as the backend for a video search function, such
636 as the one in youtube2mp3. They are only used when their respective
637 forced printing functions are called:
639 thumbnail: Full URL to a video thumbnail image.
640 description: One-line video description.
642 Subclasses of this one should re-define the _real_initialize() and
643 _real_extract() methods, as well as the suitable() static method.
644 Probably, they should also be instantiated and added to the main
651 def __init__(self
, downloader
=None):
652 """Constructor. Receives an optional downloader."""
654 self
.set_downloader(downloader
)
658 """Receives a URL and returns True if suitable for this IE."""
661 def initialize(self
):
662 """Initializes an instance (authentication, etc)."""
664 self
._real
_initialize
()
667 def extract(self
, url
):
668 """Extracts URL information and returns it in list of dicts."""
670 return self
._real
_extract
(url
)
672 def set_downloader(self
, downloader
):
673 """Sets the downloader for this IE."""
674 self
._downloader
= downloader
676 def _real_initialize(self
):
677 """Real initialization process. Redefine in subclasses."""
680 def _real_extract(self
, url
):
681 """Real extraction process. Redefine in subclasses."""
684 class YoutubeIE(InfoExtractor
):
685 """Information extractor for youtube.com."""
687 _VALID_URL
= r
'^((?:http://)?(?:youtu\.be/|(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?[\?#](?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
688 _LANG_URL
= r
'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
689 _LOGIN_URL
= 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
690 _AGE_URL
= 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
691 _NETRC_MACHINE
= 'youtube'
692 # Listed in order of quality
693 _available_formats
= ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
694 _video_extensions
= {
700 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
707 return (re
.match(YoutubeIE
._VALID
_URL
, url
) is not None)
def report_lang(self):
    """Tell the user the language is about to be set."""
    line = u'[youtube] Setting language'
    self._downloader.to_stdout(line)
def report_login(self):
    """Tell the user a login is about to be attempted."""
    line = u'[youtube] Logging in'
    self._downloader.to_stdout(line)
def report_age_confirmation(self):
    """Tell the user the age confirmation is about to be attempted."""
    line = u'[youtube] Confirming age'
    self._downloader.to_stdout(line)
def report_video_webpage_download(self, video_id):
    """Tell the user the video webpage is about to be downloaded."""
    line = u'[youtube] %s: Downloading video webpage' % video_id
    self._downloader.to_stdout(line)
def report_video_info_webpage_download(self, video_id):
    """Tell the user the video info webpage is about to be downloaded."""
    line = u'[youtube] %s: Downloading video info webpage' % video_id
    self._downloader.to_stdout(line)
def report_information_extraction(self, video_id):
    """Tell the user the video information is about to be extracted."""
    line = u'[youtube] %s: Extracting video information' % video_id
    self._downloader.to_stdout(line)
def report_unavailable_format(self, video_id, format):
    """Tell the user the given format is not available for the video."""
    details = (video_id, format)
    line = u'[youtube] %s: Format %s not available' % details
    self._downloader.to_stdout(line)
def report_rtmp_download(self):
    """Tell the user the download is going to use the RTMP protocol."""
    line = u'[youtube] RTMP download detected'
    self._downloader.to_stdout(line)
741 def _real_initialize(self
):
742 if self
._downloader
is None:
747 downloader_params
= self
._downloader
.params
749 # Attempt to use provided username and password or .netrc data
750 if downloader_params
.get('username', None) is not None:
751 username
= downloader_params
['username']
752 password
= downloader_params
['password']
753 elif downloader_params
.get('usenetrc', False):
755 info
= netrc
.netrc().authenticators(self
._NETRC
_MACHINE
)
760 raise netrc
.NetrcParseError('No authenticators for %s' % self
._NETRC
_MACHINE
)
761 except (IOError, netrc
.NetrcParseError
), err
:
762 self
._downloader
.to_stderr(u
'WARNING: parsing .netrc: %s' % str(err
))
766 request
= urllib2
.Request(self
._LANG
_URL
, None, std_headers
)
769 urllib2
.urlopen(request
).read()
770 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
771 self
._downloader
.to_stderr(u
'WARNING: unable to set language: %s' % str(err
))
774 # No authentication to be performed
780 'current_form': 'loginForm',
782 'action_login': 'Log In',
783 'username': username
,
784 'password': password
,
786 request
= urllib2
.Request(self
._LOGIN
_URL
, urllib
.urlencode(login_form
), std_headers
)
789 login_results
= urllib2
.urlopen(request
).read()
790 if re
.search(r
'(?i)<form[^>]* name="loginForm"', login_results
) is not None:
791 self
._downloader
.to_stderr(u
'WARNING: unable to log in: bad username or password')
793 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
794 self
._downloader
.to_stderr(u
'WARNING: unable to log in: %s' % str(err
))
800 'action_confirm': 'Confirm',
802 request
= urllib2
.Request(self
._AGE
_URL
, urllib
.urlencode(age_form
), std_headers
)
804 self
.report_age_confirmation()
805 age_results
= urllib2
.urlopen(request
).read()
806 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
807 self
._downloader
.trouble(u
'ERROR: unable to confirm age: %s' % str(err
))
810 def _real_extract(self
, url
):
811 # Extract video id from URL
812 mobj
= re
.match(self
._VALID
_URL
, url
)
814 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
816 video_id
= mobj
.group(2)
819 self
.report_video_webpage_download(video_id
)
820 request
= urllib2
.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
, None, std_headers
)
822 video_webpage
= urllib2
.urlopen(request
).read()
823 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
824 self
._downloader
.trouble(u
'ERROR: unable to download video webpage: %s' % str(err
))
827 # Attempt to extract SWF player URL
828 mobj
= re
.search(r
'swfConfig.*"(http://.*?watch.*?-.*?\.swf)"', video_webpage
)
830 player_url
= mobj
.group(1)
835 self
.report_video_info_webpage_download(video_id
)
836 for el_type
in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
837 video_info_url
= ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
838 % (video_id
, el_type
))
839 request
= urllib2
.Request(video_info_url
, None, std_headers
)
841 video_info_webpage
= urllib2
.urlopen(request
).read()
842 video_info
= parse_qs(video_info_webpage
)
843 if 'token' in video_info
:
845 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
846 self
._downloader
.trouble(u
'ERROR: unable to download video info webpage: %s' % str(err
))
848 if 'token' not in video_info
:
849 if 'reason' in video_info
:
850 self
._downloader
.trouble(u
'ERROR: YouTube said: %s' % video_info
['reason'][0])
852 self
._downloader
.trouble(u
'ERROR: "token" parameter not in video info for unknown reason')
855 # Start extracting information
856 self
.report_information_extraction(video_id
)
859 if 'author' not in video_info
:
860 self
._downloader
.trouble(u
'ERROR: unable to extract uploader nickname')
862 video_uploader
= urllib
.unquote_plus(video_info
['author'][0])
865 if 'title' not in video_info
:
866 self
._downloader
.trouble(u
'ERROR: unable to extract video title')
868 video_title
= urllib
.unquote_plus(video_info
['title'][0])
869 video_title
= video_title
.decode('utf-8')
870 video_title
= sanitize_title(video_title
)
873 simple_title
= re
.sub(ur
'(?u)([^%s]+)' % simple_title_chars
, ur
'_', video_title
)
874 simple_title
= simple_title
.strip(ur
'_')
877 if 'thumbnail_url' not in video_info
:
878 self
._downloader
.trouble(u
'WARNING: unable to extract video thumbnail')
880 else: # don't panic if we can't find it
881 video_thumbnail
= urllib
.unquote_plus(video_info
['thumbnail_url'][0])
884 video_description
= 'No description available.'
885 if self
._downloader
.params
.get('forcedescription', False):
886 mobj
= re
.search(r
'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage
)
888 video_description
= mobj
.group(1)
891 video_token
= urllib
.unquote_plus(video_info
['token'][0])
893 # Decide which formats to download
894 requested_format
= self
._downloader
.params
.get('format', None)
895 get_video_template
= 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=&ps=&asv=&fmt=%%s' % (video_id
, video_token
)
897 if 'fmt_url_map' in video_info
:
898 url_map
= dict(tuple(pair
.split('|')) for pair
in video_info
['fmt_url_map'][0].split(','))
899 format_limit
= self
._downloader
.params
.get('format_limit', None)
900 if format_limit
is not None and format_limit
in self
._available
_formats
:
901 format_list
= self
._available
_formats
[self
._available
_formats
.index(format_limit
):]
903 format_list
= self
._available
_formats
904 existing_formats
= [x
for x
in format_list
if x
in url_map
]
905 if len(existing_formats
) == 0:
906 self
._downloader
.trouble(u
'ERROR: no known formats available for video')
908 if requested_format
is None:
909 video_url_list
= [(existing_formats
[0], get_video_template
% existing_formats
[0])] # Best quality
910 elif requested_format
== '-1':
911 video_url_list
= [(f
, get_video_template
% f
) for f
in existing_formats
] # All formats
913 video_url_list
= [(requested_format
, get_video_template
% requested_format
)] # Specific format
915 elif 'conn' in video_info
and video_info
['conn'][0].startswith('rtmp'):
916 self
.report_rtmp_download()
917 video_url_list
= [(None, video_info
['conn'][0])]
920 self
._downloader
.trouble(u
'ERROR: no fmt_url_map or conn information found in video info')
923 for format_param
, video_real_url
in video_url_list
:
924 # At this point we have a new video
925 self
._downloader
.increment_downloads()
928 video_extension
= self
._video
_extensions
.get(format_param
, 'flv')
930 # Find the video URL in fmt_url_map or conn parameters
932 # Process video information
933 self
._downloader
.process_info({
934 'id': video_id
.decode('utf-8'),
935 'url': video_real_url
.decode('utf-8'),
936 'uploader': video_uploader
.decode('utf-8'),
937 'title': video_title
,
938 'stitle': simple_title
,
939 'ext': video_extension
.decode('utf-8'),
940 'format': (format_param
is None and u
'NA' or format_param
.decode('utf-8')),
941 'thumbnail': video_thumbnail
.decode('utf-8'),
942 'description': video_description
.decode('utf-8'),
943 'player_url': player_url
,
945 except UnavailableVideoError
, err
:
946 self
._downloader
.trouble(u
'ERROR: unable to download video (format may not be available)')
949 class MetacafeIE(InfoExtractor
):
950 """Information Extractor for metacafe.com."""
952 _VALID_URL
= r
'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
953 _DISCLAIMER
= 'http://www.metacafe.com/family_filter/'
954 _FILTER_POST
= 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
def __init__(self, youtube_ie, downloader=None):
    """Create a Metacafe extractor.

    youtube_ie: extractor stored as self._youtube_ie; _real_extract()
                forwards "yt-" prefixed video ids to its extract().
    downloader: optional downloader, passed on unchanged to
                InfoExtractor.__init__().
    """
    base_init = InfoExtractor.__init__
    base_init(self, downloader)
    self._youtube_ie = youtube_ie
963 return (re
.match(MetacafeIE
._VALID
_URL
, url
) is not None)
def report_disclaimer(self):
    """Tell the user the disclaimer is being retrieved."""
    line = u'[metacafe] Retrieving disclaimer'
    self._downloader.to_stdout(line)
def report_age_confirmation(self):
    """Tell the user the age confirmation is about to be attempted."""
    line = u'[metacafe] Confirming age'
    self._downloader.to_stdout(line)
def report_download_webpage(self, video_id):
    """Tell the user the video webpage is being downloaded."""
    line = u'[metacafe] %s: Downloading webpage' % video_id
    self._downloader.to_stdout(line)
def report_extraction(self, video_id):
    """Tell the user the video information is being extracted."""
    line = u'[metacafe] %s: Extracting information' % video_id
    self._downloader.to_stdout(line)
981 def _real_initialize(self
):
982 # Retrieve disclaimer
983 request
= urllib2
.Request(self
._DISCLAIMER
, None, std_headers
)
985 self
.report_disclaimer()
986 disclaimer
= urllib2
.urlopen(request
).read()
987 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
988 self
._downloader
.trouble(u
'ERROR: unable to retrieve disclaimer: %s' % str(err
))
994 'submit': "Continue - I'm over 18",
996 request
= urllib2
.Request(self
._FILTER
_POST
, urllib
.urlencode(disclaimer_form
), std_headers
)
998 self
.report_age_confirmation()
999 disclaimer
= urllib2
.urlopen(request
).read()
1000 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1001 self
._downloader
.trouble(u
'ERROR: unable to confirm age: %s' % str(err
))
1004 def _real_extract(self
, url
):
1005 # Extract id and simplified title from URL
1006 mobj
= re
.match(self
._VALID
_URL
, url
)
1008 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
1011 video_id
= mobj
.group(1)
1013 # Check if video comes from YouTube
1014 mobj2
= re
.match(r
'^yt-(.*)$', video_id
)
1015 if mobj2
is not None:
1016 self
._youtube
_ie
.extract('http://www.youtube.com/watch?v=%s' % mobj2
.group(1))
1019 # At this point we have a new video
1020 self
._downloader
.increment_downloads()
1022 simple_title
= mobj
.group(2).decode('utf-8')
1023 video_extension
= 'flv'
1025 # Retrieve video webpage to extract further information
1026 request
= urllib2
.Request('http://www.metacafe.com/watch/%s/' % video_id
)
1028 self
.report_download_webpage(video_id
)
1029 webpage
= urllib2
.urlopen(request
).read()
1030 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1031 self
._downloader
.trouble(u
'ERROR: unable retrieve video webpage: %s' % str(err
))
1034 # Extract URL, uploader and title from webpage
1035 self
.report_extraction(video_id
)
1036 mobj
= re
.search(r
'(?m)&mediaURL=([^&]+)', webpage
)
1038 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
1040 mediaURL
= urllib
.unquote(mobj
.group(1))
1042 # Extract gdaKey if available
1043 mobj
= re
.search(r
'(?m)&gdaKey=(.*?)&', webpage
)
1045 video_url
= mediaURL
1046 #self._downloader.trouble(u'ERROR: unable to extract gdaKey')
1049 gdaKey
= mobj
.group(1)
1050 video_url
= '%s?__gda__=%s' % (mediaURL
, gdaKey
)
1052 mobj
= re
.search(r
'(?im)<title>(.*) - Video</title>', webpage
)
1054 self
._downloader
.trouble(u
'ERROR: unable to extract title')
1056 video_title
= mobj
.group(1).decode('utf-8')
1057 video_title
= sanitize_title(video_title
)
1059 mobj
= re
.search(r
'(?ms)By:\s*<a .*?>(.+?)<', webpage
)
1061 self
._downloader
.trouble(u
'ERROR: unable to extract uploader nickname')
1063 video_uploader
= mobj
.group(1)
1066 # Process video information
1067 self
._downloader
.process_info({
1068 'id': video_id
.decode('utf-8'),
1069 'url': video_url
.decode('utf-8'),
1070 'uploader': video_uploader
.decode('utf-8'),
1071 'title': video_title
,
1072 'stitle': simple_title
,
1073 'ext': video_extension
.decode('utf-8'),
1077 except UnavailableVideoError
:
1078 self
._downloader
.trouble(u
'ERROR: unable to download video')
1081 class DailymotionIE(InfoExtractor
):
1082 """Information Extractor for Dailymotion"""
1084 _VALID_URL
= r
'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1086 def __init__(self
, downloader
=None):
1087 InfoExtractor
.__init
__(self
, downloader
)
1091 return (re
.match(DailymotionIE
._VALID
_URL
, url
) is not None)
1093 def report_download_webpage(self
, video_id
):
1094 """Report webpage download."""
1095 self
._downloader
.to_stdout(u
'[dailymotion] %s: Downloading webpage' % video_id
)
1097 def report_extraction(self
, video_id
):
1098 """Report information extraction."""
1099 self
._downloader
.to_stdout(u
'[dailymotion] %s: Extracting information' % video_id
)
1101 def _real_initialize(self
):
1104 def _real_extract(self
, url
):
1105 # Extract id and simplified title from URL
1106 mobj
= re
.match(self
._VALID
_URL
, url
)
1108 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
1111 # At this point we have a new video
1112 self
._downloader
.increment_downloads()
1113 video_id
= mobj
.group(1)
1115 simple_title
= mobj
.group(2).decode('utf-8')
1116 video_extension
= 'flv'
1118 # Retrieve video webpage to extract further information
1119 request
= urllib2
.Request(url
)
1121 self
.report_download_webpage(video_id
)
1122 webpage
= urllib2
.urlopen(request
).read()
1123 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1124 self
._downloader
.trouble(u
'ERROR: unable retrieve video webpage: %s' % str(err
))
1127 # Extract URL, uploader and title from webpage
1128 self
.report_extraction(video_id
)
1129 mobj
= re
.search(r
'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage
)
1131 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
1133 mediaURL
= urllib
.unquote(mobj
.group(1))
1135 # if needed add http://www.dailymotion.com/ if relative URL
1137 video_url
= mediaURL
1139 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1140 mobj
= re
.search(r
'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage
)
1142 self
._downloader
.trouble(u
'ERROR: unable to extract title')
1144 video_title
= mobj
.group(1).decode('utf-8')
1145 video_title
= sanitize_title(video_title
)
1147 mobj
= re
.search(r
'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a></div>', webpage
)
1149 self
._downloader
.trouble(u
'ERROR: unable to extract uploader nickname')
1151 video_uploader
= mobj
.group(1)
1154 # Process video information
1155 self
._downloader
.process_info({
1156 'id': video_id
.decode('utf-8'),
1157 'url': video_url
.decode('utf-8'),
1158 'uploader': video_uploader
.decode('utf-8'),
1159 'title': video_title
,
1160 'stitle': simple_title
,
1161 'ext': video_extension
.decode('utf-8'),
1165 except UnavailableVideoError
:
1166 self
._downloader
.trouble(u
'ERROR: unable to download video')
1168 class GoogleIE(InfoExtractor
):
1169 """Information extractor for video.google.com."""
1171 _VALID_URL
= r
'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1173 def __init__(self
, downloader
=None):
1174 InfoExtractor
.__init
__(self
, downloader
)
1178 return (re
.match(GoogleIE
._VALID
_URL
, url
) is not None)
1180 def report_download_webpage(self
, video_id
):
1181 """Report webpage download."""
1182 self
._downloader
.to_stdout(u
'[video.google] %s: Downloading webpage' % video_id
)
1184 def report_extraction(self
, video_id
):
1185 """Report information extraction."""
1186 self
._downloader
.to_stdout(u
'[video.google] %s: Extracting information' % video_id
)
1188 def _real_initialize(self
):
1191 def _real_extract(self
, url
):
1192 # Extract id from URL
1193 mobj
= re
.match(self
._VALID
_URL
, url
)
1195 self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
)
1198 # At this point we have a new video
1199 self
._downloader
.increment_downloads()
1200 video_id
= mobj
.group(1)
1202 video_extension
= 'mp4'
1204 # Retrieve video webpage to extract further information
1205 request
= urllib2
.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id
)
1207 self
.report_download_webpage(video_id
)
1208 webpage
= urllib2
.urlopen(request
).read()
1209 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1210 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1213 # Extract URL, uploader, and title from webpage
1214 self
.report_extraction(video_id
)
1215 mobj
= re
.search(r
"download_url:'([^']+)'", webpage
)
1217 video_extension
= 'flv'
1218 mobj
= re
.search(r
"(?i)videoUrl\\x3d(.+?)\\x26", webpage
)
1220 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
1222 mediaURL
= urllib
.unquote(mobj
.group(1))
1223 mediaURL
= mediaURL
.replace('\\x3d', '\x3d')
1224 mediaURL
= mediaURL
.replace('\\x26', '\x26')
1226 video_url
= mediaURL
1228 mobj
= re
.search(r
'<title>(.*)</title>', webpage
)
1230 self
._downloader
.trouble(u
'ERROR: unable to extract title')
1232 video_title
= mobj
.group(1).decode('utf-8')
1233 video_title
= sanitize_title(video_title
)
1234 simple_title
= re
.sub(ur
'(?u)([^%s]+)' % simple_title_chars
, ur
'_', video_title
)
1236 # Extract video description
1237 mobj
= re
.search(r
'<span id=short-desc-content>([^<]*)</span>', webpage
)
1239 self
._downloader
.trouble(u
'ERROR: unable to extract video description')
1241 video_description
= mobj
.group(1).decode('utf-8')
1242 if not video_description
:
1243 video_description
= 'No description available.'
1245 # Extract video thumbnail
1246 if self
._downloader
.params
.get('forcethumbnail', False):
1247 request
= urllib2
.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id
)))
1249 webpage
= urllib2
.urlopen(request
).read()
1250 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1251 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1253 mobj
= re
.search(r
'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage
)
1255 self
._downloader
.trouble(u
'ERROR: unable to extract video thumbnail')
1257 video_thumbnail
= mobj
.group(1)
1258 else: # we need something to pass to process_info
1259 video_thumbnail
= ''
1263 # Process video information
1264 self
._downloader
.process_info({
1265 'id': video_id
.decode('utf-8'),
1266 'url': video_url
.decode('utf-8'),
1268 'title': video_title
,
1269 'stitle': simple_title
,
1270 'ext': video_extension
.decode('utf-8'),
1274 except UnavailableVideoError
:
1275 self
._downloader
.trouble(u
'ERROR: unable to download video')
1278 class PhotobucketIE(InfoExtractor
):
1279 """Information extractor for photobucket.com."""
1281 _VALID_URL
= r
'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1283 def __init__(self
, downloader
=None):
1284 InfoExtractor
.__init
__(self
, downloader
)
1288 return (re
.match(PhotobucketIE
._VALID
_URL
, url
) is not None)
1290 def report_download_webpage(self
, video_id
):
1291 """Report webpage download."""
1292 self
._downloader
.to_stdout(u
'[photobucket] %s: Downloading webpage' % video_id
)
1294 def report_extraction(self
, video_id
):
1295 """Report information extraction."""
1296 self
._downloader
.to_stdout(u
'[photobucket] %s: Extracting information' % video_id
)
1298 def _real_initialize(self
):
1301 def _real_extract(self
, url
):
1302 # Extract id from URL
1303 mobj
= re
.match(self
._VALID
_URL
, url
)
1305 self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
)
1308 # At this point we have a new video
1309 self
._downloader
.increment_downloads()
1310 video_id
= mobj
.group(1)
1312 video_extension
= 'flv'
1314 # Retrieve video webpage to extract further information
1315 request
= urllib2
.Request(url
)
1317 self
.report_download_webpage(video_id
)
1318 webpage
= urllib2
.urlopen(request
).read()
1319 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1320 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1323 # Extract URL, uploader, and title from webpage
1324 self
.report_extraction(video_id
)
1325 mobj
= re
.search(r
'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage
)
1327 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
1329 mediaURL
= urllib
.unquote(mobj
.group(1))
1331 video_url
= mediaURL
1333 mobj
= re
.search(r
'<title>(.*) video by (.*) - Photobucket</title>', webpage
)
1335 self
._downloader
.trouble(u
'ERROR: unable to extract title')
1337 video_title
= mobj
.group(1).decode('utf-8')
1338 video_title
= sanitize_title(video_title
)
1339 simple_title
= re
.sub(ur
'(?u)([^%s]+)' % simple_title_chars
, ur
'_', video_title
)
1341 video_uploader
= mobj
.group(2).decode('utf-8')
1344 # Process video information
1345 self
._downloader
.process_info({
1346 'id': video_id
.decode('utf-8'),
1347 'url': video_url
.decode('utf-8'),
1348 'uploader': video_uploader
,
1349 'title': video_title
,
1350 'stitle': simple_title
,
1351 'ext': video_extension
.decode('utf-8'),
1355 except UnavailableVideoError
:
1356 self
._downloader
.trouble(u
'ERROR: unable to download video')
1359 class YahooIE(InfoExtractor
):
1360 """Information extractor for video.yahoo.com."""
1362 # _VALID_URL matches all Yahoo! Video URLs
1363 # _VPAGE_URL matches only the extractable '/watch/' URLs
1364 _VALID_URL
= r
'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1365 _VPAGE_URL
= r
'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1367 def __init__(self
, downloader
=None):
1368 InfoExtractor
.__init
__(self
, downloader
)
1372 return (re
.match(YahooIE
._VALID
_URL
, url
) is not None)
1374 def report_download_webpage(self
, video_id
):
1375 """Report webpage download."""
1376 self
._downloader
.to_stdout(u
'[video.yahoo] %s: Downloading webpage' % video_id
)
1378 def report_extraction(self
, video_id
):
1379 """Report information extraction."""
1380 self
._downloader
.to_stdout(u
'[video.yahoo] %s: Extracting information' % video_id
)
1382 def _real_initialize(self
):
1385 def _real_extract(self
, url
, new_video
=True):
1386 # Extract ID from URL
1387 mobj
= re
.match(self
._VALID
_URL
, url
)
1389 self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
)
1392 # At this point we have a new video
1393 self
._downloader
.increment_downloads()
1394 video_id
= mobj
.group(2)
1395 video_extension
= 'flv'
1397 # Rewrite valid but non-extractable URLs as
1398 # extractable English language /watch/ URLs
1399 if re
.match(self
._VPAGE
_URL
, url
) is None:
1400 request
= urllib2
.Request(url
)
1402 webpage
= urllib2
.urlopen(request
).read()
1403 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1404 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1407 mobj
= re
.search(r
'\("id", "([0-9]+)"\);', webpage
)
1409 self
._downloader
.trouble(u
'ERROR: Unable to extract id field')
1411 yahoo_id
= mobj
.group(1)
1413 mobj
= re
.search(r
'\("vid", "([0-9]+)"\);', webpage
)
1415 self
._downloader
.trouble(u
'ERROR: Unable to extract vid field')
1417 yahoo_vid
= mobj
.group(1)
1419 url
= 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid
, yahoo_id
)
1420 return self
._real
_extract
(url
, new_video
=False)
1422 # Retrieve video webpage to extract further information
1423 request
= urllib2
.Request(url
)
1425 self
.report_download_webpage(video_id
)
1426 webpage
= urllib2
.urlopen(request
).read()
1427 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1428 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1431 # Extract uploader and title from webpage
1432 self
.report_extraction(video_id
)
1433 mobj
= re
.search(r
'<meta name="title" content="(.*)" />', webpage
)
1435 self
._downloader
.trouble(u
'ERROR: unable to extract video title')
1437 video_title
= mobj
.group(1).decode('utf-8')
1438 simple_title
= re
.sub(ur
'(?u)([^%s]+)' % simple_title_chars
, ur
'_', video_title
)
1440 mobj
= re
.search(r
'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage
)
1442 self
._downloader
.trouble(u
'ERROR: unable to extract video uploader')
1444 video_uploader
= mobj
.group(1).decode('utf-8')
1446 # Extract video thumbnail
1447 mobj
= re
.search(r
'<link rel="image_src" href="(.*)" />', webpage
)
1449 self
._downloader
.trouble(u
'ERROR: unable to extract video thumbnail')
1451 video_thumbnail
= mobj
.group(1).decode('utf-8')
1453 # Extract video description
1454 mobj
= re
.search(r
'<meta name="description" content="(.*)" />', webpage
)
1456 self
._downloader
.trouble(u
'ERROR: unable to extract video description')
1458 video_description
= mobj
.group(1).decode('utf-8')
1459 if not video_description
: video_description
= 'No description available.'
1461 # Extract video height and width
1462 mobj
= re
.search(r
'<meta name="video_height" content="([0-9]+)" />', webpage
)
1464 self
._downloader
.trouble(u
'ERROR: unable to extract video height')
1466 yv_video_height
= mobj
.group(1)
1468 mobj
= re
.search(r
'<meta name="video_width" content="([0-9]+)" />', webpage
)
1470 self
._downloader
.trouble(u
'ERROR: unable to extract video width')
1472 yv_video_width
= mobj
.group(1)
1474 # Retrieve video playlist to extract media URL
1475 # I'm not completely sure what all these options are, but we
1476 # seem to need most of them, otherwise the server sends a 401.
1477 yv_lg
= 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1478 yv_bitrate
= '700' # according to Wikipedia this is hard-coded
1479 request
= urllib2
.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id
+
1480 '&tech=flash&mode=playlist&lg=' + yv_lg
+ '&bitrate=' + yv_bitrate
+ '&vidH=' + yv_video_height
+
1481 '&vidW=' + yv_video_width
+ '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1483 self
.report_download_webpage(video_id
)
1484 webpage
= urllib2
.urlopen(request
).read()
1485 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1486 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1489 # Extract media URL from playlist XML
1490 mobj
= re
.search(r
'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage
)
1492 self
._downloader
.trouble(u
'ERROR: Unable to extract media URL')
1494 video_url
= urllib
.unquote(mobj
.group(1) + mobj
.group(2)).decode('utf-8')
1495 video_url
= re
.sub(r
'(?u)&(.+?);', htmlentity_transform
, video_url
)
1498 # Process video information
1499 self
._downloader
.process_info({
1500 'id': video_id
.decode('utf-8'),
1502 'uploader': video_uploader
,
1503 'title': video_title
,
1504 'stitle': simple_title
,
1505 'ext': video_extension
.decode('utf-8'),
1506 'thumbnail': video_thumbnail
.decode('utf-8'),
1507 'description': video_description
,
1508 'thumbnail': video_thumbnail
,
1509 'description': video_description
,
1512 except UnavailableVideoError
:
1513 self
._downloader
.trouble(u
'ERROR: unable to download video')
1516 class GenericIE(InfoExtractor
):
1517 """Generic last-resort information extractor."""
1519 def __init__(self
, downloader
=None):
1520 InfoExtractor
.__init
__(self
, downloader
)
1526 def report_download_webpage(self
, video_id
):
1527 """Report webpage download."""
1528 self
._downloader
.to_stdout(u
'WARNING: Falling back on generic information extractor.')
1529 self
._downloader
.to_stdout(u
'[generic] %s: Downloading webpage' % video_id
)
1531 def report_extraction(self
, video_id
):
1532 """Report information extraction."""
1533 self
._downloader
.to_stdout(u
'[generic] %s: Extracting information' % video_id
)
1535 def _real_initialize(self
):
1538 def _real_extract(self
, url
):
1539 # At this point we have a new video
1540 self
._downloader
.increment_downloads()
1542 video_id
= url
.split('/')[-1]
1543 request
= urllib2
.Request(url
)
1545 self
.report_download_webpage(video_id
)
1546 webpage
= urllib2
.urlopen(request
).read()
1547 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1548 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1550 except ValueError, err
:
1551 # since this is the last-resort InfoExtractor, if
1552 # this error is thrown, it'll be thrown here
1553 self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
)
1556 # Start with something easy: JW Player in SWFObject
1557 mobj
= re
.search(r
'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1559 # Broaden the search a little bit
1560 mobj = re.search(r'[^A
-Za
-z0
-9]?
(?
:file|source
)=(http
[^
\'"&]*)', webpage)
1562 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1565 # It's possible that one of the regexes
1566 # matched, but returned an empty group:
1567 if mobj.group(1) is None:
1568 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1571 video_url = urllib.unquote(mobj.group(1))
1572 video_id = os.path.basename(video_url)
1574 # here's a fun little line of code for you:
1575 video_extension = os.path.splitext(video_id)[1][1:]
1576 video_id = os.path.splitext(video_id)[0]
1578 # it's tempting to parse this further, but you would
1579 # have to take into account all the variations like
1580 # Video Title - Site Name
1581 # Site Name | Video Title
1582 # Video Title - Tagline | Site Name
1583 # and so on and so forth; it's just not practical
1584 mobj = re.search(r'<title>(.*)</title>', webpage)
1586 self._downloader.trouble(u'ERROR: unable to extract title')
1588 video_title = mobj.group(1).decode('utf-8')
1589 video_title = sanitize_title(video_title)
1590 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1592 # video uploader is domain name
1593 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1595 self._downloader.trouble(u'ERROR: unable to extract title')
1597 video_uploader = mobj.group(1).decode('utf-8')
1600 # Process video information
1601 self._downloader.process_info({
1602 'id': video_id.decode('utf-8'),
1603 'url': video_url.decode('utf-8'),
1604 'uploader': video_uploader,
1605 'title': video_title,
1606 'stitle': simple_title,
1607 'ext': video_extension.decode('utf-8'),
1611 except UnavailableVideoError, err:
1612 self._downloader.trouble(u'ERROR: unable to download video')
1615 class YoutubeSearchIE(InfoExtractor):
1616 """Information Extractor for YouTube search queries."""
1617 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1618 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1619 _VIDEO_INDICATOR = r'href="/watch
\?v
=.+?
"'
1620 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1622 _max_youtube_results = 1000
1624 def __init__(self, youtube_ie, downloader=None):
1625 InfoExtractor.__init__(self, downloader)
1626 self._youtube_ie = youtube_ie
1630 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1632 def report_download_page(self, query, pagenum):
1633 """Report attempt to download playlist page with given number."""
1634 query = query.decode(preferredencoding())
1635 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1637 def _real_initialize(self):
1638 self._youtube_ie.initialize()
1640 def _real_extract(self, query):
1641 mobj = re.match(self._VALID_QUERY, query)
1643 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1646 prefix, query = query.split(':')
1648 query = query.encode('utf-8')
1650 self._download_n_results(query, 1)
1652 elif prefix == 'all':
1653 self._download_n_results(query, self._max_youtube_results)
1659 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1661 elif n > self._max_youtube_results:
1662 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1663 n = self._max_youtube_results
1664 self._download_n_results(query, n)
1666 except ValueError: # parsing prefix as integer fails
1667 self._download_n_results(query, 1)
1670 def _download_n_results(self, query, n):
1671 """Downloads a specified number of results for a query"""
1674 already_seen = set()
1678 self.report_download_page(query, pagenum)
1679 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1680 request = urllib2.Request(result_url, None, std_headers)
1682 page = urllib2.urlopen(request).read()
1683 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1684 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1687 # Extract video identifiers
1688 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1689 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1690 if video_id not in already_seen:
1691 video_ids.append(video_id)
1692 already_seen.add(video_id)
1693 if len(video_ids) == n:
1694 # Specified n videos reached
1695 for id in video_ids:
1696 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1699 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1700 for id in video_ids:
1701 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1704 pagenum = pagenum + 1
1706 class GoogleSearchIE(InfoExtractor):
1707 """Information Extractor for Google Video search queries."""
1708 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1709 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1710 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1711 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1713 _max_google_results = 1000
1715 def __init__(self, google_ie, downloader=None):
1716 InfoExtractor.__init__(self, downloader)
1717 self._google_ie = google_ie
1721 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1723 def report_download_page(self, query, pagenum):
1724 """Report attempt to download playlist page with given number."""
1725 query = query.decode(preferredencoding())
1726 self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1728 def _real_initialize(self):
1729 self._google_ie.initialize()
1731 def _real_extract(self, query):
1732 mobj = re.match(self._VALID_QUERY, query)
1734 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1737 prefix, query = query.split(':')
1739 query = query.encode('utf-8')
1741 self._download_n_results(query, 1)
1743 elif prefix == 'all':
1744 self._download_n_results(query, self._max_google_results)
1750 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1752 elif n > self._max_google_results:
1753 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1754 n = self._max_google_results
1755 self._download_n_results(query, n)
1757 except ValueError: # parsing prefix as integer fails
1758 self._download_n_results(query, 1)
1761 def _download_n_results(self, query, n):
1762 """Downloads a specified number of results for a query"""
1765 already_seen = set()
1769 self.report_download_page(query, pagenum)
1770 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1771 request = urllib2.Request(result_url, None, std_headers)
1773 page = urllib2.urlopen(request).read()
1774 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1775 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1778 # Extract video identifiers
1779 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1780 video_id = mobj.group(1)
1781 if video_id not in already_seen:
1782 video_ids.append(video_id)
1783 already_seen.add(video_id)
1784 if len(video_ids) == n:
1785 # Specified n videos reached
1786 for id in video_ids:
1787 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1790 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1791 for id in video_ids:
1792 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1795 pagenum = pagenum + 1
1797 class YahooSearchIE(InfoExtractor):
1798 """Information Extractor for Yahoo! Video search queries."""
1799 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1800 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1801 _VIDEO_INDICATOR = r'href="http
://video\
.yahoo\
.com
/watch
/([0-9]+/[0-9]+)"'
1802 _MORE_PAGES_INDICATOR = r'\s*Next'
1804 _max_yahoo_results = 1000
1806 def __init__(self, yahoo_ie, downloader=None):
1807 InfoExtractor.__init__(self, downloader)
1808 self._yahoo_ie = yahoo_ie
1812 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1814 def report_download_page(self, query, pagenum):
1815 """Report attempt to download playlist page with given number."""
1816 query = query.decode(preferredencoding())
1817 self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1819 def _real_initialize(self):
1820 self._yahoo_ie.initialize()
1822 def _real_extract(self, query):
1823 mobj = re.match(self._VALID_QUERY, query)
1825 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1828 prefix, query = query.split(':')
1830 query = query.encode('utf-8')
1832 self._download_n_results(query, 1)
1834 elif prefix == 'all':
1835 self._download_n_results(query, self._max_yahoo_results)
1841 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1843 elif n > self._max_yahoo_results:
1844 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1845 n = self._max_yahoo_results
1846 self._download_n_results(query, n)
1848 except ValueError: # parsing prefix as integer fails
1849 self._download_n_results(query, 1)
1852 def _download_n_results(self, query, n):
1853 """Downloads a specified number of results for a query"""
1856 already_seen = set()
1860 self.report_download_page(query, pagenum)
1861 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1862 request = urllib2.Request(result_url, None, std_headers)
1864 page = urllib2.urlopen(request).read()
1865 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1866 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1869 # Extract video identifiers
1870 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1871 video_id = mobj.group(1)
1872 if video_id not in already_seen:
1873 video_ids.append(video_id)
1874 already_seen.add(video_id)
1875 if len(video_ids) == n:
1876 # Specified n videos reached
1877 for id in video_ids:
1878 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1881 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1882 for id in video_ids:
1883 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1886 pagenum = pagenum + 1
1888 class YoutubePlaylistIE(InfoExtractor):
1889 """Information Extractor for YouTube playlists."""
1891 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1892 _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1893 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1894 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1897 def __init__(self, youtube_ie, downloader=None):
1898 InfoExtractor.__init__(self, downloader)
1899 self._youtube_ie = youtube_ie
1903 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1905 def report_download_page(self, playlist_id, pagenum):
1906 """Report attempt to download playlist page with given number."""
1907 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1909 def _real_initialize(self):
1910 self._youtube_ie.initialize()
1912 def _real_extract(self, url):
1913 # Extract playlist id
1914 mobj = re.match(self._VALID_URL, url)
1916 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1919 # Download playlist pages
1920 playlist_id = mobj.group(1)
1925 self.report_download_page(playlist_id, pagenum)
1926 request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1928 page = urllib2.urlopen(request).read()
1929 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1930 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1933 # Extract video identifiers
1935 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1936 if mobj.group(1) not in ids_in_page:
1937 ids_in_page.append(mobj.group(1))
1938 video_ids.extend(ids_in_page)
1940 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1942 pagenum = pagenum + 1
1944 playliststart = self._downloader.params.get('playliststart', 1)
1945 playliststart -= 1 #our arrays are zero-based but the playlist is 1-based
1946 if playliststart > 0:
1947 video_ids = video_ids[playliststart:]
1949 for id in video_ids:
1950 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    The user's feed is downloaded from _TEMPLATE_URL, every video id found
    in it is collected once, and each id is passed as a watch URL to the
    YoutubeIE object given to the constructor.
    """

    # Only letters, digits, '_' and '-' are captured as the user name, so a
    # trailing path or query string no longer ends up in the feed URL.
    _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube\.com/user/([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # Capture the video id only. The previous '(.*)' swallowed the rest of
    # the line, so identical videos were never recognised as duplicates.
    _VIDEO_INDICATOR = r'http://gdata\.youtube\.com/feeds/api/videos/([0-9A-Za-z_-]+)'
    _youtube_ie = None

    def __init__(self, youtube_ie, downloader=None):
        """Store youtube_ie and forward downloader to InfoExtractor.__init__()."""
        InfoExtractor.__init__(self, downloader)
        self._youtube_ie = youtube_ie

    @staticmethod
    def suitable(url):
        """Return True when url matches YoutubeUserIE._VALID_URL."""
        return (re.match(YoutubeUserIE._VALID_URL, url) is not None)

    def report_download_page(self, username):
        """Report attempt to download user page."""
        self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))

    def _real_initialize(self):
        """Initialize the wrapped extractor stored in self._youtube_ie."""
        self._youtube_ie.initialize()

    def _real_extract(self, url):
        """Extract every video listed in the feed of the user named in url.

        Failures (URL that does not match, feed that cannot be downloaded)
        are reported through the downloader's trouble() method and the
        method then returns without extracting anything.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download user page
        username = mobj.group(1)
        self.report_download_page(username)
        request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
        try:
            page = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error):
            # The exception object is read from sys.exc_info() so that the
            # except clause parses on every Python version.
            err = sys.exc_info()[1]
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
            return

        # Extract video identifiers, keeping the first occurrence of each
        video_ids = []
        for id_match in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = id_match.group(1)
            if video_id not in video_ids:
                video_ids.append(video_id)

        playliststart = self._downloader.params.get('playliststart', 1)
        playliststart = playliststart - 1  # our arrays are zero-based but the playlist is 1-based
        if playliststart > 0:
            video_ids = video_ids[playliststart:]

        for video_id in video_ids:
            self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
class PostProcessor(object):
    """Base class for post processors.

    A downloader accepts PostProcessor objects through its
    add_post_processor() method. After a successful download it walks its
    internal chain of PostProcessors and calls run() on each of them: the
    first one receives an initial argument, every later one receives the
    value returned by the PostProcessor before it.

    The chain stops as soon as one of them returns None or when the end of
    the chain is reached.

    PostProcessor objects follow a "mutual registration" process similar
    to InfoExtractor objects.
    """

    _downloader = None

    def __init__(self, downloader=None):
        """Remember the downloader this post processor belongs to, if any."""
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Sets the downloader for this PP."""
        self._downloader = downloader

    def run(self, information):
        """Run the PostProcessor.

        "information" is a dictionary like the ones composed by
        InfoExtractors, with one extra field called "filepath" that points
        to the downloaded file.

        Returning None stops the postprocessing chain. Returning an
        information dictionary (possibly the received one after changing
        some fields) passes that dictionary on to the next postprocessing
        object in the chain.

        This method may also raise a PostProcessingError exception, which
        is taken into account by the downloader it was called from.
        """
        # by default, do nothing
        return information
### MAIN PROGRAM ###
if __name__ == '__main__':
    try:
        # Modules needed only when running the main program
        import getpass
        import optparse

        # Function to update the program file with the latest version from bitbucket.org
        def update_self(downloader, filename):
            """Overwrite filename with the latest published version of the program.

            downloader is only used to print progress messages. The program
            exits with an error message when filename is not writable.
            """
            if not os.access(filename, os.W_OK):
                sys.exit('ERROR: no write permissions on %s' % filename)

            downloader.to_stdout('Updating to latest stable version...')
            latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
            latest_version = urllib.urlopen(latest_url).read().strip()
            prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
            newcontent = urllib.urlopen(prog_url).read()
            # Binary mode writes the downloaded bytes untouched; the finally
            # clause closes the file even when the write fails.
            stream = open(filename, 'wb')
            try:
                stream.write(newcontent)
            finally:
                stream.close()
            downloader.to_stdout('Updated to version %s' % latest_version)

        # General configuration
        # One opener carries both handlers: installing two openers in a row
        # would leave only the last one active.
        urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))
        socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

        # Parse command line
        parser = optparse.OptionParser(
            usage='Usage: %prog [options] url...',
            version='2010.08.04',
            conflict_handler='resolve',
        )

        parser.add_option('-h', '--help',
                action='help', help='print this help text and exit')
        parser.add_option('-v', '--version',
                action='version', help='print program version and exit')
        parser.add_option('-U', '--update',
                action='store_true', dest='update_self', help='update this program to latest stable version')
        parser.add_option('-i', '--ignore-errors',
                action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
        parser.add_option('-r', '--rate-limit',
                dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
        parser.add_option('-R', '--retries',
                dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
        parser.add_option('--playlist-start',
                dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)

        authentication = optparse.OptionGroup(parser, 'Authentication Options')
        authentication.add_option('-u', '--username',
                dest='username', metavar='USERNAME', help='account username')
        authentication.add_option('-p', '--password',
                dest='password', metavar='PASSWORD', help='account password')
        authentication.add_option('-n', '--netrc',
                action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
        parser.add_option_group(authentication)

        video_format = optparse.OptionGroup(parser, 'Video Format Options')
        video_format.add_option('-f', '--format',
                action='store', dest='format', metavar='FORMAT', help='video format code')
        video_format.add_option('-m', '--mobile-version',
                action='store_const', dest='format', help='alias for -f 17', const='17')
        video_format.add_option('--all-formats',
                action='store_const', dest='format', help='download all available video formats', const='-1')
        video_format.add_option('--max-quality',
                action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
        video_format.add_option('-b', '--best-quality',
                action='store_true', dest='bestquality', help='download the best video quality (DEPRECATED)')
        parser.add_option_group(video_format)

        verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
        verbosity.add_option('-q', '--quiet',
                action='store_true', dest='quiet', help='activates quiet mode', default=False)
        verbosity.add_option('-s', '--simulate',
                action='store_true', dest='simulate', help='do not download video', default=False)
        verbosity.add_option('-g', '--get-url',
                action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
        verbosity.add_option('-e', '--get-title',
                action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
        verbosity.add_option('--get-thumbnail',
                action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
        verbosity.add_option('--get-description',
                action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
        verbosity.add_option('--no-progress',
                action='store_true', dest='noprogress', help='do not print progress bar', default=False)
        parser.add_option_group(verbosity)

        filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
        filesystem.add_option('-t', '--title',
                action='store_true', dest='usetitle', help='use title in file name', default=False)
        filesystem.add_option('-l', '--literal',
                action='store_true', dest='useliteral', help='use literal title in file name', default=False)
        filesystem.add_option('-o', '--output',
                dest='outtmpl', metavar='TEMPLATE', help='output filename template')
        filesystem.add_option('-a', '--batch-file',
                dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
        filesystem.add_option('-w', '--no-overwrites',
                action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
        filesystem.add_option('-c', '--continue',
                action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
        parser.add_option_group(filesystem)

        (opts, args) = parser.parse_args()

        # Batch file verification
        batchurls = []
        if opts.batchfile is not None:
            try:
                if opts.batchfile == '-':
                    batchfd = sys.stdin
                else:
                    batchfd = open(opts.batchfile, 'r')
                try:
                    batchurls = batchfd.readlines()
                finally:
                    # Close the file we opened ourselves; stdin is left alone
                    if batchfd is not sys.stdin:
                        batchfd.close()
                batchurls = [x.strip() for x in batchurls]
                batchurls = [x for x in batchurls if len(x) > 0]
            except IOError:
                sys.exit(u'ERROR: batch file could not be read')
        all_urls = batchurls + args

        # Conflicting, missing and erroneous options
        if opts.bestquality:
            print >>sys.stderr, u'\nWARNING: -b/--best-quality IS DEPRECATED AS IT IS THE DEFAULT BEHAVIOR NOW\n'
        if opts.usenetrc and (opts.username is not None or opts.password is not None):
            parser.error(u'using .netrc conflicts with giving username/password')
        if opts.password is not None and opts.username is None:
            parser.error(u'account username missing')
        if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
            parser.error(u'using output template conflicts with using title or literal title')
        if opts.usetitle and opts.useliteral:
            parser.error(u'using title conflicts with using literal title')
        if opts.username is not None and opts.password is None:
            opts.password = getpass.getpass(u'Type account password and press return:')
        if opts.ratelimit is not None:
            numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
            if numeric_limit is None:
                parser.error(u'invalid rate limit specified')
            opts.ratelimit = numeric_limit
        if opts.retries is not None:
            try:
                opts.retries = long(opts.retries)
            except (TypeError, ValueError):
                parser.error(u'invalid retry count specified')
        if opts.playliststart is not None:
            try:
                opts.playliststart = long(opts.playliststart)
            except (TypeError, ValueError):
                parser.error(u'invalid playlist page specified')

        # Information extractors
        youtube_ie = YoutubeIE()
        metacafe_ie = MetacafeIE(youtube_ie)
        dailymotion_ie = DailymotionIE()
        youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
        youtube_user_ie = YoutubeUserIE(youtube_ie)
        youtube_search_ie = YoutubeSearchIE(youtube_ie)
        google_ie = GoogleIE()
        google_search_ie = GoogleSearchIE(google_ie)
        photobucket_ie = PhotobucketIE()
        yahoo_ie = YahooIE()
        yahoo_search_ie = YahooSearchIE(yahoo_ie)
        generic_ie = GenericIE()

        # File downloader
        fd = FileDownloader({
            'usenetrc': opts.usenetrc,
            'username': opts.username,
            'password': opts.password,
            'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
            'forceurl': opts.geturl,
            'forcetitle': opts.gettitle,
            'forcethumbnail': opts.getthumbnail,
            'forcedescription': opts.getdescription,
            'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
            'format': opts.format,
            'format_limit': opts.format_limit,
            # The first alternative that applies wins: an explicit template,
            # then the all-formats variants, then title/literal, then the id
            'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
                or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
                or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
                or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
                or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
                or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
                or u'%(id)s.%(ext)s'),
            'ignoreerrors': opts.ignoreerrors,
            'ratelimit': opts.ratelimit,
            'nooverwrites': opts.nooverwrites,
            'retries': opts.retries,
            'continuedl': opts.continue_dl,
            'noprogress': opts.noprogress,
            'playliststart': opts.playliststart,
            })
        fd.add_info_extractor(youtube_search_ie)
        fd.add_info_extractor(youtube_pl_ie)
        fd.add_info_extractor(youtube_user_ie)
        fd.add_info_extractor(metacafe_ie)
        fd.add_info_extractor(dailymotion_ie)
        fd.add_info_extractor(youtube_ie)
        fd.add_info_extractor(google_ie)
        fd.add_info_extractor(google_search_ie)
        fd.add_info_extractor(photobucket_ie)
        fd.add_info_extractor(yahoo_ie)
        fd.add_info_extractor(yahoo_search_ie)

        # This must come last since it's the
        # fallback if none of the others work
        fd.add_info_extractor(generic_ie)

        # Update version
        if opts.update_self:
            update_self(fd, sys.argv[0])

        # Maybe do nothing
        if len(all_urls) < 1:
            if not opts.update_self:
                parser.error(u'you must provide at least one URL')
            else:
                sys.exit()
        retcode = fd.download(all_urls)
        sys.exit(retcode)

    except DownloadError:
        sys.exit(1)
    except SameFileError:
        sys.exit(u'ERROR: fixed output name but more than one file to download')
    except KeyboardInterrupt:
        sys.exit(u'\nERROR: Interrupted by user')