]>
Raphaël G. Git Repositories - youtubedl/blob - youtube-dl
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
# parse_qs was moved from the cgi module to the urlparse module recently.
try:
	from urlparse import parse_qs
except ImportError:
	from cgi import parse_qs

# Default HTTP headers sent with every request.
std_headers = {
	'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.8) Gecko/20100723 Firefox/3.6.8',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Language': 'en-us,en;q=0.5',
}

# Characters that survive in "simplified" titles (used by the IEs to build
# filesystem-safe titles).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	# The original wrapped this in a one-shot generator and called
	# .next() on it, plus used a bare "except:"; neither is needed.
	try:
		pref = locale.getpreferredencoding()
		# Verify the codec actually exists and can encode text.
		u'TEST'.encode(pref)
	except Exception:
		pref = 'UTF-8'
	return pref
54 def htmlentity_transform(matchobj
):
55 """Transforms an HTML entity to a Unicode character.
57 This function receives a match object and is intended to be used with
58 the re.sub() function.
60 entity
= matchobj
.group(1)
62 # Known non-numeric HTML entity
63 if entity
in htmlentitydefs
.name2codepoint
:
64 return unichr(htmlentitydefs
.name2codepoint
[entity
])
67 mobj
= re
.match(ur
'(?u)#(x?\d+)', entity
)
69 numstr
= mobj
.group(1)
70 if numstr
.startswith(u
'x'):
72 numstr
= u
'0%s' % numstr
75 return unichr(long(numstr
, base
))
77 # Unknown entity in name, return its literal representation
78 return (u
'&%s;' % entity
)
80 def sanitize_title(utitle
):
81 """Sanitizes a video title so it could be used as part of a filename."""
82 utitle
= re
.sub(ur
'(?u)&(.+?);', htmlentity_transform
, utitle
)
83 return utitle
.replace(unicode(os
.sep
), u
'%')
85 def sanitize_open(filename
, open_mode
):
86 """Try to open the given filename, and slightly tweak it if this fails.
88 Attempts to open the given filename. If this fails, it tries to change
89 the filename slightly, step by step, until it's either able to open it
90 or it fails and raises a final exception, like the standard open()
93 It returns the tuple (stream, definitive_file_name).
97 return (sys
.stdout
, filename
)
98 stream
= open(filename
, open_mode
)
99 return (stream
, filename
)
100 except (IOError, OSError), err
:
101 # In case of error, try to remove win32 forbidden chars
102 filename
= re
.sub(ur
'[/<>:"\|\?\*]', u
'#', filename
)
104 # An exception here should be caught in the caller
105 stream
= open(filename
, open_mode
)
106 return (stream
, filename
)
class DownloadError(Exception):
	"""Download Error exception.

	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors. They will contain the appropriate
	error message.
	"""
	pass
class SameFileError(Exception):
	"""Same File exception.

	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk.
	"""
	pass
class PostProcessingError(Exception):
	"""Post Processing exception.

	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task.
	"""
	pass
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	This exception will be thrown when a video is requested
	in a format that is not available for that video.
	"""
	pass
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	This exception may be raised by FileDownloader objects when a file they
	download is too small for what the server announced first, indicating
	the connection was probably interrupted.
	"""
	# Both values are byte counts.
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.downloaded = downloaded
		self.expected = expected
157 class FileDownloader(object):
158 """File Downloader class.
160 File downloader objects are the ones responsible of downloading the
161 actual video file and writing it to disk if the user has requested
162 it, among some other tasks. In most cases there should be one per
163 program. As, given a video URL, the downloader doesn't know how to
164 extract all the needed information, task that InfoExtractors do, it
165 has to pass the URL to one of them.
167 For this, file downloader objects have a method that allows
168 InfoExtractors to be registered in a given order. When it is passed
169 a URL, the file downloader handles it to the first InfoExtractor it
170 finds that reports being able to handle it. The InfoExtractor extracts
171 all the information about the video or videos the URL refers to, and
172 asks the FileDownloader to process the video information, possibly
173 downloading the video.
175 File downloaders accept a lot of parameters. In order not to saturate
176 the object constructor with arguments, it receives a dictionary of
177 options instead. These options are available through the params
178 attribute for the InfoExtractors to use. The FileDownloader also
179 registers itself as the downloader in charge for the InfoExtractors
180 that are added to it, so this is a "mutual registration".
184 username: Username for authentication purposes.
185 password: Password for authentication purposes.
186 usenetrc: Use netrc for authentication instead.
187 quiet: Do not print messages to stdout.
188 forceurl: Force printing final URL.
189 forcetitle: Force printing title.
190 simulate: Do not download the video files.
191 format: Video format code.
192 format_limit: Highest quality format to try.
193 outtmpl: Template for output names.
194 ignoreerrors: Do not stop on download errors.
195 ratelimit: Download speed limit, in bytes/sec.
196 nooverwrites: Prevent overwriting files.
197 retries: Number of times to retry for HTTP error 5xx
198 continuedl: Try to continue downloads if possible.
199 noprogress: Do not print the progress bar.
205 _download_retcode
= None
206 _num_downloads
= None
208 def __init__(self
, params
):
209 """Create a FileDownloader object with the given options."""
212 self
._download
_retcode
= 0
213 self
._num
_downloads
= 0
217 def pmkdir(filename
):
218 """Create directory components in filename. Similar to Unix "mkdir -p"."""
219 components
= filename
.split(os
.sep
)
220 aggregate
= [os
.sep
.join(components
[0:x
]) for x
in xrange(1, len(components
))]
221 aggregate
= ['%s%s' % (x
, os
.sep
) for x
in aggregate
] # Finish names with separator
222 for dir in aggregate
:
223 if not os
.path
.exists(dir):
227 def format_bytes(bytes):
230 if type(bytes) is str:
235 exponent
= long(math
.log(bytes, 1024.0))
236 suffix
= 'bkMGTPEZY'[exponent
]
237 converted
= float(bytes) / float(1024**exponent
)
238 return '%.2f%s' % (converted
, suffix
)
241 def calc_percent(byte_counter
, data_len
):
244 return '%6s' % ('%3.1f%%' % (float(byte_counter
) / float(data_len
) * 100.0))
247 def calc_eta(start
, now
, total
, current
):
251 if current
== 0 or dif
< 0.001: # One millisecond
253 rate
= float(current
) / dif
254 eta
= long((float(total
) - float(current
)) / rate
)
255 (eta_mins
, eta_secs
) = divmod(eta
, 60)
258 return '%02d:%02d' % (eta_mins
, eta_secs
)
261 def calc_speed(start
, now
, bytes):
263 if bytes == 0 or dif
< 0.001: # One millisecond
264 return '%10s' % '---b/s'
265 return '%10s' % ('%s/s' % FileDownloader
.format_bytes(float(bytes) / dif
))
268 def best_block_size(elapsed_time
, bytes):
269 new_min
= max(bytes / 2.0, 1.0)
270 new_max
= min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
271 if elapsed_time
< 0.001:
273 rate
= bytes / elapsed_time
281 def parse_bytes(bytestr
):
282 """Parse a string indicating a byte quantity into a long integer."""
283 matchobj
= re
.match(r
'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr
)
286 number
= float(matchobj
.group(1))
287 multiplier
= 1024.0 ** 'bkmgtpezy'.index(matchobj
.group(2).lower())
288 return long(round(number
* multiplier
))
290 def add_info_extractor(self
, ie
):
291 """Add an InfoExtractor object to the end of the list."""
293 ie
.set_downloader(self
)
295 def add_post_processor(self
, pp
):
296 """Add a PostProcessor object to the end of the chain."""
298 pp
.set_downloader(self
)
300 def to_stdout(self
, message
, skip_eol
=False, ignore_encoding_errors
=False):
301 """Print message to stdout if not in quiet mode."""
303 if not self
.params
.get('quiet', False):
304 print (u
'%s%s' % (message
, [u
'\n', u
''][skip_eol
])).encode(preferredencoding()),
306 except (UnicodeEncodeError), err
:
307 if not ignore_encoding_errors
:
310 def to_stderr(self
, message
):
311 """Print message to stderr."""
312 print >>sys
.stderr
, message
.encode(preferredencoding())
314 def fixed_template(self
):
315 """Checks if the output template is fixed."""
316 return (re
.search(ur
'(?u)%\(.+?\)s', self
.params
['outtmpl']) is None)
318 def trouble(self
, message
=None):
319 """Determine action to take when a download problem appears.
321 Depending on if the downloader has been configured to ignore
322 download errors or not, this method may throw an exception or
323 not when errors are found, after printing the message.
325 if message
is not None:
326 self
.to_stderr(message
)
327 if not self
.params
.get('ignoreerrors', False):
328 raise DownloadError(message
)
329 self
._download
_retcode
= 1
331 def slow_down(self
, start_time
, byte_counter
):
332 """Sleep if the download speed is over the rate limit."""
333 rate_limit
= self
.params
.get('ratelimit', None)
334 if rate_limit
is None or byte_counter
== 0:
337 elapsed
= now
- start_time
340 speed
= float(byte_counter
) / elapsed
341 if speed
> rate_limit
:
342 time
.sleep((byte_counter
- rate_limit
* (now
- start_time
)) / rate_limit
)
344 def report_destination(self
, filename
):
345 """Report destination filename."""
346 self
.to_stdout(u
'[download] Destination: %s' % filename
, ignore_encoding_errors
=True)
348 def report_progress(self
, percent_str
, data_len_str
, speed_str
, eta_str
):
349 """Report download progress."""
350 if self
.params
.get('noprogress', False):
352 self
.to_stdout(u
'\r[download] %s of %s at %s ETA %s' %
353 (percent_str
, data_len_str
, speed_str
, eta_str
), skip_eol
=True)
355 def report_resuming_byte(self
, resume_len
):
356 """Report attempt to resume at given byte."""
357 self
.to_stdout(u
'[download] Resuming download at byte %s' % resume_len
)
359 def report_retry(self
, count
, retries
):
360 """Report retry in case of HTTP error 5xx"""
361 self
.to_stdout(u
'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count
, retries
))
363 def report_file_already_downloaded(self
, file_name
):
364 """Report file has already been fully downloaded."""
366 self
.to_stdout(u
'[download] %s has already been downloaded' % file_name
)
367 except (UnicodeEncodeError), err
:
368 self
.to_stdout(u
'[download] The file has already been downloaded')
370 def report_unable_to_resume(self
):
371 """Report it was impossible to resume download."""
372 self
.to_stdout(u
'[download] Unable to resume')
374 def report_finish(self
):
375 """Report download finished."""
376 if self
.params
.get('noprogress', False):
377 self
.to_stdout(u
'[download] Download completed')
381 def increment_downloads(self
):
382 """Increment the ordinal that assigns a number to each file."""
383 self
._num
_downloads
+= 1
385 def process_info(self
, info_dict
):
386 """Process a single dictionary returned by an InfoExtractor."""
387 # Do nothing else if in simulate mode
388 if self
.params
.get('simulate', False):
390 if self
.params
.get('forcetitle', False):
391 print info_dict
['title'].encode(preferredencoding(), 'xmlcharrefreplace')
392 if self
.params
.get('forceurl', False):
393 print info_dict
['url'].encode(preferredencoding(), 'xmlcharrefreplace')
394 if self
.params
.get('forcethumbnail', False) and 'thumbnail' in info_dict
:
395 print info_dict
['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
396 if self
.params
.get('forcedescription', False) and 'description' in info_dict
:
397 print info_dict
['description'].encode(preferredencoding(), 'xmlcharrefreplace')
402 template_dict
= dict(info_dict
)
403 template_dict
['epoch'] = unicode(long(time
.time()))
404 template_dict
['ord'] = unicode('%05d' % self
._num
_downloads
)
405 filename
= self
.params
['outtmpl'] % template_dict
406 except (ValueError, KeyError), err
:
407 self
.trouble(u
'ERROR: invalid system charset or erroneous output template')
409 if self
.params
.get('nooverwrites', False) and os
.path
.exists(filename
):
410 self
.to_stderr(u
'WARNING: file exists and will be skipped')
414 self
.pmkdir(filename
)
415 except (OSError, IOError), err
:
416 self
.trouble(u
'ERROR: unable to create directories: %s' % str(err
))
420 success
= self
._do
_download
(filename
, info_dict
['url'].encode('utf-8'), info_dict
.get('player_url', None))
421 except (OSError, IOError), err
:
422 raise UnavailableVideoError
423 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
424 self
.trouble(u
'ERROR: unable to download video data: %s' % str(err
))
426 except (ContentTooShortError
, ), err
:
427 self
.trouble(u
'ERROR: content too short (expected %s bytes and served %s)' % (err
.expected
, err
.downloaded
))
432 self
.post_process(filename
, info_dict
)
433 except (PostProcessingError
), err
:
434 self
.trouble(u
'ERROR: postprocessing: %s' % str(err
))
437 def download(self
, url_list
):
438 """Download a given list of URLs."""
439 if len(url_list
) > 1 and self
.fixed_template():
440 raise SameFileError(self
.params
['outtmpl'])
443 suitable_found
= False
445 # Go to next InfoExtractor if not suitable
446 if not ie
.suitable(url
):
449 # Suitable InfoExtractor found
450 suitable_found
= True
452 # Extract information from URL and process it
455 # Suitable InfoExtractor had been found; go to next URL
458 if not suitable_found
:
459 self
.trouble(u
'ERROR: no suitable InfoExtractor: %s' % url
)
461 return self
._download
_retcode
463 def post_process(self
, filename
, ie_info
):
464 """Run the postprocessing chain on the given file."""
466 info
['filepath'] = filename
472 def _download_with_rtmpdump(self
, filename
, url
, player_url
):
473 self
.report_destination(filename
)
475 # Check for rtmpdump first
477 subprocess
.call(['rtmpdump', '-h'], stdout
=(file(os
.path
.devnull
, 'w')), stderr
=subprocess
.STDOUT
)
478 except (OSError, IOError):
479 self
.trouble(u
'ERROR: RTMP download detected but "rtmpdump" could not be run')
482 # Download using rtmpdump. rtmpdump returns exit code 2 when
483 # the connection was interrumpted and resuming appears to be
484 # possible. This is part of rtmpdump's normal usage, AFAIK.
485 basic_args
= ['rtmpdump', '-q'] + [[], ['-W', player_url
]][player_url
is not None] + ['-r', url
, '-o', filename
]
486 retval
= subprocess
.call(basic_args
+ [[], ['-e', '-k', '1']][self
.params
.get('continuedl', False)])
487 while retval
== 2 or retval
== 1:
488 prevsize
= os
.path
.getsize(filename
)
489 self
.to_stdout(u
'\r[rtmpdump] %s bytes' % prevsize
, skip_eol
=True)
490 time
.sleep(5.0) # This seems to be needed
491 retval
= subprocess
.call(basic_args
+ ['-e'] + [[], ['-k', '1']][retval
== 1])
492 cursize
= os
.path
.getsize(filename
)
493 if prevsize
== cursize
and retval
== 1:
496 self
.to_stdout(u
'\r[rtmpdump] %s bytes' % os
.path
.getsize(filename
))
499 self
.trouble(u
'\nERROR: rtmpdump exited with code %d' % retval
)
502 def _do_download(self
, filename
, url
, player_url
):
503 # Attempt to download using rtmpdump
504 if url
.startswith('rtmp'):
505 return self
._download
_with
_rtmpdump
(filename
, url
, player_url
)
509 basic_request
= urllib2
.Request(url
, None, std_headers
)
510 request
= urllib2
.Request(url
, None, std_headers
)
512 # Establish possible resume length
513 if os
.path
.isfile(filename
):
514 resume_len
= os
.path
.getsize(filename
)
518 # Request parameters in case of being able to resume
519 if self
.params
.get('continuedl', False) and resume_len
!= 0:
520 self
.report_resuming_byte(resume_len
)
521 request
.add_header('Range','bytes=%d-' % resume_len
)
525 retries
= self
.params
.get('retries', 0)
526 while count
<= retries
:
527 # Establish connection
529 data
= urllib2
.urlopen(request
)
531 except (urllib2
.HTTPError
, ), err
:
532 if (err
.code
< 500 or err
.code
>= 600) and err
.code
!= 416:
533 # Unexpected HTTP error
535 elif err
.code
== 416:
536 # Unable to resume (requested range not satisfiable)
538 # Open the connection again without the range header
539 data
= urllib2
.urlopen(basic_request
)
540 content_length
= data
.info()['Content-Length']
541 except (urllib2
.HTTPError
, ), err
:
542 if err
.code
< 500 or err
.code
>= 600:
545 # Examine the reported length
546 if (content_length
is not None and
547 (resume_len
- 100 < long(content_length
) < resume_len
+ 100)):
548 # The file had already been fully downloaded.
549 # Explanation to the above condition: in issue #175 it was revealed that
550 # YouTube sometimes adds or removes a few bytes from the end of the file,
551 # changing the file size slightly and causing problems for some users. So
552 # I decided to implement a suggested change and consider the file
553 # completely downloaded if the file size differs less than 100 bytes from
554 # the one in the hard drive.
555 self
.report_file_already_downloaded(filename
)
558 # The length does not match, we start the download over
559 self
.report_unable_to_resume()
565 self
.report_retry(count
, retries
)
568 self
.trouble(u
'ERROR: giving up after %s retries' % retries
)
571 data_len
= data
.info().get('Content-length', None)
572 data_len_str
= self
.format_bytes(data_len
)
579 data_block
= data
.read(block_size
)
581 data_block_len
= len(data_block
)
582 if data_block_len
== 0:
584 byte_counter
+= data_block_len
586 # Open file just in time
589 (stream
, filename
) = sanitize_open(filename
, open_mode
)
590 self
.report_destination(filename
)
591 except (OSError, IOError), err
:
592 self
.trouble(u
'ERROR: unable to open for writing: %s' % str(err
))
595 stream
.write(data_block
)
596 except (IOError, OSError), err
:
597 self
.trouble(u
'\nERROR: unable to write data: %s' % str(err
))
599 block_size
= self
.best_block_size(after
- before
, data_block_len
)
602 percent_str
= self
.calc_percent(byte_counter
, data_len
)
603 eta_str
= self
.calc_eta(start
, time
.time(), data_len
, byte_counter
)
604 speed_str
= self
.calc_speed(start
, time
.time(), byte_counter
)
605 self
.report_progress(percent_str
, data_len_str
, speed_str
, eta_str
)
608 self
.slow_down(start
, byte_counter
)
611 if data_len
is not None and str(byte_counter
) != data_len
:
612 raise ContentTooShortError(byte_counter
, long(data_len
))
class InfoExtractor(object):
	"""Information Extractor class.

	Information extractors are the classes that, given a URL, extract
	information from the video (or videos) the URL refers to. This
	information includes the real video URL, the video title and simplified
	title, author and others. The information is stored in a dictionary
	which is then passed to the FileDownloader. The FileDownloader
	processes this information possibly downloading the video to the file
	system, among other possible outcomes. The dictionaries must include
	the following fields:

	id:		Video identifier.
	url:		Final video URL.
	uploader:	Nickname of the video uploader.
	title:		Literal title.
	stitle:		Simplified title.
	ext:		Video filename extension.
	format:		Video format.
	player_url:	SWF Player URL (may be None).

	The following fields are optional. Their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3. They are only used when their respective
	forced printing functions are called:

	thumbnail:	Full URL to a video thumbnail image.
	description:	One-line video description.

	Subclasses of this one should re-define the _real_initialize() and
	_real_extract() methods, as well as the suitable() static method.
	Probably, they should also be instantiated and added to the main
	downloader.
	"""

	# Lazily set to True once _real_initialize() has run.
	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		# NOTE(review): base default reconstructed (subclasses override
		# with a regexp match) — confirm against upstream history.
		return True

	def initialize(self):
		"""Initializes an instance (authentication, etc)."""
		if not self._ready:
			self._real_initialize()
			self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
686 class YoutubeIE(InfoExtractor
):
687 """Information extractor for youtube.com."""
689 _VALID_URL
= r
'^((?:http://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
690 _LANG_URL
= r
'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
691 _LOGIN_URL
= 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
692 _AGE_URL
= 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
693 _NETRC_MACHINE
= 'youtube'
694 # Listed in order of quality
695 _available_formats
= ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
696 _video_extensions
= {
702 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
709 return (re
.match(YoutubeIE
._VALID
_URL
, url
) is not None)
711 def report_lang(self
):
712 """Report attempt to set language."""
713 self
._downloader
.to_stdout(u
'[youtube] Setting language')
715 def report_login(self
):
716 """Report attempt to log in."""
717 self
._downloader
.to_stdout(u
'[youtube] Logging in')
719 def report_age_confirmation(self
):
720 """Report attempt to confirm age."""
721 self
._downloader
.to_stdout(u
'[youtube] Confirming age')
723 def report_video_webpage_download(self
, video_id
):
724 """Report attempt to download video webpage."""
725 self
._downloader
.to_stdout(u
'[youtube] %s: Downloading video webpage' % video_id
)
727 def report_video_info_webpage_download(self
, video_id
):
728 """Report attempt to download video info webpage."""
729 self
._downloader
.to_stdout(u
'[youtube] %s: Downloading video info webpage' % video_id
)
731 def report_information_extraction(self
, video_id
):
732 """Report attempt to extract video information."""
733 self
._downloader
.to_stdout(u
'[youtube] %s: Extracting video information' % video_id
)
735 def report_unavailable_format(self
, video_id
, format
):
736 """Report extracted video URL."""
737 self
._downloader
.to_stdout(u
'[youtube] %s: Format %s not available' % (video_id
, format
))
739 def report_rtmp_download(self
):
740 """Indicate the download will use the RTMP protocol."""
741 self
._downloader
.to_stdout(u
'[youtube] RTMP download detected')
743 def _real_initialize(self
):
744 if self
._downloader
is None:
749 downloader_params
= self
._downloader
.params
751 # Attempt to use provided username and password or .netrc data
752 if downloader_params
.get('username', None) is not None:
753 username
= downloader_params
['username']
754 password
= downloader_params
['password']
755 elif downloader_params
.get('usenetrc', False):
757 info
= netrc
.netrc().authenticators(self
._NETRC
_MACHINE
)
762 raise netrc
.NetrcParseError('No authenticators for %s' % self
._NETRC
_MACHINE
)
763 except (IOError, netrc
.NetrcParseError
), err
:
764 self
._downloader
.to_stderr(u
'WARNING: parsing .netrc: %s' % str(err
))
768 request
= urllib2
.Request(self
._LANG
_URL
, None, std_headers
)
771 urllib2
.urlopen(request
).read()
772 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
773 self
._downloader
.to_stderr(u
'WARNING: unable to set language: %s' % str(err
))
776 # No authentication to be performed
782 'current_form': 'loginForm',
784 'action_login': 'Log In',
785 'username': username
,
786 'password': password
,
788 request
= urllib2
.Request(self
._LOGIN
_URL
, urllib
.urlencode(login_form
), std_headers
)
791 login_results
= urllib2
.urlopen(request
).read()
792 if re
.search(r
'(?i)<form[^>]* name="loginForm"', login_results
) is not None:
793 self
._downloader
.to_stderr(u
'WARNING: unable to log in: bad username or password')
795 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
796 self
._downloader
.to_stderr(u
'WARNING: unable to log in: %s' % str(err
))
802 'action_confirm': 'Confirm',
804 request
= urllib2
.Request(self
._AGE
_URL
, urllib
.urlencode(age_form
), std_headers
)
806 self
.report_age_confirmation()
807 age_results
= urllib2
.urlopen(request
).read()
808 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
809 self
._downloader
.trouble(u
'ERROR: unable to confirm age: %s' % str(err
))
812 def _real_extract(self
, url
):
813 # Extract video id from URL
814 mobj
= re
.match(self
._VALID
_URL
, url
)
816 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
818 video_id
= mobj
.group(2)
821 self
.report_video_webpage_download(video_id
)
822 request
= urllib2
.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en' % video_id
, None, std_headers
)
824 video_webpage
= urllib2
.urlopen(request
).read()
825 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
826 self
._downloader
.trouble(u
'ERROR: unable to download video webpage: %s' % str(err
))
829 # Attempt to extract SWF player URL
830 mobj
= re
.search(r
'swfConfig.*"(http://.*?watch.*?-.*?\.swf)"', video_webpage
)
832 player_url
= mobj
.group(1)
837 self
.report_video_info_webpage_download(video_id
)
838 for el_type
in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
839 video_info_url
= ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
840 % (video_id
, el_type
))
841 request
= urllib2
.Request(video_info_url
, None, std_headers
)
843 video_info_webpage
= urllib2
.urlopen(request
).read()
844 video_info
= parse_qs(video_info_webpage
)
845 if 'token' in video_info
:
847 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
848 self
._downloader
.trouble(u
'ERROR: unable to download video info webpage: %s' % str(err
))
850 if 'token' not in video_info
:
851 if 'reason' in video_info
:
852 self
._downloader
.trouble(u
'ERROR: YouTube said: %s' % video_info
['reason'][0].decode('utf-8'))
854 self
._downloader
.trouble(u
'ERROR: "token" parameter not in video info for unknown reason')
857 # Start extracting information
858 self
.report_information_extraction(video_id
)
861 if 'author' not in video_info
:
862 self
._downloader
.trouble(u
'ERROR: unable to extract uploader nickname')
864 video_uploader
= urllib
.unquote_plus(video_info
['author'][0])
867 if 'title' not in video_info
:
868 self
._downloader
.trouble(u
'ERROR: unable to extract video title')
870 video_title
= urllib
.unquote_plus(video_info
['title'][0])
871 video_title
= video_title
.decode('utf-8')
872 video_title
= sanitize_title(video_title
)
875 simple_title
= re
.sub(ur
'(?u)([^%s]+)' % simple_title_chars
, ur
'_', video_title
)
876 simple_title
= simple_title
.strip(ur
'_')
879 if 'thumbnail_url' not in video_info
:
880 self
._downloader
.trouble(u
'WARNING: unable to extract video thumbnail')
882 else: # don't panic if we can't find it
883 video_thumbnail
= urllib
.unquote_plus(video_info
['thumbnail_url'][0])
886 video_description
= 'No description available.'
887 if self
._downloader
.params
.get('forcedescription', False):
888 mobj
= re
.search(r
'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage
)
890 video_description
= mobj
.group(1)
893 video_token
= urllib
.unquote_plus(video_info
['token'][0])
895 # Decide which formats to download
896 requested_format
= self
._downloader
.params
.get('format', None)
897 get_video_template
= 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=&ps=&asv=&fmt=%%s' % (video_id
, video_token
)
899 if 'fmt_url_map' in video_info
:
900 url_map
= dict(tuple(pair
.split('|')) for pair
in video_info
['fmt_url_map'][0].split(','))
901 format_limit
= self
._downloader
.params
.get('format_limit', None)
902 if format_limit
is not None and format_limit
in self
._available
_formats
:
903 format_list
= self
._available
_formats
[self
._available
_formats
.index(format_limit
):]
905 format_list
= self
._available
_formats
906 existing_formats
= [x
for x
in format_list
if x
in url_map
]
907 if len(existing_formats
) == 0:
908 self
._downloader
.trouble(u
'ERROR: no known formats available for video')
910 if requested_format
is None:
911 video_url_list
= [(existing_formats
[0], get_video_template
% existing_formats
[0])] # Best quality
912 elif requested_format
== '-1':
913 video_url_list
= [(f
, get_video_template
% f
) for f
in existing_formats
] # All formats
915 video_url_list
= [(requested_format
, get_video_template
% requested_format
)] # Specific format
917 elif 'conn' in video_info
and video_info
['conn'][0].startswith('rtmp'):
918 self
.report_rtmp_download()
919 video_url_list
= [(None, video_info
['conn'][0])]
922 self
._downloader
.trouble(u
'ERROR: no fmt_url_map or conn information found in video info')
925 for format_param
, video_real_url
in video_url_list
:
926 # At this point we have a new video
927 self
._downloader
.increment_downloads()
930 video_extension
= self
._video
_extensions
.get(format_param
, 'flv')
932 # Find the video URL in fmt_url_map or conn paramters
934 # Process video information
935 self
._downloader
.process_info({
936 'id': video_id
.decode('utf-8'),
937 'url': video_real_url
.decode('utf-8'),
938 'uploader': video_uploader
.decode('utf-8'),
939 'title': video_title
,
940 'stitle': simple_title
,
941 'ext': video_extension
.decode('utf-8'),
942 'format': (format_param
is None and u
'NA' or format_param
.decode('utf-8')),
943 'thumbnail': video_thumbnail
.decode('utf-8'),
944 'description': video_description
.decode('utf-8'),
945 'player_url': player_url
,
947 except UnavailableVideoError
, err
:
948 self
._downloader
.trouble(u
'ERROR: unable to download video (format may not be available)')
951 class MetacafeIE(InfoExtractor
):
952 """Information Extractor for metacafe.com."""
954 _VALID_URL
= r
'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
955 _DISCLAIMER
= 'http://www.metacafe.com/family_filter/'
956 _FILTER_POST
= 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
959 def __init__(self
, youtube_ie
, downloader
=None):
960 InfoExtractor
.__init
__(self
, downloader
)
961 self
._youtube
_ie
= youtube_ie
965 return (re
.match(MetacafeIE
._VALID
_URL
, url
) is not None)
967 def report_disclaimer(self
):
968 """Report disclaimer retrieval."""
969 self
._downloader
.to_stdout(u
'[metacafe] Retrieving disclaimer')
971 def report_age_confirmation(self
):
972 """Report attempt to confirm age."""
973 self
._downloader
.to_stdout(u
'[metacafe] Confirming age')
975 def report_download_webpage(self
, video_id
):
976 """Report webpage download."""
977 self
._downloader
.to_stdout(u
'[metacafe] %s: Downloading webpage' % video_id
)
979 def report_extraction(self
, video_id
):
980 """Report information extraction."""
981 self
._downloader
.to_stdout(u
'[metacafe] %s: Extracting information' % video_id
)
983 def _real_initialize(self
):
984 # Retrieve disclaimer
985 request
= urllib2
.Request(self
._DISCLAIMER
, None, std_headers
)
987 self
.report_disclaimer()
988 disclaimer
= urllib2
.urlopen(request
).read()
989 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
990 self
._downloader
.trouble(u
'ERROR: unable to retrieve disclaimer: %s' % str(err
))
996 'submit': "Continue - I'm over 18",
998 request
= urllib2
.Request(self
._FILTER
_POST
, urllib
.urlencode(disclaimer_form
), std_headers
)
1000 self
.report_age_confirmation()
1001 disclaimer
= urllib2
.urlopen(request
).read()
1002 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1003 self
._downloader
.trouble(u
'ERROR: unable to confirm age: %s' % str(err
))
1006 def _real_extract(self
, url
):
1007 # Extract id and simplified title from URL
1008 mobj
= re
.match(self
._VALID
_URL
, url
)
1010 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
1013 video_id
= mobj
.group(1)
1015 # Check if video comes from YouTube
1016 mobj2
= re
.match(r
'^yt-(.*)$', video_id
)
1017 if mobj2
is not None:
1018 self
._youtube
_ie
.extract('http://www.youtube.com/watch?v=%s' % mobj2
.group(1))
1021 # At this point we have a new video
1022 self
._downloader
.increment_downloads()
1024 simple_title
= mobj
.group(2).decode('utf-8')
1026 # Retrieve video webpage to extract further information
1027 request
= urllib2
.Request('http://www.metacafe.com/watch/%s/' % video_id
)
1029 self
.report_download_webpage(video_id
)
1030 webpage
= urllib2
.urlopen(request
).read()
1031 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1032 self
._downloader
.trouble(u
'ERROR: unable retrieve video webpage: %s' % str(err
))
1035 # Extract URL, uploader and title from webpage
1036 self
.report_extraction(video_id
)
1037 mobj
= re
.search(r
'(?m)&mediaURL=([^&]+)', webpage
)
1038 if mobj
is not None:
1039 mediaURL
= urllib
.unquote(mobj
.group(1))
1040 video_extension
= mediaURL
[-3:]
1042 # Extract gdaKey if available
1043 mobj
= re
.search(r
'(?m)&gdaKey=(.*?)&', webpage
)
1045 video_url
= mediaURL
1047 gdaKey
= mobj
.group(1)
1048 video_url
= '%s?__gda__=%s' % (mediaURL
, gdaKey
)
1050 mobj
= re
.search(r
' name="flashvars" value="(.*?)"', webpage
)
1052 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
1054 vardict
= parse_qs(mobj
.group(1))
1055 if 'mediaData' not in vardict
:
1056 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
1058 mobj
= re
.search(r
'"mediaURL":"(http.*?)","key":"(.*?)"', vardict
['mediaData'][0])
1060 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
1062 mediaURL
= mobj
.group(1).replace('\\/', '/')
1063 video_extension
= mediaURL
[-3:]
1064 video_url
= '%s?__gda__=%s' % (mediaURL
, mobj
.group(2))
1066 mobj
= re
.search(r
'(?im)<title>(.*) - Video</title>', webpage
)
1068 self
._downloader
.trouble(u
'ERROR: unable to extract title')
1070 video_title
= mobj
.group(1).decode('utf-8')
1071 video_title
= sanitize_title(video_title
)
1073 mobj
= re
.search(r
'(?ms)By:\s*<a .*?>(.+?)<', webpage
)
1075 self
._downloader
.trouble(u
'ERROR: unable to extract uploader nickname')
1077 video_uploader
= mobj
.group(1)
1080 # Process video information
1081 self
._downloader
.process_info({
1082 'id': video_id
.decode('utf-8'),
1083 'url': video_url
.decode('utf-8'),
1084 'uploader': video_uploader
.decode('utf-8'),
1085 'title': video_title
,
1086 'stitle': simple_title
,
1087 'ext': video_extension
.decode('utf-8'),
1091 except UnavailableVideoError
:
1092 self
._downloader
.trouble(u
'ERROR: unable to download video')
1095 class DailymotionIE(InfoExtractor
):
1096 """Information Extractor for Dailymotion"""
1098 _VALID_URL
= r
'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1100 def __init__(self
, downloader
=None):
1101 InfoExtractor
.__init
__(self
, downloader
)
1105 return (re
.match(DailymotionIE
._VALID
_URL
, url
) is not None)
1107 def report_download_webpage(self
, video_id
):
1108 """Report webpage download."""
1109 self
._downloader
.to_stdout(u
'[dailymotion] %s: Downloading webpage' % video_id
)
1111 def report_extraction(self
, video_id
):
1112 """Report information extraction."""
1113 self
._downloader
.to_stdout(u
'[dailymotion] %s: Extracting information' % video_id
)
1115 def _real_initialize(self
):
1118 def _real_extract(self
, url
):
1119 # Extract id and simplified title from URL
1120 mobj
= re
.match(self
._VALID
_URL
, url
)
1122 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
1125 # At this point we have a new video
1126 self
._downloader
.increment_downloads()
1127 video_id
= mobj
.group(1)
1129 simple_title
= mobj
.group(2).decode('utf-8')
1130 video_extension
= 'flv'
1132 # Retrieve video webpage to extract further information
1133 request
= urllib2
.Request(url
)
1135 self
.report_download_webpage(video_id
)
1136 webpage
= urllib2
.urlopen(request
).read()
1137 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1138 self
._downloader
.trouble(u
'ERROR: unable retrieve video webpage: %s' % str(err
))
1141 # Extract URL, uploader and title from webpage
1142 self
.report_extraction(video_id
)
1143 mobj
= re
.search(r
'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage
)
1145 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
1147 mediaURL
= urllib
.unquote(mobj
.group(1))
1149 # if needed add http://www.dailymotion.com/ if relative URL
1151 video_url
= mediaURL
1153 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1154 mobj
= re
.search(r
'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage
)
1156 self
._downloader
.trouble(u
'ERROR: unable to extract title')
1158 video_title
= mobj
.group(1).decode('utf-8')
1159 video_title
= sanitize_title(video_title
)
1161 mobj
= re
.search(r
'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a>', webpage
)
1163 self
._downloader
.trouble(u
'ERROR: unable to extract uploader nickname')
1165 video_uploader
= mobj
.group(1)
1168 # Process video information
1169 self
._downloader
.process_info({
1170 'id': video_id
.decode('utf-8'),
1171 'url': video_url
.decode('utf-8'),
1172 'uploader': video_uploader
.decode('utf-8'),
1173 'title': video_title
,
1174 'stitle': simple_title
,
1175 'ext': video_extension
.decode('utf-8'),
1179 except UnavailableVideoError
:
1180 self
._downloader
.trouble(u
'ERROR: unable to download video')
1182 class GoogleIE(InfoExtractor
):
1183 """Information extractor for video.google.com."""
1185 _VALID_URL
= r
'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1187 def __init__(self
, downloader
=None):
1188 InfoExtractor
.__init
__(self
, downloader
)
1192 return (re
.match(GoogleIE
._VALID
_URL
, url
) is not None)
1194 def report_download_webpage(self
, video_id
):
1195 """Report webpage download."""
1196 self
._downloader
.to_stdout(u
'[video.google] %s: Downloading webpage' % video_id
)
1198 def report_extraction(self
, video_id
):
1199 """Report information extraction."""
1200 self
._downloader
.to_stdout(u
'[video.google] %s: Extracting information' % video_id
)
1202 def _real_initialize(self
):
1205 def _real_extract(self
, url
):
1206 # Extract id from URL
1207 mobj
= re
.match(self
._VALID
_URL
, url
)
1209 self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
)
1212 # At this point we have a new video
1213 self
._downloader
.increment_downloads()
1214 video_id
= mobj
.group(1)
1216 video_extension
= 'mp4'
1218 # Retrieve video webpage to extract further information
1219 request
= urllib2
.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id
)
1221 self
.report_download_webpage(video_id
)
1222 webpage
= urllib2
.urlopen(request
).read()
1223 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1224 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1227 # Extract URL, uploader, and title from webpage
1228 self
.report_extraction(video_id
)
1229 mobj
= re
.search(r
"download_url:'([^']+)'", webpage
)
1231 video_extension
= 'flv'
1232 mobj
= re
.search(r
"(?i)videoUrl\\x3d(.+?)\\x26", webpage
)
1234 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
1236 mediaURL
= urllib
.unquote(mobj
.group(1))
1237 mediaURL
= mediaURL
.replace('\\x3d', '\x3d')
1238 mediaURL
= mediaURL
.replace('\\x26', '\x26')
1240 video_url
= mediaURL
1242 mobj
= re
.search(r
'<title>(.*)</title>', webpage
)
1244 self
._downloader
.trouble(u
'ERROR: unable to extract title')
1246 video_title
= mobj
.group(1).decode('utf-8')
1247 video_title
= sanitize_title(video_title
)
1248 simple_title
= re
.sub(ur
'(?u)([^%s]+)' % simple_title_chars
, ur
'_', video_title
)
1250 # Extract video description
1251 mobj
= re
.search(r
'<span id=short-desc-content>([^<]*)</span>', webpage
)
1253 self
._downloader
.trouble(u
'ERROR: unable to extract video description')
1255 video_description
= mobj
.group(1).decode('utf-8')
1256 if not video_description
:
1257 video_description
= 'No description available.'
1259 # Extract video thumbnail
1260 if self
._downloader
.params
.get('forcethumbnail', False):
1261 request
= urllib2
.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id
)))
1263 webpage
= urllib2
.urlopen(request
).read()
1264 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1265 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1267 mobj
= re
.search(r
'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage
)
1269 self
._downloader
.trouble(u
'ERROR: unable to extract video thumbnail')
1271 video_thumbnail
= mobj
.group(1)
1272 else: # we need something to pass to process_info
1273 video_thumbnail
= ''
1277 # Process video information
1278 self
._downloader
.process_info({
1279 'id': video_id
.decode('utf-8'),
1280 'url': video_url
.decode('utf-8'),
1282 'title': video_title
,
1283 'stitle': simple_title
,
1284 'ext': video_extension
.decode('utf-8'),
1288 except UnavailableVideoError
:
1289 self
._downloader
.trouble(u
'ERROR: unable to download video')
1292 class PhotobucketIE(InfoExtractor
):
1293 """Information extractor for photobucket.com."""
1295 _VALID_URL
= r
'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1297 def __init__(self
, downloader
=None):
1298 InfoExtractor
.__init
__(self
, downloader
)
1302 return (re
.match(PhotobucketIE
._VALID
_URL
, url
) is not None)
1304 def report_download_webpage(self
, video_id
):
1305 """Report webpage download."""
1306 self
._downloader
.to_stdout(u
'[photobucket] %s: Downloading webpage' % video_id
)
1308 def report_extraction(self
, video_id
):
1309 """Report information extraction."""
1310 self
._downloader
.to_stdout(u
'[photobucket] %s: Extracting information' % video_id
)
1312 def _real_initialize(self
):
1315 def _real_extract(self
, url
):
1316 # Extract id from URL
1317 mobj
= re
.match(self
._VALID
_URL
, url
)
1319 self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
)
1322 # At this point we have a new video
1323 self
._downloader
.increment_downloads()
1324 video_id
= mobj
.group(1)
1326 video_extension
= 'flv'
1328 # Retrieve video webpage to extract further information
1329 request
= urllib2
.Request(url
)
1331 self
.report_download_webpage(video_id
)
1332 webpage
= urllib2
.urlopen(request
).read()
1333 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1334 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1337 # Extract URL, uploader, and title from webpage
1338 self
.report_extraction(video_id
)
1339 mobj
= re
.search(r
'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage
)
1341 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
1343 mediaURL
= urllib
.unquote(mobj
.group(1))
1345 video_url
= mediaURL
1347 mobj
= re
.search(r
'<title>(.*) video by (.*) - Photobucket</title>', webpage
)
1349 self
._downloader
.trouble(u
'ERROR: unable to extract title')
1351 video_title
= mobj
.group(1).decode('utf-8')
1352 video_title
= sanitize_title(video_title
)
1353 simple_title
= re
.sub(ur
'(?u)([^%s]+)' % simple_title_chars
, ur
'_', video_title
)
1355 video_uploader
= mobj
.group(2).decode('utf-8')
1358 # Process video information
1359 self
._downloader
.process_info({
1360 'id': video_id
.decode('utf-8'),
1361 'url': video_url
.decode('utf-8'),
1362 'uploader': video_uploader
,
1363 'title': video_title
,
1364 'stitle': simple_title
,
1365 'ext': video_extension
.decode('utf-8'),
1369 except UnavailableVideoError
:
1370 self
._downloader
.trouble(u
'ERROR: unable to download video')
1373 class YahooIE(InfoExtractor
):
1374 """Information extractor for video.yahoo.com."""
1376 # _VALID_URL matches all Yahoo! Video URLs
1377 # _VPAGE_URL matches only the extractable '/watch/' URLs
1378 _VALID_URL
= r
'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1379 _VPAGE_URL
= r
'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1381 def __init__(self
, downloader
=None):
1382 InfoExtractor
.__init
__(self
, downloader
)
1386 return (re
.match(YahooIE
._VALID
_URL
, url
) is not None)
1388 def report_download_webpage(self
, video_id
):
1389 """Report webpage download."""
1390 self
._downloader
.to_stdout(u
'[video.yahoo] %s: Downloading webpage' % video_id
)
1392 def report_extraction(self
, video_id
):
1393 """Report information extraction."""
1394 self
._downloader
.to_stdout(u
'[video.yahoo] %s: Extracting information' % video_id
)
1396 def _real_initialize(self
):
1399 def _real_extract(self
, url
, new_video
=True):
1400 # Extract ID from URL
1401 mobj
= re
.match(self
._VALID
_URL
, url
)
1403 self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
)
1406 # At this point we have a new video
1407 self
._downloader
.increment_downloads()
1408 video_id
= mobj
.group(2)
1409 video_extension
= 'flv'
1411 # Rewrite valid but non-extractable URLs as
1412 # extractable English language /watch/ URLs
1413 if re
.match(self
._VPAGE
_URL
, url
) is None:
1414 request
= urllib2
.Request(url
)
1416 webpage
= urllib2
.urlopen(request
).read()
1417 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1418 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1421 mobj
= re
.search(r
'\("id", "([0-9]+)"\);', webpage
)
1423 self
._downloader
.trouble(u
'ERROR: Unable to extract id field')
1425 yahoo_id
= mobj
.group(1)
1427 mobj
= re
.search(r
'\("vid", "([0-9]+)"\);', webpage
)
1429 self
._downloader
.trouble(u
'ERROR: Unable to extract vid field')
1431 yahoo_vid
= mobj
.group(1)
1433 url
= 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid
, yahoo_id
)
1434 return self
._real
_extract
(url
, new_video
=False)
1436 # Retrieve video webpage to extract further information
1437 request
= urllib2
.Request(url
)
1439 self
.report_download_webpage(video_id
)
1440 webpage
= urllib2
.urlopen(request
).read()
1441 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1442 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1445 # Extract uploader and title from webpage
1446 self
.report_extraction(video_id
)
1447 mobj
= re
.search(r
'<meta name="title" content="(.*)" />', webpage
)
1449 self
._downloader
.trouble(u
'ERROR: unable to extract video title')
1451 video_title
= mobj
.group(1).decode('utf-8')
1452 simple_title
= re
.sub(ur
'(?u)([^%s]+)' % simple_title_chars
, ur
'_', video_title
)
1454 mobj
= re
.search(r
'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage
)
1456 self
._downloader
.trouble(u
'ERROR: unable to extract video uploader')
1458 video_uploader
= mobj
.group(1).decode('utf-8')
1460 # Extract video thumbnail
1461 mobj
= re
.search(r
'<link rel="image_src" href="(.*)" />', webpage
)
1463 self
._downloader
.trouble(u
'ERROR: unable to extract video thumbnail')
1465 video_thumbnail
= mobj
.group(1).decode('utf-8')
1467 # Extract video description
1468 mobj
= re
.search(r
'<meta name="description" content="(.*)" />', webpage
)
1470 self
._downloader
.trouble(u
'ERROR: unable to extract video description')
1472 video_description
= mobj
.group(1).decode('utf-8')
1473 if not video_description
: video_description
= 'No description available.'
1475 # Extract video height and width
1476 mobj
= re
.search(r
'<meta name="video_height" content="([0-9]+)" />', webpage
)
1478 self
._downloader
.trouble(u
'ERROR: unable to extract video height')
1480 yv_video_height
= mobj
.group(1)
1482 mobj
= re
.search(r
'<meta name="video_width" content="([0-9]+)" />', webpage
)
1484 self
._downloader
.trouble(u
'ERROR: unable to extract video width')
1486 yv_video_width
= mobj
.group(1)
1488 # Retrieve video playlist to extract media URL
1489 # I'm not completely sure what all these options are, but we
1490 # seem to need most of them, otherwise the server sends a 401.
1491 yv_lg
= 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1492 yv_bitrate
= '700' # according to Wikipedia this is hard-coded
1493 request
= urllib2
.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id
+
1494 '&tech=flash&mode=playlist&lg=' + yv_lg
+ '&bitrate=' + yv_bitrate
+ '&vidH=' + yv_video_height
+
1495 '&vidW=' + yv_video_width
+ '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1497 self
.report_download_webpage(video_id
)
1498 webpage
= urllib2
.urlopen(request
).read()
1499 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1500 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1503 # Extract media URL from playlist XML
1504 mobj
= re
.search(r
'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage
)
1506 self
._downloader
.trouble(u
'ERROR: Unable to extract media URL')
1508 video_url
= urllib
.unquote(mobj
.group(1) + mobj
.group(2)).decode('utf-8')
1509 video_url
= re
.sub(r
'(?u)&(.+?);', htmlentity_transform
, video_url
)
1512 # Process video information
1513 self
._downloader
.process_info({
1514 'id': video_id
.decode('utf-8'),
1516 'uploader': video_uploader
,
1517 'title': video_title
,
1518 'stitle': simple_title
,
1519 'ext': video_extension
.decode('utf-8'),
1520 'thumbnail': video_thumbnail
.decode('utf-8'),
1521 'description': video_description
,
1522 'thumbnail': video_thumbnail
,
1523 'description': video_description
,
1526 except UnavailableVideoError
:
1527 self
._downloader
.trouble(u
'ERROR: unable to download video')
1530 class GenericIE(InfoExtractor
):
1531 """Generic last-resort information extractor."""
1533 def __init__(self
, downloader
=None):
1534 InfoExtractor
.__init
__(self
, downloader
)
1540 def report_download_webpage(self
, video_id
):
1541 """Report webpage download."""
1542 self
._downloader
.to_stdout(u
'WARNING: Falling back on generic information extractor.')
1543 self
._downloader
.to_stdout(u
'[generic] %s: Downloading webpage' % video_id
)
1545 def report_extraction(self
, video_id
):
1546 """Report information extraction."""
1547 self
._downloader
.to_stdout(u
'[generic] %s: Extracting information' % video_id
)
1549 def _real_initialize(self
):
1552 def _real_extract(self
, url
):
1553 # At this point we have a new video
1554 self
._downloader
.increment_downloads()
1556 video_id
= url
.split('/')[-1]
1557 request
= urllib2
.Request(url
)
1559 self
.report_download_webpage(video_id
)
1560 webpage
= urllib2
.urlopen(request
).read()
1561 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1562 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1564 except ValueError, err
:
1565 # since this is the last-resort InfoExtractor, if
1566 # this error is thrown, it'll be thrown here
1567 self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
)
1570 # Start with something easy: JW Player in SWFObject
1571 mobj
= re
.search(r
'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1573 # Broaden the search a little bit
1574 mobj = re.search(r'[^A
-Za
-z0
-9]?
(?
:file|source
)=(http
[^
\'"&]*)', webpage)
1576 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1579 # It's possible that one of the regexes
1580 # matched, but returned an empty group:
1581 if mobj.group(1) is None:
1582 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1585 video_url = urllib.unquote(mobj.group(1))
1586 video_id = os.path.basename(video_url)
1588 # here's a fun little line of code for you:
1589 video_extension = os.path.splitext(video_id)[1][1:]
1590 video_id = os.path.splitext(video_id)[0]
1592 # it's tempting to parse this further, but you would
1593 # have to take into account all the variations like
1594 # Video Title - Site Name
1595 # Site Name | Video Title
1596 # Video Title - Tagline | Site Name
1597 # and so on and so forth; it's just not practical
1598 mobj = re.search(r'<title>(.*)</title>', webpage)
1600 self._downloader.trouble(u'ERROR: unable to extract title')
1602 video_title = mobj.group(1).decode('utf-8')
1603 video_title = sanitize_title(video_title)
1604 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1606 # video uploader is domain name
1607 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1609 self._downloader.trouble(u'ERROR: unable to extract title')
1611 video_uploader = mobj.group(1).decode('utf-8')
1614 # Process video information
1615 self._downloader.process_info({
1616 'id': video_id.decode('utf-8'),
1617 'url': video_url.decode('utf-8'),
1618 'uploader': video_uploader,
1619 'title': video_title,
1620 'stitle': simple_title,
1621 'ext': video_extension.decode('utf-8'),
1625 except UnavailableVideoError, err:
1626 self._downloader.trouble(u'ERROR: unable to download video')
1629 class YoutubeSearchIE(InfoExtractor):
1630 """Information Extractor for YouTube search queries."""
1631 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1632 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1633 _VIDEO_INDICATOR = r'href="/watch
\?v
=.+?
"'
1634 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1636 _max_youtube_results = 1000
1638 def __init__(self, youtube_ie, downloader=None):
1639 InfoExtractor.__init__(self, downloader)
1640 self._youtube_ie = youtube_ie
1644 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1646 def report_download_page(self, query, pagenum):
1647 """Report attempt to download playlist page with given number."""
1648 query = query.decode(preferredencoding())
1649 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1651 def _real_initialize(self):
1652 self._youtube_ie.initialize()
1654 def _real_extract(self, query):
1655 mobj = re.match(self._VALID_QUERY, query)
1657 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1660 prefix, query = query.split(':')
1662 query = query.encode('utf-8')
1664 self._download_n_results(query, 1)
1666 elif prefix == 'all':
1667 self._download_n_results(query, self._max_youtube_results)
1673 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1675 elif n > self._max_youtube_results:
1676 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1677 n = self._max_youtube_results
1678 self._download_n_results(query, n)
1680 except ValueError: # parsing prefix as integer fails
1681 self._download_n_results(query, 1)
1684 def _download_n_results(self, query, n):
1685 """Downloads a specified number of results for a query"""
1688 already_seen = set()
1692 self.report_download_page(query, pagenum)
1693 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1694 request = urllib2.Request(result_url, None, std_headers)
1696 page = urllib2.urlopen(request).read()
1697 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1698 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1701 # Extract video identifiers
1702 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1703 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1704 if video_id not in already_seen:
1705 video_ids.append(video_id)
1706 already_seen.add(video_id)
1707 if len(video_ids) == n:
1708 # Specified n videos reached
1709 for id in video_ids:
1710 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1713 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1714 for id in video_ids:
1715 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1718 pagenum = pagenum + 1
1720 class GoogleSearchIE(InfoExtractor):
1721 """Information Extractor for Google Video search queries."""
1722 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1723 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1724 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1725 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1727 _max_google_results = 1000
1729 def __init__(self, google_ie, downloader=None):
1730 InfoExtractor.__init__(self, downloader)
1731 self._google_ie = google_ie
1735 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1737 def report_download_page(self, query, pagenum):
1738 """Report attempt to download playlist page with given number."""
1739 query = query.decode(preferredencoding())
1740 self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1742 def _real_initialize(self):
1743 self._google_ie.initialize()
1745 def _real_extract(self, query):
1746 mobj = re.match(self._VALID_QUERY, query)
1748 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1751 prefix, query = query.split(':')
1753 query = query.encode('utf-8')
1755 self._download_n_results(query, 1)
1757 elif prefix == 'all':
1758 self._download_n_results(query, self._max_google_results)
1764 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1766 elif n > self._max_google_results:
1767 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1768 n = self._max_google_results
1769 self._download_n_results(query, n)
1771 except ValueError: # parsing prefix as integer fails
1772 self._download_n_results(query, 1)
1775 def _download_n_results(self, query, n):
1776 """Downloads a specified number of results for a query"""
1779 already_seen = set()
1783 self.report_download_page(query, pagenum)
1784 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1785 request = urllib2.Request(result_url, None, std_headers)
1787 page = urllib2.urlopen(request).read()
1788 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1789 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1792 # Extract video identifiers
1793 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1794 video_id = mobj.group(1)
1795 if video_id not in already_seen:
1796 video_ids.append(video_id)
1797 already_seen.add(video_id)
1798 if len(video_ids) == n:
1799 # Specified n videos reached
1800 for id in video_ids:
1801 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1804 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1805 for id in video_ids:
1806 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1809 pagenum = pagenum + 1
1811 class YahooSearchIE(InfoExtractor):
1812 """Information Extractor for Yahoo! Video search queries."""
1813 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1814 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1815 _VIDEO_INDICATOR = r'href="http
://video\
.yahoo\
.com
/watch
/([0-9]+/[0-9]+)"'
1816 _MORE_PAGES_INDICATOR = r'\s*Next'
1818 _max_yahoo_results = 1000
1820 def __init__(self, yahoo_ie, downloader=None):
1821 InfoExtractor.__init__(self, downloader)
1822 self._yahoo_ie = yahoo_ie
1826 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1828 def report_download_page(self, query, pagenum):
1829 """Report attempt to download playlist page with given number."""
1830 query = query.decode(preferredencoding())
1831 self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1833 def _real_initialize(self):
1834 self._yahoo_ie.initialize()
1836 def _real_extract(self, query):
1837 mobj = re.match(self._VALID_QUERY, query)
1839 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1842 prefix, query = query.split(':')
1844 query = query.encode('utf-8')
1846 self._download_n_results(query, 1)
1848 elif prefix == 'all':
1849 self._download_n_results(query, self._max_yahoo_results)
1855 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1857 elif n > self._max_yahoo_results:
1858 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1859 n = self._max_yahoo_results
1860 self._download_n_results(query, n)
1862 except ValueError: # parsing prefix as integer fails
1863 self._download_n_results(query, 1)
1866 def _download_n_results(self, query, n):
1867 """Downloads a specified number of results for a query"""
1870 already_seen = set()
1874 self.report_download_page(query, pagenum)
1875 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1876 request = urllib2.Request(result_url, None, std_headers)
1878 page = urllib2.urlopen(request).read()
1879 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1880 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1883 # Extract video identifiers
1884 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1885 video_id = mobj.group(1)
1886 if video_id not in already_seen:
1887 video_ids.append(video_id)
1888 already_seen.add(video_id)
1889 if len(video_ids) == n:
1890 # Specified n videos reached
1891 for id in video_ids:
1892 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1895 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1896 for id in video_ids:
1897 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1900 pagenum = pagenum + 1
1902 class YoutubePlaylistIE(InfoExtractor):
1903 """Information Extractor for YouTube playlists."""
1905 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1906 _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1907 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1908 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1911 def __init__(self, youtube_ie, downloader=None):
1912 InfoExtractor.__init__(self, downloader)
1913 self._youtube_ie = youtube_ie
1917 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1919 def report_download_page(self, playlist_id, pagenum):
1920 """Report attempt to download playlist page with given number."""
1921 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1923 def _real_initialize(self):
1924 self._youtube_ie.initialize()
1926 def _real_extract(self, url):
1927 # Extract playlist id
1928 mobj = re.match(self._VALID_URL, url)
1930 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1933 # Download playlist pages
1934 playlist_id = mobj.group(1)
1939 self.report_download_page(playlist_id, pagenum)
1940 request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1942 page = urllib2.urlopen(request).read()
1943 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1944 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1947 # Extract video identifiers
1949 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1950 if mobj.group(1) not in ids_in_page:
1951 ids_in_page.append(mobj.group(1))
1952 video_ids.extend(ids_in_page)
1954 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1956 pagenum = pagenum + 1
1958 playliststart = self._downloader.params.get('playliststart', 1)
1959 playliststart -= 1 #our arrays are zero-based but the playlist is 1-based
1960 if playliststart > 0:
1961 video_ids = video_ids[playliststart:]
1963 for id in video_ids:
1964 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1967 class YoutubeUserIE(InfoExtractor):
1968 """Information Extractor for YouTube users."""
1970 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1971 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1972 _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1975 def __init__(self, youtube_ie, downloader=None):
1976 InfoExtractor.__init__(self, downloader)
1977 self._youtube_ie = youtube_ie
1981 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1983 def report_download_page(self, username):
1984 """Report attempt to download user page."""
1985 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1987 def _real_initialize(self):
1988 self._youtube_ie.initialize()
1990 def _real_extract(self, url):
1992 mobj = re.match(self._VALID_URL, url)
1994 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1997 # Download user page
1998 username = mobj.group(1)
2002 self.report_download_page(username)
2003 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
2005 page = urllib2.urlopen(request).read()
2006 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2007 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2010 # Extract video identifiers
2013 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2014 if mobj.group(1) not in ids_in_page:
2015 ids_in_page.append(mobj.group(1))
2016 video_ids.extend(ids_in_page)
2018 playliststart = self._downloader.params.get('playliststart', 1)
2019 playliststart = playliststart-1 #our arrays are zero-based but the playlist is 1-based
2020 if playliststart > 0:
2021 video_ids = video_ids[playliststart:]
2023 for id in video_ids:
2024 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class PostProcessor(object):
	"""Base class for post-processing steps.

	Instances are registered on a downloader via its add_post_processor()
	method. After every successful download the downloader walks its chain
	of PostProcessors, invoking run() on each: the first call receives an
	initial argument, and each subsequent call receives whatever the
	previous PostProcessor returned.

	Traversal of the chain stops as soon as a run() call returns None, or
	once the last PostProcessor has been invoked.

	Like InfoExtractor objects, PostProcessors take part in a "mutual
	registration" handshake with their downloader.
	"""

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		The "information" argument is a dictionary shaped like the ones
		produced by InfoExtractors, with one extra key, "filepath",
		naming the downloaded file.

		Returning None halts the post-processing chain; returning an
		information dictionary (possibly the received one, possibly with
		some fields changed) forwards it to the next PostProcessor.

		Implementations may also raise PostProcessingError, which the
		downloader handles explicitly.
		"""
		return information # by default, do nothing
2073 ### MAIN PROGRAM ###
2074 if __name__ == '__main__':
2076 # Modules needed only when running the main program
2080 # Function to update the program file with the latest version from bitbucket.org
def update_self(downloader, filename):
	"""Update the program file with the latest version from bitbucket.org.

	Exits the process with an error if the file is not writable.
	Note: downloader is only used for its output helpers.
	"""
	if not os.access (filename, os.W_OK):
		sys.exit('ERROR: no write permissions on %s' % filename)

	downloader.to_stdout('Updating to latest stable version...')
	latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
	latest_version = urllib.urlopen(latest_url).read().strip()
	prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
	newcontent = urllib.urlopen(prog_url).read()
	stream = open(filename, 'w')
	try:
		stream.write(newcontent)
	finally:
		# Always release the handle so the updated content is flushed to
		# disk even if the write raises.
		stream.close()
	downloader.to_stdout('Updated to version %s' % latest_version)
2096 # General configuration
2097 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
2098 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
2099 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
2101 # Parse command line
2102 parser = optparse.OptionParser(
2103 usage='Usage: %prog [options] url...',
2104 version='2010.10.03',
2105 conflict_handler='resolve',
2108 parser.add_option('-h', '--help',
2109 action='help', help='print this help text and exit')
2110 parser.add_option('-v', '--version',
2111 action='version', help='print program version and exit')
2112 parser.add_option('-U', '--update',
2113 action='store_true', dest='update_self', help='update this program to latest stable version')
2114 parser.add_option('-i', '--ignore-errors',
2115 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
2116 parser.add_option('-r', '--rate-limit',
2117 dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
2118 parser.add_option('-R', '--retries',
2119 dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
2120 parser.add_option('--playlist-start',
2121 dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
2123 authentication = optparse.OptionGroup(parser, 'Authentication Options')
2124 authentication.add_option('-u', '--username',
2125 dest='username', metavar='USERNAME', help='account username')
2126 authentication.add_option('-p', '--password',
2127 dest='password', metavar='PASSWORD', help='account password')
2128 authentication.add_option('-n', '--netrc',
2129 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
2130 parser.add_option_group(authentication)
2132 video_format = optparse.OptionGroup(parser, 'Video Format Options')
2133 video_format.add_option('-f', '--format',
2134 action='store', dest='format', metavar='FORMAT', help='video format code')
2135 video_format.add_option('-m', '--mobile-version',
2136 action='store_const', dest='format', help='alias for -f 17', const='17')
2137 video_format.add_option('--all-formats',
2138 action='store_const', dest='format', help='download all available video formats', const='-1')
2139 video_format.add_option('--max-quality',
2140 action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
2141 video_format.add_option('-b', '--best-quality',
2142 action='store_true', dest='bestquality', help='download the best video quality (DEPRECATED)')
2143 parser.add_option_group(video_format)
2145 verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
2146 verbosity.add_option('-q', '--quiet',
2147 action='store_true', dest='quiet', help='activates quiet mode', default=False)
2148 verbosity.add_option('-s', '--simulate',
2149 action='store_true', dest='simulate', help='do not download video', default=False)
2150 verbosity.add_option('-g', '--get-url',
2151 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
2152 verbosity.add_option('-e', '--get-title',
2153 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
2154 verbosity.add_option('--get-thumbnail',
2155 action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
2156 verbosity.add_option('--get-description',
2157 action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
2158 verbosity.add_option('--no-progress',
2159 action='store_true', dest='noprogress', help='do not print progress bar', default=False)
2160 parser.add_option_group(verbosity)
2162 filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
2163 filesystem.add_option('-t', '--title',
2164 action='store_true', dest='usetitle', help='use title in file name', default=False)
2165 filesystem.add_option('-l', '--literal',
2166 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
2167 filesystem.add_option('-o', '--output',
2168 dest='outtmpl', metavar='TEMPLATE', help='output filename template')
2169 filesystem.add_option('-a', '--batch-file',
2170 dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
2171 filesystem.add_option('-w', '--no-overwrites',
2172 action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
2173 filesystem.add_option('-c', '--continue',
2174 action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
2175 parser.add_option_group(filesystem)
2177 (opts, args) = parser.parse_args()
2179 # Batch file verification
2181 if opts.batchfile is not None:
2183 if opts.batchfile == '-':
2186 batchfd = open(opts.batchfile, 'r')
2187 batchurls = batchfd.readlines()
2188 batchurls = [x.strip() for x in batchurls]
2189 batchurls = [x for x in batchurls if len(x) > 0]
2191 sys.exit(u'ERROR: batch file could not be read')
2192 all_urls = batchurls + args
2194 # Conflicting, missing and erroneous options
2195 if opts.bestquality:
2196 print >>sys.stderr, u'\nWARNING: -b/--best-quality IS DEPRECATED AS IT IS THE DEFAULT BEHAVIOR NOW\n'
2197 if opts.usenetrc and (opts.username is not None or opts.password is not None):
2198 parser.error(u'using .netrc conflicts with giving username/password')
2199 if opts.password is not None and opts.username is None:
2200 parser.error(u'account username missing')
2201 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
2202 parser.error(u'using output template conflicts with using title or literal title')
2203 if opts.usetitle and opts.useliteral:
2204 parser.error(u'using title conflicts with using literal title')
2205 if opts.username is not None and opts.password is None:
2206 opts.password = getpass.getpass(u'Type account password and press return:')
2207 if opts.ratelimit is not None:
2208 numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
2209 if numeric_limit is None:
2210 parser.error(u'invalid rate limit specified')
2211 opts.ratelimit = numeric_limit
2212 if opts.retries is not None:
2214 opts.retries = long(opts.retries)
2215 except (TypeError, ValueError), err:
2216 parser.error(u'invalid retry count specified')
2217 if opts.playliststart is not None:
2219 opts.playliststart = long(opts.playliststart)
2220 except (TypeError, ValueError), err:
2221 parser.error(u'invalid playlist page specified')
2223 # Information extractors
2224 youtube_ie = YoutubeIE()
2225 metacafe_ie = MetacafeIE(youtube_ie)
2226 dailymotion_ie = DailymotionIE()
2227 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
2228 youtube_user_ie = YoutubeUserIE(youtube_ie)
2229 youtube_search_ie = YoutubeSearchIE(youtube_ie)
2230 google_ie = GoogleIE()
2231 google_search_ie = GoogleSearchIE(google_ie)
2232 photobucket_ie = PhotobucketIE()
2233 yahoo_ie = YahooIE()
2234 yahoo_search_ie = YahooSearchIE(yahoo_ie)
2235 generic_ie = GenericIE()
2238 fd = FileDownloader({
2239 'usenetrc': opts.usenetrc,
2240 'username': opts.username,
2241 'password': opts.password,
2242 'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2243 'forceurl': opts.geturl,
2244 'forcetitle': opts.gettitle,
2245 'forcethumbnail': opts.getthumbnail,
2246 'forcedescription': opts.getdescription,
2247 'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
2248 'format': opts.format,
2249 'format_limit': opts.format_limit,
2250 'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
2251 or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
2252 or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
2253 or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
2254 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
2255 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
2256 or u'%(id)s.%(ext)s'),
2257 'ignoreerrors': opts.ignoreerrors,
2258 'ratelimit': opts.ratelimit,
2259 'nooverwrites': opts.nooverwrites,
2260 'retries': opts.retries,
2261 'continuedl': opts.continue_dl,
2262 'noprogress': opts.noprogress,
2263 'playliststart': opts.playliststart,
2265 fd.add_info_extractor(youtube_search_ie)
2266 fd.add_info_extractor(youtube_pl_ie)
2267 fd.add_info_extractor(youtube_user_ie)
2268 fd.add_info_extractor(metacafe_ie)
2269 fd.add_info_extractor(dailymotion_ie)
2270 fd.add_info_extractor(youtube_ie)
2271 fd.add_info_extractor(google_ie)
2272 fd.add_info_extractor(google_search_ie)
2273 fd.add_info_extractor(photobucket_ie)
2274 fd.add_info_extractor(yahoo_ie)
2275 fd.add_info_extractor(yahoo_search_ie)
2277 # This must come last since it's the
2278 # fallback if none of the others work
2279 fd.add_info_extractor(generic_ie)
2282 if opts.update_self:
2283 update_self(fd, sys.argv[0])
2286 if len(all_urls) < 1:
2287 if not opts.update_self:
2288 parser.error(u'you must provide at least one URL')
2291 retcode = fd.download(all_urls)
2294 except DownloadError:
2296 except SameFileError:
2297 sys.exit(u'ERROR: fixed output name but more than one file to download')
2298 except KeyboardInterrupt:
2299 sys.exit(u'\nERROR: Interrupted by user')