]>
Raphaƫl G. Git Repositories - youtubedl/blob - youtube-dl
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
24 # parse_qs was moved from the cgi module to the urlparse module recently.
26 from urlparse
import parse_qs
28 from cgi
import parse_qs
31 'User-Agent': 'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.12) Gecko/20101028 Firefox/3.6.12',
32 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
33 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
34 'Accept-Language': 'en-us,en;q=0.5',
37 simple_title_chars
= string
.ascii_letters
.decode('ascii') + string
.digits
.decode('ascii')
39 month_name_to_number
= {
54 def preferredencoding():
55 """Get preferred encoding.
57 Returns the best encoding scheme for the system, based on
58 locale.getpreferredencoding() and some further tweaks.
60 def yield_preferredencoding():
62 pref
= locale
.getpreferredencoding()
68 return yield_preferredencoding().next()
70 def htmlentity_transform(matchobj
):
71 """Transforms an HTML entity to a Unicode character.
73 This function receives a match object and is intended to be used with
74 the re.sub() function.
76 entity
= matchobj
.group(1)
78 # Known non-numeric HTML entity
79 if entity
in htmlentitydefs
.name2codepoint
:
80 return unichr(htmlentitydefs
.name2codepoint
[entity
])
83 mobj
= re
.match(ur
'(?u)#(x?\d+)', entity
)
85 numstr
= mobj
.group(1)
86 if numstr
.startswith(u
'x'):
88 numstr
= u
'0%s' % numstr
91 return unichr(long(numstr
, base
))
93 # Unknown entity in name, return its literal representation
94 return (u
'&%s;' % entity
)
96 def sanitize_title(utitle
):
97 """Sanitizes a video title so it could be used as part of a filename."""
98 utitle
= re
.sub(ur
'(?u)&(.+?);', htmlentity_transform
, utitle
)
99 return utitle
.replace(unicode(os
.sep
), u
'%')
101 def sanitize_open(filename
, open_mode
):
102 """Try to open the given filename, and slightly tweak it if this fails.
104 Attempts to open the given filename. If this fails, it tries to change
105 the filename slightly, step by step, until it's either able to open it
106 or it fails and raises a final exception, like the standard open()
109 It returns the tuple (stream, definitive_file_name).
113 if sys
.platform
== 'win32':
115 msvcrt
.setmode(sys
.stdout
.fileno(), os
.O_BINARY
)
116 return (sys
.stdout
, filename
)
117 stream
= open(filename
, open_mode
)
118 return (stream
, filename
)
119 except (IOError, OSError), err
:
120 # In case of error, try to remove win32 forbidden chars
121 filename
= re
.sub(ur
'[/<>:"\|\?\*]', u
'#', filename
)
123 # An exception here should be caught in the caller
124 stream
= open(filename
, open_mode
)
125 return (stream
, filename
)
128 class DownloadError(Exception):
129 """Download Error exception.
131 This exception may be thrown by FileDownloader objects if they are not
132 configured to continue on errors. They will contain the appropriate
137 class SameFileError(Exception):
138 """Same File exception.
140 This exception will be thrown by FileDownloader objects if they detect
141 multiple files would have to be downloaded to the same file on disk.
145 class PostProcessingError(Exception):
146 """Post Processing exception.
148 This exception may be raised by PostProcessor's .run() method to
149 indicate an error in the postprocessing task.
153 class UnavailableVideoError(Exception):
154 """Unavailable Format exception.
156 This exception will be thrown when a video is requested
157 in a format that is not available for that video.
161 class ContentTooShortError(Exception):
162 """Content Too Short exception.
164 This exception may be raised by FileDownloader objects when a file they
165 download is too small for what the server announced first, indicating
166 the connection was probably interrupted.
172 def __init__(self
, downloaded
, expected
):
173 self
.downloaded
= downloaded
174 self
.expected
= expected
176 class FileDownloader(object):
177 """File Downloader class.
179 File downloader objects are the ones responsible for downloading the
180 actual video file and writing it to disk if the user has requested
181 it, among some other tasks. In most cases there should be one per
182 program. As, given a video URL, the downloader doesn't know how to
183 extract all the needed information, task that InfoExtractors do, it
184 has to pass the URL to one of them.
186 For this, file downloader objects have a method that allows
187 InfoExtractors to be registered in a given order. When it is passed
188 a URL, the file downloader hands it to the first InfoExtractor it
189 finds that reports being able to handle it. The InfoExtractor extracts
190 all the information about the video or videos the URL refers to, and
191 asks the FileDownloader to process the video information, possibly
192 downloading the video.
194 File downloaders accept a lot of parameters. In order not to saturate
195 the object constructor with arguments, it receives a dictionary of
196 options instead. These options are available through the params
197 attribute for the InfoExtractors to use. The FileDownloader also
198 registers itself as the downloader in charge for the InfoExtractors
199 that are added to it, so this is a "mutual registration".
203 username: Username for authentication purposes.
204 password: Password for authentication purposes.
205 usenetrc: Use netrc for authentication instead.
206 quiet: Do not print messages to stdout.
207 forceurl: Force printing final URL.
208 forcetitle: Force printing title.
209 forcethumbnail: Force printing thumbnail URL.
210 forcedescription: Force printing description.
211 simulate: Do not download the video files.
212 format: Video format code.
213 format_limit: Highest quality format to try.
214 outtmpl: Template for output names.
215 ignoreerrors: Do not stop on download errors.
216 ratelimit: Download speed limit, in bytes/sec.
217 nooverwrites: Prevent overwriting files.
218 retries: Number of times to retry for HTTP error 5xx
219 continuedl: Try to continue downloads if possible.
220 noprogress: Do not print the progress bar.
221 playliststart: Playlist item to start at.
222 playlistend: Playlist item to end at.
223 logtostderr: Log messages to stderr instead of stdout.
229 _download_retcode
= None
230 _num_downloads
= None
233 def __init__(self
, params
):
234 """Create a FileDownloader object with the given options."""
237 self
._download
_retcode
= 0
238 self
._num
_downloads
= 0
239 self
._screen
_file
= [sys
.stdout
, sys
.stderr
][params
.get('logtostderr', False)]
243 def pmkdir(filename
):
244 """Create directory components in filename. Similar to Unix "mkdir -p"."""
245 components
= filename
.split(os
.sep
)
246 aggregate
= [os
.sep
.join(components
[0:x
]) for x
in xrange(1, len(components
))]
247 aggregate
= ['%s%s' % (x
, os
.sep
) for x
in aggregate
] # Finish names with separator
248 for dir in aggregate
:
249 if not os
.path
.exists(dir):
253 def format_bytes(bytes):
256 if type(bytes) is str:
261 exponent
= long(math
.log(bytes, 1024.0))
262 suffix
= 'bkMGTPEZY'[exponent
]
263 converted
= float(bytes) / float(1024**exponent
)
264 return '%.2f%s' % (converted
, suffix
)
267 def calc_percent(byte_counter
, data_len
):
270 return '%6s' % ('%3.1f%%' % (float(byte_counter
) / float(data_len
) * 100.0))
273 def calc_eta(start
, now
, total
, current
):
277 if current
== 0 or dif
< 0.001: # One millisecond
279 rate
= float(current
) / dif
280 eta
= long((float(total
) - float(current
)) / rate
)
281 (eta_mins
, eta_secs
) = divmod(eta
, 60)
284 return '%02d:%02d' % (eta_mins
, eta_secs
)
287 def calc_speed(start
, now
, bytes):
289 if bytes == 0 or dif
< 0.001: # One millisecond
290 return '%10s' % '---b/s'
291 return '%10s' % ('%s/s' % FileDownloader
.format_bytes(float(bytes) / dif
))
294 def best_block_size(elapsed_time
, bytes):
295 new_min
= max(bytes / 2.0, 1.0)
296 new_max
= min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
297 if elapsed_time
< 0.001:
299 rate
= bytes / elapsed_time
307 def parse_bytes(bytestr
):
308 """Parse a string indicating a byte quantity into a long integer."""
309 matchobj
= re
.match(r
'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr
)
312 number
= float(matchobj
.group(1))
313 multiplier
= 1024.0 ** 'bkmgtpezy'.index(matchobj
.group(2).lower())
314 return long(round(number
* multiplier
))
316 def add_info_extractor(self
, ie
):
317 """Add an InfoExtractor object to the end of the list."""
319 ie
.set_downloader(self
)
321 def add_post_processor(self
, pp
):
322 """Add a PostProcessor object to the end of the chain."""
324 pp
.set_downloader(self
)
326 def to_screen(self
, message
, skip_eol
=False, ignore_encoding_errors
=False):
327 """Print message to stdout if not in quiet mode."""
329 if not self
.params
.get('quiet', False):
330 terminator
= [u
'\n', u
''][skip_eol
]
331 print >>self
._screen
_file
, (u
'%s%s' % (message
, terminator
)).encode(preferredencoding()),
332 self
._screen
_file
.flush()
333 except (UnicodeEncodeError), err
:
334 if not ignore_encoding_errors
:
def to_stderr(self, message):
    """Write message to stderr, encoded with the preferred encoding."""
    encoded = message.encode(preferredencoding())
    print >>sys.stderr, encoded
341 def fixed_template(self
):
342 """Checks if the output template is fixed."""
343 return (re
.search(ur
'(?u)%\(.+?\)s', self
.params
['outtmpl']) is None)
345 def trouble(self
, message
=None):
346 """Determine action to take when a download problem appears.
348 Depending on if the downloader has been configured to ignore
349 download errors or not, this method may throw an exception or
350 not when errors are found, after printing the message.
352 if message
is not None:
353 self
.to_stderr(message
)
354 if not self
.params
.get('ignoreerrors', False):
355 raise DownloadError(message
)
356 self
._download
_retcode
= 1
358 def slow_down(self
, start_time
, byte_counter
):
359 """Sleep if the download speed is over the rate limit."""
360 rate_limit
= self
.params
.get('ratelimit', None)
361 if rate_limit
is None or byte_counter
== 0:
364 elapsed
= now
- start_time
367 speed
= float(byte_counter
) / elapsed
368 if speed
> rate_limit
:
369 time
.sleep((byte_counter
- rate_limit
* (now
- start_time
)) / rate_limit
)
def report_destination(self, filename):
    """Announce the name of the file the download is written to."""
    notice = u'[download] Destination: %s' % filename
    self.to_screen(notice, ignore_encoding_errors=True)
375 def report_progress(self
, percent_str
, data_len_str
, speed_str
, eta_str
):
376 """Report download progress."""
377 if self
.params
.get('noprogress', False):
379 self
.to_screen(u
'\r[download] %s of %s at %s ETA %s' %
380 (percent_str
, data_len_str
, speed_str
, eta_str
), skip_eol
=True)
def report_resuming_byte(self, resume_len):
    """Announce the byte offset at which the download is resumed."""
    notice = u'[download] Resuming download at byte %s' % resume_len
    self.to_screen(notice)
def report_retry(self, count, retries):
    """Announce a retry after a server HTTP error.

    count is the number of the current attempt and retries the total
    number of attempts reported alongside it.
    """
    attempt = (count, retries)
    self.to_screen(u'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % attempt)
390 def report_file_already_downloaded(self
, file_name
):
391 """Report file has already been fully downloaded."""
393 self
.to_screen(u
'[download] %s has already been downloaded' % file_name
)
394 except (UnicodeEncodeError), err
:
395 self
.to_screen(u
'[download] The file has already been downloaded')
def report_unable_to_resume(self):
    """Announce that the download could not be resumed."""
    notice = u'[download] Unable to resume'
    self.to_screen(notice)
401 def report_finish(self
):
402 """Report download finished."""
403 if self
.params
.get('noprogress', False):
404 self
.to_screen(u
'[download] Download completed')
408 def increment_downloads(self
):
409 """Increment the ordinal that assigns a number to each file."""
410 self
._num
_downloads
+= 1
412 def process_info(self
, info_dict
):
413 """Process a single dictionary returned by an InfoExtractor."""
414 # Do nothing else if in simulate mode
415 if self
.params
.get('simulate', False):
417 if self
.params
.get('forcetitle', False):
418 print info_dict
['title'].encode(preferredencoding(), 'xmlcharrefreplace')
419 if self
.params
.get('forceurl', False):
420 print info_dict
['url'].encode(preferredencoding(), 'xmlcharrefreplace')
421 if self
.params
.get('forcethumbnail', False) and 'thumbnail' in info_dict
:
422 print info_dict
['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
423 if self
.params
.get('forcedescription', False) and 'description' in info_dict
:
424 print info_dict
['description'].encode(preferredencoding(), 'xmlcharrefreplace')
429 template_dict
= dict(info_dict
)
430 template_dict
['epoch'] = unicode(long(time
.time()))
431 template_dict
['autonumber'] = unicode('%05d' % self
._num
_downloads
)
432 filename
= self
.params
['outtmpl'] % template_dict
433 except (ValueError, KeyError), err
:
434 self
.trouble(u
'ERROR: invalid system charset or erroneous output template')
436 if self
.params
.get('nooverwrites', False) and os
.path
.exists(filename
):
437 self
.to_stderr(u
'WARNING: file exists and will be skipped')
441 self
.pmkdir(filename
)
442 except (OSError, IOError), err
:
443 self
.trouble(u
'ERROR: unable to create directories: %s' % str(err
))
447 success
= self
._do
_download
(filename
, info_dict
['url'].encode('utf-8'), info_dict
.get('player_url', None))
448 except (OSError, IOError), err
:
449 raise UnavailableVideoError
450 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
451 self
.trouble(u
'ERROR: unable to download video data: %s' % str(err
))
453 except (ContentTooShortError
, ), err
:
454 self
.trouble(u
'ERROR: content too short (expected %s bytes and served %s)' % (err
.expected
, err
.downloaded
))
459 self
.post_process(filename
, info_dict
)
460 except (PostProcessingError
), err
:
461 self
.trouble(u
'ERROR: postprocessing: %s' % str(err
))
464 def download(self
, url_list
):
465 """Download a given list of URLs."""
466 if len(url_list
) > 1 and self
.fixed_template():
467 raise SameFileError(self
.params
['outtmpl'])
470 suitable_found
= False
472 # Go to next InfoExtractor if not suitable
473 if not ie
.suitable(url
):
476 # Suitable InfoExtractor found
477 suitable_found
= True
479 # Extract information from URL and process it
482 # Suitable InfoExtractor had been found; go to next URL
485 if not suitable_found
:
486 self
.trouble(u
'ERROR: no suitable InfoExtractor: %s' % url
)
488 return self
._download
_retcode
490 def post_process(self
, filename
, ie_info
):
491 """Run the postprocessing chain on the given file."""
493 info
['filepath'] = filename
499 def _download_with_rtmpdump(self
, filename
, url
, player_url
):
500 self
.report_destination(filename
)
502 # Check for rtmpdump first
504 subprocess
.call(['rtmpdump', '-h'], stdout
=(file(os
.path
.devnull
, 'w')), stderr
=subprocess
.STDOUT
)
505 except (OSError, IOError):
506 self
.trouble(u
'ERROR: RTMP download detected but "rtmpdump" could not be run')
509 # Download using rtmpdump. rtmpdump returns exit code 2 when
510 # the connection was interrupted and resuming appears to be
511 # possible. This is part of rtmpdump's normal usage, AFAIK.
512 basic_args
= ['rtmpdump', '-q'] + [[], ['-W', player_url
]][player_url
is not None] + ['-r', url
, '-o', filename
]
513 retval
= subprocess
.call(basic_args
+ [[], ['-e', '-k', '1']][self
.params
.get('continuedl', False)])
514 while retval
== 2 or retval
== 1:
515 prevsize
= os
.path
.getsize(filename
)
516 self
.to_screen(u
'\r[rtmpdump] %s bytes' % prevsize
, skip_eol
=True)
517 time
.sleep(5.0) # This seems to be needed
518 retval
= subprocess
.call(basic_args
+ ['-e'] + [[], ['-k', '1']][retval
== 1])
519 cursize
= os
.path
.getsize(filename
)
520 if prevsize
== cursize
and retval
== 1:
523 self
.to_screen(u
'\r[rtmpdump] %s bytes' % os
.path
.getsize(filename
))
526 self
.trouble(u
'\nERROR: rtmpdump exited with code %d' % retval
)
529 def _do_download(self
, filename
, url
, player_url
):
530 # Attempt to download using rtmpdump
531 if url
.startswith('rtmp'):
532 return self
._download
_with
_rtmpdump
(filename
, url
, player_url
)
536 basic_request
= urllib2
.Request(url
, None, std_headers
)
537 request
= urllib2
.Request(url
, None, std_headers
)
539 # Establish possible resume length
540 if os
.path
.isfile(filename
):
541 resume_len
= os
.path
.getsize(filename
)
545 # Request parameters in case of being able to resume
546 if self
.params
.get('continuedl', False) and resume_len
!= 0:
547 self
.report_resuming_byte(resume_len
)
548 request
.add_header('Range','bytes=%d-' % resume_len
)
552 retries
= self
.params
.get('retries', 0)
553 while count
<= retries
:
554 # Establish connection
556 data
= urllib2
.urlopen(request
)
558 except (urllib2
.HTTPError
, ), err
:
559 if (err
.code
< 500 or err
.code
>= 600) and err
.code
!= 416:
560 # Unexpected HTTP error
562 elif err
.code
== 416:
563 # Unable to resume (requested range not satisfiable)
565 # Open the connection again without the range header
566 data
= urllib2
.urlopen(basic_request
)
567 content_length
= data
.info()['Content-Length']
568 except (urllib2
.HTTPError
, ), err
:
569 if err
.code
< 500 or err
.code
>= 600:
572 # Examine the reported length
573 if (content_length
is not None and
574 (resume_len
- 100 < long(content_length
) < resume_len
+ 100)):
575 # The file had already been fully downloaded.
576 # Explanation to the above condition: in issue #175 it was revealed that
577 # YouTube sometimes adds or removes a few bytes from the end of the file,
578 # changing the file size slightly and causing problems for some users. So
579 # I decided to implement a suggested change and consider the file
580 # completely downloaded if the file size differs less than 100 bytes from
581 # the one in the hard drive.
582 self
.report_file_already_downloaded(filename
)
585 # The length does not match, we start the download over
586 self
.report_unable_to_resume()
592 self
.report_retry(count
, retries
)
595 self
.trouble(u
'ERROR: giving up after %s retries' % retries
)
598 data_len
= data
.info().get('Content-length', None)
599 data_len_str
= self
.format_bytes(data_len
)
606 data_block
= data
.read(block_size
)
608 data_block_len
= len(data_block
)
609 if data_block_len
== 0:
611 byte_counter
+= data_block_len
613 # Open file just in time
616 (stream
, filename
) = sanitize_open(filename
, open_mode
)
617 self
.report_destination(filename
)
618 except (OSError, IOError), err
:
619 self
.trouble(u
'ERROR: unable to open for writing: %s' % str(err
))
622 stream
.write(data_block
)
623 except (IOError, OSError), err
:
624 self
.trouble(u
'\nERROR: unable to write data: %s' % str(err
))
626 block_size
= self
.best_block_size(after
- before
, data_block_len
)
629 percent_str
= self
.calc_percent(byte_counter
, data_len
)
630 eta_str
= self
.calc_eta(start
, time
.time(), data_len
, byte_counter
)
631 speed_str
= self
.calc_speed(start
, time
.time(), byte_counter
)
632 self
.report_progress(percent_str
, data_len_str
, speed_str
, eta_str
)
635 self
.slow_down(start
, byte_counter
)
638 if data_len
is not None and str(byte_counter
) != data_len
:
639 raise ContentTooShortError(byte_counter
, long(data_len
))
642 class InfoExtractor(object):
643 """Information Extractor class.
645 Information extractors are the classes that, given a URL, extract
646 information from the video (or videos) the URL refers to. This
647 information includes the real video URL, the video title and simplified
648 title, author and others. The information is stored in a dictionary
649 which is then passed to the FileDownloader. The FileDownloader
650 processes this information possibly downloading the video to the file
651 system, among other possible outcomes. The dictionaries must include
652 the following fields:
654 id: Video identifier.
655 url: Final video URL.
656 uploader: Nickname of the video uploader.
657 title: Literal title.
658 stitle: Simplified title.
659 ext: Video filename extension.
660 format: Video format.
661 player_url: SWF Player URL (may be None).
663 The following fields are optional. Their primary purpose is to allow
664 youtube-dl to serve as the backend for a video search function, such
665 as the one in youtube2mp3. They are only used when their respective
666 forced printing functions are called:
668 thumbnail: Full URL to a video thumbnail image.
669 description: One-line video description.
671 Subclasses of this one should re-define the _real_initialize() and
672 _real_extract() methods, as well as the suitable() static method.
673 Probably, they should also be instantiated and added to the main
680 def __init__(self
, downloader
=None):
681 """Constructor. Receives an optional downloader."""
683 self
.set_downloader(downloader
)
687 """Receives a URL and returns True if suitable for this IE."""
690 def initialize(self
):
691 """Initializes an instance (authentication, etc)."""
693 self
._real
_initialize
()
696 def extract(self
, url
):
697 """Extracts URL information and returns it in list of dicts."""
699 return self
._real
_extract
(url
)
701 def set_downloader(self
, downloader
):
702 """Sets the downloader for this IE."""
703 self
._downloader
= downloader
705 def _real_initialize(self
):
706 """Real initialization process. Redefine in subclasses."""
709 def _real_extract(self
, url
):
710 """Real extraction process. Redefine in subclasses."""
713 class YoutubeIE(InfoExtractor
):
714 """Information extractor for youtube.com."""
716 _VALID_URL
= r
'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/(?:(?:v/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))))?([0-9A-Za-z_-]+)(?(1).+)?$'
717 _LANG_URL
= r
'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
718 _LOGIN_URL
= 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
719 _AGE_URL
= 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
720 _NETRC_MACHINE
= 'youtube'
721 # Listed in order of quality
722 _available_formats
= ['38', '37', '22', '45', '35', '34', '43', '18', '6', '5', '17', '13']
723 _video_extensions
= {
729 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
736 return (re
.match(YoutubeIE
._VALID
_URL
, url
) is not None)
def report_lang(self):
    """Announce that the site language is being set."""
    notice = u'[youtube] Setting language'
    self._downloader.to_screen(notice)
def report_login(self):
    """Announce that a login is being attempted."""
    notice = u'[youtube] Logging in'
    self._downloader.to_screen(notice)
def report_age_confirmation(self):
    """Announce that age confirmation is being attempted."""
    notice = u'[youtube] Confirming age'
    self._downloader.to_screen(notice)
def report_video_webpage_download(self, video_id):
    """Announce that the video webpage is being downloaded."""
    notice = u'[youtube] %s: Downloading video webpage' % video_id
    self._downloader.to_screen(notice)
def report_video_info_webpage_download(self, video_id):
    """Announce that the video info webpage is being downloaded."""
    notice = u'[youtube] %s: Downloading video info webpage' % video_id
    self._downloader.to_screen(notice)
def report_information_extraction(self, video_id):
    """Announce that video information is being extracted."""
    notice = u'[youtube] %s: Extracting video information' % video_id
    self._downloader.to_screen(notice)
def report_unavailable_format(self, video_id, format):
    """Announce that the given format is not available for the video."""
    details = (video_id, format)
    self._downloader.to_screen(u'[youtube] %s: Format %s not available' % details)
def report_rtmp_download(self):
    """Announce that the download will go through the RTMP protocol."""
    notice = u'[youtube] RTMP download detected'
    self._downloader.to_screen(notice)
770 def _real_initialize(self
):
771 if self
._downloader
is None:
776 downloader_params
= self
._downloader
.params
778 # Attempt to use provided username and password or .netrc data
779 if downloader_params
.get('username', None) is not None:
780 username
= downloader_params
['username']
781 password
= downloader_params
['password']
782 elif downloader_params
.get('usenetrc', False):
784 info
= netrc
.netrc().authenticators(self
._NETRC
_MACHINE
)
789 raise netrc
.NetrcParseError('No authenticators for %s' % self
._NETRC
_MACHINE
)
790 except (IOError, netrc
.NetrcParseError
), err
:
791 self
._downloader
.to_stderr(u
'WARNING: parsing .netrc: %s' % str(err
))
795 request
= urllib2
.Request(self
._LANG
_URL
, None, std_headers
)
798 urllib2
.urlopen(request
).read()
799 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
800 self
._downloader
.to_stderr(u
'WARNING: unable to set language: %s' % str(err
))
803 # No authentication to be performed
809 'current_form': 'loginForm',
811 'action_login': 'Log In',
812 'username': username
,
813 'password': password
,
815 request
= urllib2
.Request(self
._LOGIN
_URL
, urllib
.urlencode(login_form
), std_headers
)
818 login_results
= urllib2
.urlopen(request
).read()
819 if re
.search(r
'(?i)<form[^>]* name="loginForm"', login_results
) is not None:
820 self
._downloader
.to_stderr(u
'WARNING: unable to log in: bad username or password')
822 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
823 self
._downloader
.to_stderr(u
'WARNING: unable to log in: %s' % str(err
))
829 'action_confirm': 'Confirm',
831 request
= urllib2
.Request(self
._AGE
_URL
, urllib
.urlencode(age_form
), std_headers
)
833 self
.report_age_confirmation()
834 age_results
= urllib2
.urlopen(request
).read()
835 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
836 self
._downloader
.trouble(u
'ERROR: unable to confirm age: %s' % str(err
))
839 def _real_extract(self
, url
):
840 # Extract video id from URL
841 mobj
= re
.match(self
._VALID
_URL
, url
)
843 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
845 video_id
= mobj
.group(2)
848 self
.report_video_webpage_download(video_id
)
849 request
= urllib2
.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
, None, std_headers
)
851 video_webpage
= urllib2
.urlopen(request
).read()
852 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
853 self
._downloader
.trouble(u
'ERROR: unable to download video webpage: %s' % str(err
))
856 # Attempt to extract SWF player URL
857 mobj
= re
.search(r
'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage
)
859 player_url
= re
.sub(r
'\\(.)', r
'\1', mobj
.group(1))
864 self
.report_video_info_webpage_download(video_id
)
865 for el_type
in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
866 video_info_url
= ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
867 % (video_id
, el_type
))
868 request
= urllib2
.Request(video_info_url
, None, std_headers
)
870 video_info_webpage
= urllib2
.urlopen(request
).read()
871 video_info
= parse_qs(video_info_webpage
)
872 if 'token' in video_info
:
874 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
875 self
._downloader
.trouble(u
'ERROR: unable to download video info webpage: %s' % str(err
))
877 if 'token' not in video_info
:
878 if 'reason' in video_info
:
879 self
._downloader
.trouble(u
'ERROR: YouTube said: %s' % video_info
['reason'][0].decode('utf-8'))
881 self
._downloader
.trouble(u
'ERROR: "token" parameter not in video info for unknown reason')
884 # Start extracting information
885 self
.report_information_extraction(video_id
)
888 if 'author' not in video_info
:
889 self
._downloader
.trouble(u
'ERROR: unable to extract uploader nickname')
891 video_uploader
= urllib
.unquote_plus(video_info
['author'][0])
894 if 'title' not in video_info
:
895 self
._downloader
.trouble(u
'ERROR: unable to extract video title')
897 video_title
= urllib
.unquote_plus(video_info
['title'][0])
898 video_title
= video_title
.decode('utf-8')
899 video_title
= sanitize_title(video_title
)
902 simple_title
= re
.sub(ur
'(?u)([^%s]+)' % simple_title_chars
, ur
'_', video_title
)
903 simple_title
= simple_title
.strip(ur
'_')
906 if 'thumbnail_url' not in video_info
:
907 self
._downloader
.trouble(u
'WARNING: unable to extract video thumbnail')
909 else: # don't panic if we can't find it
910 video_thumbnail
= urllib
.unquote_plus(video_info
['thumbnail_url'][0])
914 mobj
= re
.search(r
'id="eow-date".*?>(.*?)</span>', video_webpage
, re
.DOTALL
)
917 if ',' in mobj
.group(1):
919 m
, d
, y
= mobj
.group(1).replace(',', '').split()
921 # Day Month Year, we'll suppose
922 d
, m
, y
= mobj
.group(1).split()
923 m
= month_name_to_number
[m
]
924 d
= '%02d' % (long(d
))
925 upload_date
= '%s%s%s' % (y
, m
, d
)
930 video_description
= 'No description available.'
931 if self
._downloader
.params
.get('forcedescription', False):
932 mobj
= re
.search(r
'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage
)
934 video_description
= mobj
.group(1)
937 video_token
= urllib
.unquote_plus(video_info
['token'][0])
939 # Decide which formats to download
940 requested_format
= self
._downloader
.params
.get('format', None)
941 get_video_template
= 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=&ps=&asv=&fmt=%%s' % (video_id
, video_token
)
943 if 'fmt_url_map' in video_info
:
944 url_map
= dict(tuple(pair
.split('|')) for pair
in video_info
['fmt_url_map'][0].split(','))
945 format_limit
= self
._downloader
.params
.get('format_limit', None)
946 if format_limit
is not None and format_limit
in self
._available
_formats
:
947 format_list
= self
._available
_formats
[self
._available
_formats
.index(format_limit
):]
949 format_list
= self
._available
_formats
950 existing_formats
= [x
for x
in format_list
if x
in url_map
]
951 if len(existing_formats
) == 0:
952 self
._downloader
.trouble(u
'ERROR: no known formats available for video')
954 if requested_format
is None:
955 video_url_list
= [(existing_formats
[0], get_video_template
% existing_formats
[0])] # Best quality
956 elif requested_format
== '-1':
957 video_url_list
= [(f
, get_video_template
% f
) for f
in existing_formats
] # All formats
959 video_url_list
= [(requested_format
, get_video_template
% requested_format
)] # Specific format
961 elif 'conn' in video_info
and video_info
['conn'][0].startswith('rtmp'):
962 self
.report_rtmp_download()
963 video_url_list
= [(None, video_info
['conn'][0])]
966 self
._downloader
.trouble(u
'ERROR: no fmt_url_map or conn information found in video info')
969 for format_param
, video_real_url
in video_url_list
:
970 # At this point we have a new video
971 self
._downloader
.increment_downloads()
974 video_extension
= self
._video
_extensions
.get(format_param
, 'flv')
976 # Find the video URL in fmt_url_map or conn parameters
978 # Process video information
979 self
._downloader
.process_info({
980 'id': video_id
.decode('utf-8'),
981 'url': video_real_url
.decode('utf-8'),
982 'uploader': video_uploader
.decode('utf-8'),
983 'upload_date': upload_date
,
984 'title': video_title
,
985 'stitle': simple_title
,
986 'ext': video_extension
.decode('utf-8'),
987 'format': (format_param
is None and u
'NA' or format_param
.decode('utf-8')),
988 'thumbnail': video_thumbnail
.decode('utf-8'),
989 'description': video_description
.decode('utf-8'),
990 'player_url': player_url
,
992 except UnavailableVideoError
, err
:
993 self
._downloader
.trouble(u
'ERROR: unable to download video (format may not be available)')
996 class MetacafeIE(InfoExtractor
):
997 """Information Extractor for metacafe.com."""
999 _VALID_URL
= r
'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
1000 _DISCLAIMER
= 'http://www.metacafe.com/family_filter/'
1001 _FILTER_POST
= 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
def __init__(self, youtube_ie, downloader=None):
    """Set up the extractor.

    youtube_ie is stored on the instance as _youtube_ie; downloader is
    passed on to the InfoExtractor constructor.
    """
    InfoExtractor.__init__(self, downloader=downloader)
    setattr(self, '_youtube_ie', youtube_ie)
1010 return (re
.match(MetacafeIE
._VALID
_URL
, url
) is not None)
1012 def report_disclaimer(self
):
1013 """Report disclaimer retrieval."""
1014 self
._downloader
.to_screen(u
'[metacafe] Retrieving disclaimer')
1016 def report_age_confirmation(self
):
1017 """Report attempt to confirm age."""
1018 self
._downloader
.to_screen(u
'[metacafe] Confirming age')
1020 def report_download_webpage(self
, video_id
):
1021 """Report webpage download."""
1022 self
._downloader
.to_screen(u
'[metacafe] %s: Downloading webpage' % video_id
)
1024 def report_extraction(self
, video_id
):
1025 """Report information extraction."""
1026 self
._downloader
.to_screen(u
'[metacafe] %s: Extracting information' % video_id
)
1028 def _real_initialize(self
):
1029 # Retrieve disclaimer
1030 request
= urllib2
.Request(self
._DISCLAIMER
, None, std_headers
)
1032 self
.report_disclaimer()
1033 disclaimer
= urllib2
.urlopen(request
).read()
1034 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1035 self
._downloader
.trouble(u
'ERROR: unable to retrieve disclaimer: %s' % str(err
))
1041 'submit': "Continue - I'm over 18",
1043 request
= urllib2
.Request(self
._FILTER
_POST
, urllib
.urlencode(disclaimer_form
), std_headers
)
1045 self
.report_age_confirmation()
1046 disclaimer
= urllib2
.urlopen(request
).read()
1047 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1048 self
._downloader
.trouble(u
'ERROR: unable to confirm age: %s' % str(err
))
1051 def _real_extract(self
, url
):
1052 # Extract id and simplified title from URL
1053 mobj
= re
.match(self
._VALID
_URL
, url
)
1055 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
1058 video_id
= mobj
.group(1)
1060 # Check if video comes from YouTube
1061 mobj2
= re
.match(r
'^yt-(.*)$', video_id
)
1062 if mobj2
is not None:
1063 self
._youtube
_ie
.extract('http://www.youtube.com/watch?v=%s' % mobj2
.group(1))
1066 # At this point we have a new video
1067 self
._downloader
.increment_downloads()
1069 simple_title
= mobj
.group(2).decode('utf-8')
1071 # Retrieve video webpage to extract further information
1072 request
= urllib2
.Request('http://www.metacafe.com/watch/%s/' % video_id
)
1074 self
.report_download_webpage(video_id
)
1075 webpage
= urllib2
.urlopen(request
).read()
1076 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1077 self
._downloader
.trouble(u
'ERROR: unable retrieve video webpage: %s' % str(err
))
1080 # Extract URL, uploader and title from webpage
1081 self
.report_extraction(video_id
)
1082 mobj
= re
.search(r
'(?m)&mediaURL=([^&]+)', webpage
)
1083 if mobj
is not None:
1084 mediaURL
= urllib
.unquote(mobj
.group(1))
1085 video_extension
= mediaURL
[-3:]
1087 # Extract gdaKey if available
1088 mobj
= re
.search(r
'(?m)&gdaKey=(.*?)&', webpage
)
1090 video_url
= mediaURL
1092 gdaKey
= mobj
.group(1)
1093 video_url
= '%s?__gda__=%s' % (mediaURL
, gdaKey
)
1095 mobj
= re
.search(r
' name="flashvars" value="(.*?)"', webpage
)
1097 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
1099 vardict
= parse_qs(mobj
.group(1))
1100 if 'mediaData' not in vardict
:
1101 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
1103 mobj
= re
.search(r
'"mediaURL":"(http.*?)","key":"(.*?)"', vardict
['mediaData'][0])
1105 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
1107 mediaURL
= mobj
.group(1).replace('\\/', '/')
1108 video_extension
= mediaURL
[-3:]
1109 video_url
= '%s?__gda__=%s' % (mediaURL
, mobj
.group(2))
1111 mobj
= re
.search(r
'(?im)<title>(.*) - Video</title>', webpage
)
1113 self
._downloader
.trouble(u
'ERROR: unable to extract title')
1115 video_title
= mobj
.group(1).decode('utf-8')
1116 video_title
= sanitize_title(video_title
)
1118 mobj
= re
.search(r
'(?ms)By:\s*<a .*?>(.+?)<', webpage
)
1120 self
._downloader
.trouble(u
'ERROR: unable to extract uploader nickname')
1122 video_uploader
= mobj
.group(1)
1125 # Process video information
1126 self
._downloader
.process_info({
1127 'id': video_id
.decode('utf-8'),
1128 'url': video_url
.decode('utf-8'),
1129 'uploader': video_uploader
.decode('utf-8'),
1130 'upload_date': u
'NA',
1131 'title': video_title
,
1132 'stitle': simple_title
,
1133 'ext': video_extension
.decode('utf-8'),
1137 except UnavailableVideoError
:
1138 self
._downloader
.trouble(u
'ERROR: unable to download video')
1141 class DailymotionIE(InfoExtractor
):
1142 """Information Extractor for Dailymotion"""
1144 _VALID_URL
= r
'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
1146 def __init__(self
, downloader
=None):
1147 InfoExtractor
.__init
__(self
, downloader
)
1151 return (re
.match(DailymotionIE
._VALID
_URL
, url
) is not None)
1153 def report_download_webpage(self
, video_id
):
1154 """Report webpage download."""
1155 self
._downloader
.to_screen(u
'[dailymotion] %s: Downloading webpage' % video_id
)
1157 def report_extraction(self
, video_id
):
1158 """Report information extraction."""
1159 self
._downloader
.to_screen(u
'[dailymotion] %s: Extracting information' % video_id
)
1161 def _real_initialize(self
):
1164 def _real_extract(self
, url
):
1165 # Extract id and simplified title from URL
1166 mobj
= re
.match(self
._VALID
_URL
, url
)
1168 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
1171 # At this point we have a new video
1172 self
._downloader
.increment_downloads()
1173 video_id
= mobj
.group(1)
1175 simple_title
= mobj
.group(2).decode('utf-8')
1176 video_extension
= 'flv'
1178 # Retrieve video webpage to extract further information
1179 request
= urllib2
.Request(url
)
1181 self
.report_download_webpage(video_id
)
1182 webpage
= urllib2
.urlopen(request
).read()
1183 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1184 self
._downloader
.trouble(u
'ERROR: unable retrieve video webpage: %s' % str(err
))
1187 # Extract URL, uploader and title from webpage
1188 self
.report_extraction(video_id
)
1189 mobj
= re
.search(r
'(?i)addVariable\(\"video\"\s*,\s*\"([^\"]*)\"\)', webpage
)
1191 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
1193 mediaURL
= urllib
.unquote(mobj
.group(1))
1195 # if needed add http://www.dailymotion.com/ if relative URL
1197 video_url
= mediaURL
1199 # '<meta\s+name="title"\s+content="Dailymotion\s*[:\-]\s*(.*?)"\s*\/\s*>'
1200 mobj
= re
.search(r
'(?im)<title>Dailymotion\s*[\-:]\s*(.+?)</title>', webpage
)
1202 self
._downloader
.trouble(u
'ERROR: unable to extract title')
1204 video_title
= mobj
.group(1).decode('utf-8')
1205 video_title
= sanitize_title(video_title
)
1207 mobj
= re
.search(r
'(?im)<div class="dmco_html owner">.*?<a class="name" href="/.+?">(.+?)</a>', webpage
)
1209 self
._downloader
.trouble(u
'ERROR: unable to extract uploader nickname')
1211 video_uploader
= mobj
.group(1)
1214 # Process video information
1215 self
._downloader
.process_info({
1216 'id': video_id
.decode('utf-8'),
1217 'url': video_url
.decode('utf-8'),
1218 'uploader': video_uploader
.decode('utf-8'),
1219 'upload_date': u
'NA',
1220 'title': video_title
,
1221 'stitle': simple_title
,
1222 'ext': video_extension
.decode('utf-8'),
1226 except UnavailableVideoError
:
1227 self
._downloader
.trouble(u
'ERROR: unable to download video')
1229 class GoogleIE(InfoExtractor
):
1230 """Information extractor for video.google.com."""
1232 _VALID_URL
= r
'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1234 def __init__(self
, downloader
=None):
1235 InfoExtractor
.__init
__(self
, downloader
)
1239 return (re
.match(GoogleIE
._VALID
_URL
, url
) is not None)
1241 def report_download_webpage(self
, video_id
):
1242 """Report webpage download."""
1243 self
._downloader
.to_screen(u
'[video.google] %s: Downloading webpage' % video_id
)
1245 def report_extraction(self
, video_id
):
1246 """Report information extraction."""
1247 self
._downloader
.to_screen(u
'[video.google] %s: Extracting information' % video_id
)
1249 def _real_initialize(self
):
1252 def _real_extract(self
, url
):
1253 # Extract id from URL
1254 mobj
= re
.match(self
._VALID
_URL
, url
)
1256 self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
)
1259 # At this point we have a new video
1260 self
._downloader
.increment_downloads()
1261 video_id
= mobj
.group(1)
1263 video_extension
= 'mp4'
1265 # Retrieve video webpage to extract further information
1266 request
= urllib2
.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id
)
1268 self
.report_download_webpage(video_id
)
1269 webpage
= urllib2
.urlopen(request
).read()
1270 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1271 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1274 # Extract URL, uploader, and title from webpage
1275 self
.report_extraction(video_id
)
1276 mobj
= re
.search(r
"download_url:'([^']+)'", webpage
)
1278 video_extension
= 'flv'
1279 mobj
= re
.search(r
"(?i)videoUrl\\x3d(.+?)\\x26", webpage
)
1281 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
1283 mediaURL
= urllib
.unquote(mobj
.group(1))
1284 mediaURL
= mediaURL
.replace('\\x3d', '\x3d')
1285 mediaURL
= mediaURL
.replace('\\x26', '\x26')
1287 video_url
= mediaURL
1289 mobj
= re
.search(r
'<title>(.*)</title>', webpage
)
1291 self
._downloader
.trouble(u
'ERROR: unable to extract title')
1293 video_title
= mobj
.group(1).decode('utf-8')
1294 video_title
= sanitize_title(video_title
)
1295 simple_title
= re
.sub(ur
'(?u)([^%s]+)' % simple_title_chars
, ur
'_', video_title
)
1297 # Extract video description
1298 mobj
= re
.search(r
'<span id=short-desc-content>([^<]*)</span>', webpage
)
1300 self
._downloader
.trouble(u
'ERROR: unable to extract video description')
1302 video_description
= mobj
.group(1).decode('utf-8')
1303 if not video_description
:
1304 video_description
= 'No description available.'
1306 # Extract video thumbnail
1307 if self
._downloader
.params
.get('forcethumbnail', False):
1308 request
= urllib2
.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id
)))
1310 webpage
= urllib2
.urlopen(request
).read()
1311 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1312 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1314 mobj
= re
.search(r
'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage
)
1316 self
._downloader
.trouble(u
'ERROR: unable to extract video thumbnail')
1318 video_thumbnail
= mobj
.group(1)
1319 else: # we need something to pass to process_info
1320 video_thumbnail
= ''
1324 # Process video information
1325 self
._downloader
.process_info({
1326 'id': video_id
.decode('utf-8'),
1327 'url': video_url
.decode('utf-8'),
1329 'upload_date': u
'NA',
1330 'title': video_title
,
1331 'stitle': simple_title
,
1332 'ext': video_extension
.decode('utf-8'),
1336 except UnavailableVideoError
:
1337 self
._downloader
.trouble(u
'ERROR: unable to download video')
1340 class PhotobucketIE(InfoExtractor
):
1341 """Information extractor for photobucket.com."""
1343 _VALID_URL
= r
'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1345 def __init__(self
, downloader
=None):
1346 InfoExtractor
.__init
__(self
, downloader
)
1350 return (re
.match(PhotobucketIE
._VALID
_URL
, url
) is not None)
1352 def report_download_webpage(self
, video_id
):
1353 """Report webpage download."""
1354 self
._downloader
.to_screen(u
'[photobucket] %s: Downloading webpage' % video_id
)
1356 def report_extraction(self
, video_id
):
1357 """Report information extraction."""
1358 self
._downloader
.to_screen(u
'[photobucket] %s: Extracting information' % video_id
)
1360 def _real_initialize(self
):
1363 def _real_extract(self
, url
):
1364 # Extract id from URL
1365 mobj
= re
.match(self
._VALID
_URL
, url
)
1367 self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
)
1370 # At this point we have a new video
1371 self
._downloader
.increment_downloads()
1372 video_id
= mobj
.group(1)
1374 video_extension
= 'flv'
1376 # Retrieve video webpage to extract further information
1377 request
= urllib2
.Request(url
)
1379 self
.report_download_webpage(video_id
)
1380 webpage
= urllib2
.urlopen(request
).read()
1381 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1382 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1385 # Extract URL, uploader, and title from webpage
1386 self
.report_extraction(video_id
)
1387 mobj
= re
.search(r
'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage
)
1389 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
1391 mediaURL
= urllib
.unquote(mobj
.group(1))
1393 video_url
= mediaURL
1395 mobj
= re
.search(r
'<title>(.*) video by (.*) - Photobucket</title>', webpage
)
1397 self
._downloader
.trouble(u
'ERROR: unable to extract title')
1399 video_title
= mobj
.group(1).decode('utf-8')
1400 video_title
= sanitize_title(video_title
)
1401 simple_title
= re
.sub(ur
'(?u)([^%s]+)' % simple_title_chars
, ur
'_', video_title
)
1403 video_uploader
= mobj
.group(2).decode('utf-8')
1406 # Process video information
1407 self
._downloader
.process_info({
1408 'id': video_id
.decode('utf-8'),
1409 'url': video_url
.decode('utf-8'),
1410 'uploader': video_uploader
,
1411 'upload_date': u
'NA',
1412 'title': video_title
,
1413 'stitle': simple_title
,
1414 'ext': video_extension
.decode('utf-8'),
1418 except UnavailableVideoError
:
1419 self
._downloader
.trouble(u
'ERROR: unable to download video')
1422 class YahooIE(InfoExtractor
):
1423 """Information extractor for video.yahoo.com."""
1425 # _VALID_URL matches all Yahoo! Video URLs
1426 # _VPAGE_URL matches only the extractable '/watch/' URLs
1427 _VALID_URL
= r
'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1428 _VPAGE_URL
= r
'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1430 def __init__(self
, downloader
=None):
1431 InfoExtractor
.__init
__(self
, downloader
)
1435 return (re
.match(YahooIE
._VALID
_URL
, url
) is not None)
1437 def report_download_webpage(self
, video_id
):
1438 """Report webpage download."""
1439 self
._downloader
.to_screen(u
'[video.yahoo] %s: Downloading webpage' % video_id
)
1441 def report_extraction(self
, video_id
):
1442 """Report information extraction."""
1443 self
._downloader
.to_screen(u
'[video.yahoo] %s: Extracting information' % video_id
)
1445 def _real_initialize(self
):
1448 def _real_extract(self
, url
, new_video
=True):
1449 # Extract ID from URL
1450 mobj
= re
.match(self
._VALID
_URL
, url
)
1452 self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
)
1455 # At this point we have a new video
1456 self
._downloader
.increment_downloads()
1457 video_id
= mobj
.group(2)
1458 video_extension
= 'flv'
1460 # Rewrite valid but non-extractable URLs as
1461 # extractable English language /watch/ URLs
1462 if re
.match(self
._VPAGE
_URL
, url
) is None:
1463 request
= urllib2
.Request(url
)
1465 webpage
= urllib2
.urlopen(request
).read()
1466 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1467 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1470 mobj
= re
.search(r
'\("id", "([0-9]+)"\);', webpage
)
1472 self
._downloader
.trouble(u
'ERROR: Unable to extract id field')
1474 yahoo_id
= mobj
.group(1)
1476 mobj
= re
.search(r
'\("vid", "([0-9]+)"\);', webpage
)
1478 self
._downloader
.trouble(u
'ERROR: Unable to extract vid field')
1480 yahoo_vid
= mobj
.group(1)
1482 url
= 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid
, yahoo_id
)
1483 return self
._real
_extract
(url
, new_video
=False)
1485 # Retrieve video webpage to extract further information
1486 request
= urllib2
.Request(url
)
1488 self
.report_download_webpage(video_id
)
1489 webpage
= urllib2
.urlopen(request
).read()
1490 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1491 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1494 # Extract uploader and title from webpage
1495 self
.report_extraction(video_id
)
1496 mobj
= re
.search(r
'<meta name="title" content="(.*)" />', webpage
)
1498 self
._downloader
.trouble(u
'ERROR: unable to extract video title')
1500 video_title
= mobj
.group(1).decode('utf-8')
1501 simple_title
= re
.sub(ur
'(?u)([^%s]+)' % simple_title_chars
, ur
'_', video_title
)
1503 mobj
= re
.search(r
'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage
)
1505 self
._downloader
.trouble(u
'ERROR: unable to extract video uploader')
1507 video_uploader
= mobj
.group(1).decode('utf-8')
1509 # Extract video thumbnail
1510 mobj
= re
.search(r
'<link rel="image_src" href="(.*)" />', webpage
)
1512 self
._downloader
.trouble(u
'ERROR: unable to extract video thumbnail')
1514 video_thumbnail
= mobj
.group(1).decode('utf-8')
1516 # Extract video description
1517 mobj
= re
.search(r
'<meta name="description" content="(.*)" />', webpage
)
1519 self
._downloader
.trouble(u
'ERROR: unable to extract video description')
1521 video_description
= mobj
.group(1).decode('utf-8')
1522 if not video_description
: video_description
= 'No description available.'
1524 # Extract video height and width
1525 mobj
= re
.search(r
'<meta name="video_height" content="([0-9]+)" />', webpage
)
1527 self
._downloader
.trouble(u
'ERROR: unable to extract video height')
1529 yv_video_height
= mobj
.group(1)
1531 mobj
= re
.search(r
'<meta name="video_width" content="([0-9]+)" />', webpage
)
1533 self
._downloader
.trouble(u
'ERROR: unable to extract video width')
1535 yv_video_width
= mobj
.group(1)
1537 # Retrieve video playlist to extract media URL
1538 # I'm not completely sure what all these options are, but we
1539 # seem to need most of them, otherwise the server sends a 401.
1540 yv_lg
= 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1541 yv_bitrate
= '700' # according to Wikipedia this is hard-coded
1542 request
= urllib2
.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id
+
1543 '&tech=flash&mode=playlist&lg=' + yv_lg
+ '&bitrate=' + yv_bitrate
+ '&vidH=' + yv_video_height
+
1544 '&vidW=' + yv_video_width
+ '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1546 self
.report_download_webpage(video_id
)
1547 webpage
= urllib2
.urlopen(request
).read()
1548 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1549 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1552 # Extract media URL from playlist XML
1553 mobj
= re
.search(r
'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage
)
1555 self
._downloader
.trouble(u
'ERROR: Unable to extract media URL')
1557 video_url
= urllib
.unquote(mobj
.group(1) + mobj
.group(2)).decode('utf-8')
1558 video_url
= re
.sub(r
'(?u)&(.+?);', htmlentity_transform
, video_url
)
1561 # Process video information
1562 self
._downloader
.process_info({
1563 'id': video_id
.decode('utf-8'),
1565 'uploader': video_uploader
,
1566 'upload_date': u
'NA',
1567 'title': video_title
,
1568 'stitle': simple_title
,
1569 'ext': video_extension
.decode('utf-8'),
1570 'thumbnail': video_thumbnail
.decode('utf-8'),
1571 'description': video_description
,
1572 'thumbnail': video_thumbnail
,
1573 'description': video_description
,
1576 except UnavailableVideoError
:
1577 self
._downloader
.trouble(u
'ERROR: unable to download video')
1580 class GenericIE(InfoExtractor
):
1581 """Generic last-resort information extractor."""
1583 def __init__(self
, downloader
=None):
1584 InfoExtractor
.__init
__(self
, downloader
)
1590 def report_download_webpage(self
, video_id
):
1591 """Report webpage download."""
1592 self
._downloader
.to_screen(u
'WARNING: Falling back on generic information extractor.')
1593 self
._downloader
.to_screen(u
'[generic] %s: Downloading webpage' % video_id
)
1595 def report_extraction(self
, video_id
):
1596 """Report information extraction."""
1597 self
._downloader
.to_screen(u
'[generic] %s: Extracting information' % video_id
)
1599 def _real_initialize(self
):
1602 def _real_extract(self
, url
):
1603 # At this point we have a new video
1604 self
._downloader
.increment_downloads()
1606 video_id
= url
.split('/')[-1]
1607 request
= urllib2
.Request(url
)
1609 self
.report_download_webpage(video_id
)
1610 webpage
= urllib2
.urlopen(request
).read()
1611 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1612 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1614 except ValueError, err
:
1615 # since this is the last-resort InfoExtractor, if
1616 # this error is thrown, it'll be thrown here
1617 self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
)
1620 # Start with something easy: JW Player in SWFObject
1621 mobj
= re
.search(r
'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1623 # Broaden the search a little bit
1624 mobj = re.search(r'[^A
-Za
-z0
-9]?
(?
:file|source
)=(http
[^
\'"&]*)', webpage)
1626 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1629 # It's possible that one of the regexes
1630 # matched, but returned an empty group:
1631 if mobj.group(1) is None:
1632 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1635 video_url = urllib.unquote(mobj.group(1))
1636 video_id = os.path.basename(video_url)
1638 # here's a fun little line of code for you:
1639 video_extension = os.path.splitext(video_id)[1][1:]
1640 video_id = os.path.splitext(video_id)[0]
1642 # it's tempting to parse this further, but you would
1643 # have to take into account all the variations like
1644 # Video Title - Site Name
1645 # Site Name | Video Title
1646 # Video Title - Tagline | Site Name
1647 # and so on and so forth; it's just not practical
1648 mobj = re.search(r'<title>(.*)</title>', webpage)
1650 self._downloader.trouble(u'ERROR: unable to extract title')
1652 video_title = mobj.group(1).decode('utf-8')
1653 video_title = sanitize_title(video_title)
1654 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1656 # video uploader is domain name
1657 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1659 self._downloader.trouble(u'ERROR: unable to extract title')
1661 video_uploader = mobj.group(1).decode('utf-8')
1664 # Process video information
1665 self._downloader.process_info({
1666 'id': video_id.decode('utf-8'),
1667 'url': video_url.decode('utf-8'),
1668 'uploader': video_uploader,
1669 'upload_date': u'NA',
1670 'title': video_title,
1671 'stitle': simple_title,
1672 'ext': video_extension.decode('utf-8'),
1676 except UnavailableVideoError, err:
1677 self._downloader.trouble(u'ERROR: unable to download video')
1680 class YoutubeSearchIE(InfoExtractor):
1681 """Information Extractor for YouTube search queries."""
1682 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1683 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1684 _VIDEO_INDICATOR = r'href="/watch
\?v
=.+?
"'
1685 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1687 _max_youtube_results = 1000
1689 def __init__(self, youtube_ie, downloader=None):
1690 InfoExtractor.__init__(self, downloader)
1691 self._youtube_ie = youtube_ie
1695 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1697 def report_download_page(self, query, pagenum):
1698 """Report attempt to download playlist page with given number."""
1699 query = query.decode(preferredencoding())
1700 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1702 def _real_initialize(self):
1703 self._youtube_ie.initialize()
1705 def _real_extract(self, query):
1706 mobj = re.match(self._VALID_QUERY, query)
1708 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1711 prefix, query = query.split(':')
1713 query = query.encode('utf-8')
1715 self._download_n_results(query, 1)
1717 elif prefix == 'all':
1718 self._download_n_results(query, self._max_youtube_results)
1724 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1726 elif n > self._max_youtube_results:
1727 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1728 n = self._max_youtube_results
1729 self._download_n_results(query, n)
1731 except ValueError: # parsing prefix as integer fails
1732 self._download_n_results(query, 1)
1735 def _download_n_results(self, query, n):
1736 """Downloads a specified number of results for a query"""
1739 already_seen = set()
1743 self.report_download_page(query, pagenum)
1744 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1745 request = urllib2.Request(result_url, None, std_headers)
1747 page = urllib2.urlopen(request).read()
1748 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1749 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1752 # Extract video identifiers
1753 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1754 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1755 if video_id not in already_seen:
1756 video_ids.append(video_id)
1757 already_seen.add(video_id)
1758 if len(video_ids) == n:
1759 # Specified n videos reached
1760 for id in video_ids:
1761 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1764 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1765 for id in video_ids:
1766 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1769 pagenum = pagenum + 1
1771 class GoogleSearchIE(InfoExtractor):
1772 """Information Extractor for Google Video search queries."""
1773 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1774 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1775 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1776 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1778 _max_google_results = 1000
1780 def __init__(self, google_ie, downloader=None):
1781 InfoExtractor.__init__(self, downloader)
1782 self._google_ie = google_ie
1786 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1788 def report_download_page(self, query, pagenum):
1789 """Report attempt to download playlist page with given number."""
1790 query = query.decode(preferredencoding())
1791 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1793 def _real_initialize(self):
1794 self._google_ie.initialize()
1796 def _real_extract(self, query):
1797 mobj = re.match(self._VALID_QUERY, query)
1799 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1802 prefix, query = query.split(':')
1804 query = query.encode('utf-8')
1806 self._download_n_results(query, 1)
1808 elif prefix == 'all':
1809 self._download_n_results(query, self._max_google_results)
1815 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1817 elif n > self._max_google_results:
1818 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1819 n = self._max_google_results
1820 self._download_n_results(query, n)
1822 except ValueError: # parsing prefix as integer fails
1823 self._download_n_results(query, 1)
1826 def _download_n_results(self, query, n):
1827 """Downloads a specified number of results for a query"""
1830 already_seen = set()
1834 self.report_download_page(query, pagenum)
1835 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1836 request = urllib2.Request(result_url, None, std_headers)
1838 page = urllib2.urlopen(request).read()
1839 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1840 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1843 # Extract video identifiers
1844 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1845 video_id = mobj.group(1)
1846 if video_id not in already_seen:
1847 video_ids.append(video_id)
1848 already_seen.add(video_id)
1849 if len(video_ids) == n:
1850 # Specified n videos reached
1851 for id in video_ids:
1852 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1855 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1856 for id in video_ids:
1857 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1860 pagenum = pagenum + 1
1862 class YahooSearchIE(InfoExtractor):
1863 """Information Extractor for Yahoo! Video search queries."""
1864 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1865 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1866 _VIDEO_INDICATOR = r'href="http
://video\
.yahoo\
.com
/watch
/([0-9]+/[0-9]+)"'
1867 _MORE_PAGES_INDICATOR = r'\s*Next'
1869 _max_yahoo_results = 1000
1871 def __init__(self, yahoo_ie, downloader=None):
1872 InfoExtractor.__init__(self, downloader)
1873 self._yahoo_ie = yahoo_ie
1877 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1879 def report_download_page(self, query, pagenum):
1880 """Report attempt to download playlist page with given number."""
1881 query = query.decode(preferredencoding())
1882 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1884 def _real_initialize(self):
1885 self._yahoo_ie.initialize()
1887 def _real_extract(self, query):
1888 mobj = re.match(self._VALID_QUERY, query)
1890 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1893 prefix, query = query.split(':')
1895 query = query.encode('utf-8')
1897 self._download_n_results(query, 1)
1899 elif prefix == 'all':
1900 self._download_n_results(query, self._max_yahoo_results)
1906 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1908 elif n > self._max_yahoo_results:
1909 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1910 n = self._max_yahoo_results
1911 self._download_n_results(query, n)
1913 except ValueError: # parsing prefix as integer fails
1914 self._download_n_results(query, 1)
1917 def _download_n_results(self, query, n):
1918 """Downloads a specified number of results for a query"""
1921 already_seen = set()
1925 self.report_download_page(query, pagenum)
1926 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1927 request = urllib2.Request(result_url, None, std_headers)
1929 page = urllib2.urlopen(request).read()
1930 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1931 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1934 # Extract video identifiers
1935 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1936 video_id = mobj.group(1)
1937 if video_id not in already_seen:
1938 video_ids.append(video_id)
1939 already_seen.add(video_id)
1940 if len(video_ids) == n:
1941 # Specified n videos reached
1942 for id in video_ids:
1943 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1946 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1947 for id in video_ids:
1948 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1951 pagenum = pagenum + 1
1953 class YoutubePlaylistIE(InfoExtractor):
1954 """Information Extractor for YouTube playlists."""
1956 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1957 _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1958 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1959 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1962 def __init__(self, youtube_ie, downloader=None):
1963 InfoExtractor.__init__(self, downloader)
1964 self._youtube_ie = youtube_ie
1968 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1970 def report_download_page(self, playlist_id, pagenum):
1971 """Report attempt to download playlist page with given number."""
1972 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1974 def _real_initialize(self):
1975 self._youtube_ie.initialize()
1977 def _real_extract(self, url):
1978 # Extract playlist id
1979 mobj = re.match(self._VALID_URL, url)
1981 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1984 # Download playlist pages
1985 playlist_id = mobj.group(1)
1990 self.report_download_page(playlist_id, pagenum)
1991 request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1993 page = urllib2.urlopen(request).read()
1994 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1995 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1998 # Extract video identifiers
2000 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2001 if mobj.group(1) not in ids_in_page:
2002 ids_in_page.append(mobj.group(1))
2003 video_ids.extend(ids_in_page)
2005 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2007 pagenum = pagenum + 1
2009 playliststart = self._downloader.params.get('playliststart', 1) - 1
2010 playlistend = self._downloader.params.get('playlistend', -1)
2011 video_ids = video_ids[playliststart:playlistend]
2013 for id in video_ids:
2014 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2017 class YoutubeUserIE(InfoExtractor):
2018 """Information Extractor for YouTube users."""
2020 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
2021 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2022 _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
2025 def __init__(self, youtube_ie, downloader=None):
2026 InfoExtractor.__init__(self, downloader)
2027 self._youtube_ie = youtube_ie
2031 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2033 def report_download_page(self, username):
2034 """Report attempt to download user page."""
2035 self._downloader.to_screen(u'[youtube] user %s: Downloading page ' % (username))
2037 def _real_initialize(self):
2038 self._youtube_ie.initialize()
2040 def _real_extract(self, url):
2042 mobj = re.match(self._VALID_URL, url)
2044 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2047 # Download user page
2048 username = mobj.group(1)
2052 self.report_download_page(username)
2053 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
2055 page = urllib2.urlopen(request).read()
2056 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2057 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2060 # Extract video identifiers
2063 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2064 if mobj.group(1) not in ids_in_page:
2065 ids_in_page.append(mobj.group(1))
2066 video_ids.extend(ids_in_page)
2068 playliststart = self._downloader.params.get('playliststart', 1) - 1
2069 playlistend = self._downloader.params.get('playlistend', -1)
2070 video_ids = video_ids[playliststart:playlistend]
2072 for id in video_ids:
2073 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
class PostProcessor(object):
    """Base class for post-download processing steps.

    A downloader keeps a chain of PostProcessor objects, registered through
    its add_post_processor() method. Once a download has finished
    successfully, the downloader calls run() on each of them in turn: the
    first one receives an initial argument and every later one receives
    whatever the previous run() call returned.

    The chain stops as soon as a run() call returns None, or when its end
    is reached.

    Like InfoExtractor objects, PostProcessor objects follow a "mutual
    registration" process: the processor also keeps a reference to the
    downloader it has been given.
    """

    _downloader = None

    def __init__(self, downloader=None):
        self._downloader = downloader

    def set_downloader(self, downloader):
        """Attach this PP to the given downloader."""
        self._downloader = downloader

    def run(self, information):
        """Process one finished download and return the data to pass on.

        "information" is a dictionary like the ones composed by
        InfoExtractors, with one extra field called "filepath".

        Returning None stops the postprocessing chain. Returning a
        dictionary (possibly the received one with some fields changed)
        passes it to the next object in the chain.

        This method may also raise a PostProcessingError exception, which
        the downloader takes into account.
        """
        return information  # the base implementation changes nothing
### MAIN PROGRAM ###
if __name__ == '__main__':
    try:
        # Modules needed only when running the main program
        import getpass
        import optparse

        def update_self(downloader, filename):
            """Replace the program file with the latest stable version.

            The name of the latest version is read from the LATEST_VERSION
            file of the github repository, then the matching program file is
            downloaded and written over filename. The downloader is only
            used to print progress messages. Exits the program if filename
            is not writable.
            """
            if not os.access(filename, os.W_OK):
                sys.exit('ERROR: no write permissions on %s' % filename)

            downloader.to_screen('Updating to latest stable version...')
            latest_url = 'http://github.com/rg3/youtube-dl/raw/master/LATEST_VERSION'
            latest_version = urllib.urlopen(latest_url).read().strip()
            prog_url = 'http://github.com/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
            # NOTE(review): the new program is fetched over plain HTTP and
            # written without any integrity check.
            newcontent = urllib.urlopen(prog_url).read()
            # Binary mode writes the downloaded bytes verbatim; the finally
            # clause closes the file even if the write fails.
            stream = open(filename, 'wb')
            try:
                stream.write(newcontent)
            finally:
                stream.close()
            downloader.to_screen('Updated to version %s' % latest_version)

        # Parse command line
        parser = optparse.OptionParser(
            usage='Usage: %prog [options] url...',
            version='2010.11.19',
            conflict_handler='resolve',
        )

        parser.add_option('-h', '--help',
                action='help', help='print this help text and exit')
        parser.add_option('-v', '--version',
                action='version', help='print program version and exit')
        parser.add_option('-U', '--update',
                action='store_true', dest='update_self', help='update this program to latest stable version')
        parser.add_option('-i', '--ignore-errors',
                action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
        parser.add_option('-r', '--rate-limit',
                dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
        parser.add_option('-R', '--retries',
                dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
        parser.add_option('--playlist-start',
                dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
        parser.add_option('--playlist-end',
                dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)

        authentication = optparse.OptionGroup(parser, 'Authentication Options')
        authentication.add_option('-u', '--username',
                dest='username', metavar='USERNAME', help='account username')
        authentication.add_option('-p', '--password',
                dest='password', metavar='PASSWORD', help='account password')
        authentication.add_option('-n', '--netrc',
                action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
        parser.add_option_group(authentication)

        video_format = optparse.OptionGroup(parser, 'Video Format Options')
        video_format.add_option('-f', '--format',
                action='store', dest='format', metavar='FORMAT', help='video format code')
        video_format.add_option('-m', '--mobile-version',
                action='store_const', dest='format', help='alias for -f 17', const='17')
        video_format.add_option('--all-formats',
                action='store_const', dest='format', help='download all available video formats', const='-1')
        video_format.add_option('--max-quality',
                action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')
        video_format.add_option('-b', '--best-quality',
                action='store_true', dest='bestquality', help='download the best video quality (DEPRECATED)')
        parser.add_option_group(video_format)

        verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
        verbosity.add_option('-q', '--quiet',
                action='store_true', dest='quiet', help='activates quiet mode', default=False)
        verbosity.add_option('-s', '--simulate',
                action='store_true', dest='simulate', help='do not download video', default=False)
        verbosity.add_option('-g', '--get-url',
                action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
        verbosity.add_option('-e', '--get-title',
                action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
        verbosity.add_option('--get-thumbnail',
                action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
        verbosity.add_option('--get-description',
                action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
        verbosity.add_option('--no-progress',
                action='store_true', dest='noprogress', help='do not print progress bar', default=False)
        parser.add_option_group(verbosity)

        filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
        filesystem.add_option('-t', '--title',
                action='store_true', dest='usetitle', help='use title in file name', default=False)
        filesystem.add_option('-l', '--literal',
                action='store_true', dest='useliteral', help='use literal title in file name', default=False)
        filesystem.add_option('-A', '--auto-number',
                action='store_true', dest='autonumber', help='number downloaded files starting from 00000', default=False)
        filesystem.add_option('-o', '--output',
                dest='outtmpl', metavar='TEMPLATE', help='output filename template')
        filesystem.add_option('-a', '--batch-file',
                dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
        filesystem.add_option('-w', '--no-overwrites',
                action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
        filesystem.add_option('-c', '--continue',
                action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
        filesystem.add_option('--cookies',
                dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
        parser.add_option_group(filesystem)

        (opts, args) = parser.parse_args()

        # Open appropriate CookieJar: in-memory only, or backed by the
        # requested file and preloaded from it when it is readable
        if opts.cookiefile is None:
            jar = cookielib.CookieJar()
        else:
            try:
                jar = cookielib.MozillaCookieJar(opts.cookiefile)
                if os.path.isfile(opts.cookiefile) and os.access(opts.cookiefile, os.R_OK):
                    jar.load()
            except (IOError, OSError):
                sys.exit(u'ERROR: unable to open cookie file')

        # General configuration
        cookie_processor = urllib2.HTTPCookieProcessor(jar)
        # A single opener carries both handlers: install_opener() replaces
        # the previously installed opener, so installing two openers in a
        # row would discard the first one.
        urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor))
        socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

        # Batch file verification
        batchurls = []
        if opts.batchfile is not None:
            try:
                if opts.batchfile == '-':
                    batchfd = sys.stdin
                else:
                    batchfd = open(opts.batchfile, 'r')
                try:
                    batchurls = [x.strip() for x in batchfd.readlines()]
                finally:
                    # Close what was opened here, but leave stdin alone
                    if batchfd is not sys.stdin:
                        batchfd.close()
                # Skip blank lines and lines starting with '#', '/' or ';'
                batchurls = [x for x in batchurls if x and not re.search(r'^[#/;]', x)]
            except IOError:
                sys.exit(u'ERROR: batch file could not be read')
        all_urls = batchurls + args

        # Conflicting, missing and erroneous options
        if opts.bestquality:
            print >>sys.stderr, u'\nWARNING: -b/--best-quality IS DEPRECATED AS IT IS THE DEFAULT BEHAVIOR NOW\n'
        if opts.usenetrc and (opts.username is not None or opts.password is not None):
            parser.error(u'using .netrc conflicts with giving username/password')
        if opts.password is not None and opts.username is None:
            parser.error(u'account username missing')
        if opts.outtmpl is not None and (opts.useliteral or opts.usetitle or opts.autonumber):
            parser.error(u'using output template conflicts with using title, literal title or auto number')
        if opts.usetitle and opts.useliteral:
            parser.error(u'using title conflicts with using literal title')
        if opts.username is not None and opts.password is None:
            opts.password = getpass.getpass(u'Type account password and press return:')
        if opts.ratelimit is not None:
            numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
            if numeric_limit is None:
                parser.error(u'invalid rate limit specified')
            opts.ratelimit = numeric_limit
        if opts.retries is not None:
            try:
                opts.retries = long(opts.retries)
            except (TypeError, ValueError):
                parser.error(u'invalid retry count specified')
        try:
            opts.playliststart = long(opts.playliststart)
            if opts.playliststart <= 0:
                raise ValueError
        except (TypeError, ValueError):
            parser.error(u'invalid playlist start number specified')
        try:
            opts.playlistend = long(opts.playlistend)
            # -1 stands for "up to the last video"; any other value must be
            # positive and not lie before the start
            if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
                raise ValueError
        except (TypeError, ValueError):
            parser.error(u'invalid playlist end number specified')

        # Information extractors
        youtube_ie = YoutubeIE()
        metacafe_ie = MetacafeIE(youtube_ie)
        dailymotion_ie = DailymotionIE()
        youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
        youtube_user_ie = YoutubeUserIE(youtube_ie)
        youtube_search_ie = YoutubeSearchIE(youtube_ie)
        google_ie = GoogleIE()
        google_search_ie = GoogleSearchIE(google_ie)
        photobucket_ie = PhotobucketIE()
        yahoo_ie = YahooIE()
        yahoo_search_ie = YahooSearchIE(yahoo_ie)
        generic_ie = GenericIE()

        # File downloader
        fd = FileDownloader({
            'usenetrc': opts.usenetrc,
            'username': opts.username,
            'password': opts.password,
            'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
            'forceurl': opts.geturl,
            'forcetitle': opts.gettitle,
            'forcethumbnail': opts.getthumbnail,
            'forcedescription': opts.getdescription,
            'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
            'format': opts.format,
            'format_limit': opts.format_limit,
            # The first alternative that applies wins: an explicit template,
            # then the "all formats" templates, then title/number variants
            'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
                or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
                or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
                or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
                or (opts.usetitle and opts.autonumber and u'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
                or (opts.useliteral and opts.autonumber and u'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
                or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
                or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
                or (opts.autonumber and u'%(autonumber)s-%(id)s.%(ext)s')
                or u'%(id)s.%(ext)s'),
            'ignoreerrors': opts.ignoreerrors,
            'ratelimit': opts.ratelimit,
            'nooverwrites': opts.nooverwrites,
            'retries': opts.retries,
            'continuedl': opts.continue_dl,
            'noprogress': opts.noprogress,
            'playliststart': opts.playliststart,
            'playlistend': opts.playlistend,
            'logtostderr': opts.outtmpl == '-',
            })
        fd.add_info_extractor(youtube_search_ie)
        fd.add_info_extractor(youtube_pl_ie)
        fd.add_info_extractor(youtube_user_ie)
        fd.add_info_extractor(metacafe_ie)
        fd.add_info_extractor(dailymotion_ie)
        fd.add_info_extractor(youtube_ie)
        fd.add_info_extractor(google_ie)
        fd.add_info_extractor(google_search_ie)
        fd.add_info_extractor(photobucket_ie)
        fd.add_info_extractor(yahoo_ie)
        fd.add_info_extractor(yahoo_search_ie)

        # This must come last since it's the
        # fallback if none of the others work
        fd.add_info_extractor(generic_ie)

        # Update version
        if opts.update_self:
            update_self(fd, sys.argv[0])

        # Maybe do nothing: an update request needs no URL
        if len(all_urls) < 1:
            if not opts.update_self:
                parser.error(u'you must provide at least one URL')
            else:
                sys.exit()
        retcode = fd.download(all_urls)

        # Dump cookie jar if requested
        if opts.cookiefile is not None:
            try:
                jar.save()
            except (IOError, OSError):
                sys.exit(u'ERROR: unable to save cookie jar')

        sys.exit(retcode)

    except DownloadError:
        sys.exit(1)
    except SameFileError:
        sys.exit(u'ERROR: fixed output name but more than one file to download')
    except KeyboardInterrupt:
        sys.exit(u'\nERROR: Interrupted by user')