2 # -*- coding: utf-8 -*-
5 'Ricardo Garcia Gonzalez',
13 'Philipp Hagemeister',
# Module metadata: youtube-dl is released into the public domain.
__license__ = 'Public Domain'
__version__ = '2011.09.14'

# Location of the latest released script, polled by the self-update mechanism.
UPDATE_URL = 'https://raw.github.com/rg3/youtube-dl/master/youtube-dl'
48 except ImportError: # Python 2.4
51 import cStringIO
as StringIO
55 # parse_qs was moved from the cgi module to the urlparse module recently.
57 from urlparse
import parse_qs
59 from cgi
import parse_qs
67 import xml
.etree
.ElementTree
68 except ImportError: # Python<2.5
69 pass # Not officially supported, but let it slip
	# Entries of the default request headers dict (std_headers); the
	# surrounding braces fall outside this view. Mimics a desktop Firefox 5
	# so scraped sites serve their normal HTML.
	'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
	'Accept-Encoding': 'gzip, deflate',
	'Accept-Language': 'en-us,en;q=0.5',
# Characters that are safe to keep in a "simplified" title: ASCII letters
# and digits, as a unicode string (Python 2 str.decode -> unicode).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
83 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
89 def raiseError(msg
, i
):
90 raise ValueError(msg
+ ' at position ' + str(i
) + ' of ' + repr(s
) + ': ' + repr(s
[i
:]))
91 def skipSpace(i
, expectMore
=True):
92 while i
< len(s
) and s
[i
] in ' \t\r\n':
96 raiseError('Premature end', i
)
98 def decodeEscape(match
):
114 return unichr(int(esc
[1:5], 16))
115 if len(esc
) == 5+6 and esc
[5:7] == '\\u':
116 hi
= int(esc
[1:5], 16)
117 low
= int(esc
[7:11], 16)
118 return unichr((hi
- 0xd800) * 0x400 + low
- 0xdc00 + 0x10000)
119 raise ValueError('Unknown escape ' + str(esc
))
126 while s
[e
-bslashes
-1] == '\\':
128 if bslashes
% 2 == 1:
132 rexp
= re
.compile(r
'\\(u[dD][89aAbB][0-9a-fA-F]{2}\\u[0-9a-fA-F]{4}|u[0-9a-fA-F]{4}|.|$)')
133 stri
= rexp
.sub(decodeEscape
, s
[i
:e
])
139 if s
[i
] == '}': # Empty dictionary
143 raiseError('Expected a string object key', i
)
144 i
,key
= parseString(i
)
146 if i
>= len(s
) or s
[i
] != ':':
147 raiseError('Expected a colon', i
)
154 raiseError('Expected comma or closing curly brace', i
)
159 if s
[i
] == ']': # Empty array
164 i
= skipSpace(i
) # Raise exception if premature end
168 raiseError('Expected a comma or closing bracket', i
)
170 def parseDiscrete(i
):
171 for k
,v
in {'true': True, 'false': False, 'null': None}.items():
172 if s
.startswith(k
, i
):
174 raiseError('Not a boolean (or null)', i
)
176 mobj
= re
.match('^(-?(0|[1-9][0-9]*)(\.[0-9]*)?([eE][+-]?[0-9]+)?)', s
[i
:])
178 raiseError('Not a number', i
)
180 if '.' in nums
or 'e' in nums
or 'E' in nums
:
181 return (i
+len(nums
), float(nums
))
182 return (i
+len(nums
), int(nums
))
183 CHARMAP
= {'{': parseObj
, '[': parseArray
, '"': parseString
, 't': parseDiscrete
, 'f': parseDiscrete
, 'n': parseDiscrete
}
186 i
,res
= CHARMAP
.get(s
[i
], parseNumber
)(i
)
187 i
= skipSpace(i
, False)
191 raise ValueError('Extra data at end of input (index ' + str(i
) + ' of ' + repr(s
) + ': ' + repr(s
[i
:]) + ')')
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	# Probe the locale's preferred encoding once; fall back to UTF-8 when it
	# is unusable (e.g. a misconfigured locale that cannot encode anything).
	# The original wrapped this in a one-shot generator consumed with the
	# Python-2-only .next(); the indirection added nothing, so it is flattened.
	try:
		pref = locale.getpreferredencoding()
		u'TEST'.encode(pref)
	except Exception:
		pref = 'UTF-8'
	return pref
211 def htmlentity_transform(matchobj
):
212 """Transforms an HTML entity to a Unicode character.
214 This function receives a match object and is intended to be used with
215 the re.sub() function.
217 entity
= matchobj
.group(1)
219 # Known non-numeric HTML entity
220 if entity
in htmlentitydefs
.name2codepoint
:
221 return unichr(htmlentitydefs
.name2codepoint
[entity
])
224 mobj
= re
.match(ur
'(?u)#(x?\d+)', entity
)
226 numstr
= mobj
.group(1)
227 if numstr
.startswith(u
'x'):
229 numstr
= u
'0%s' % numstr
232 return unichr(long(numstr
, base
))
234 # Unknown entity in name, return its literal representation
235 return (u
'&%s;' % entity
)
238 def sanitize_title(utitle
):
239 """Sanitizes a video title so it could be used as part of a filename."""
240 utitle
= re
.sub(ur
'(?u)&(.+?);', htmlentity_transform
, utitle
)
241 return utitle
.replace(unicode(os
.sep
), u
'%')
244 def sanitize_open(filename
, open_mode
):
245 """Try to open the given filename, and slightly tweak it if this fails.
247 Attempts to open the given filename. If this fails, it tries to change
248 the filename slightly, step by step, until it's either able to open it
249 or it fails and raises a final exception, like the standard open()
252 It returns the tuple (stream, definitive_file_name).
256 if sys
.platform
== 'win32':
258 msvcrt
.setmode(sys
.stdout
.fileno(), os
.O_BINARY
)
259 return (sys
.stdout
, filename
)
260 stream
= open(filename
, open_mode
)
261 return (stream
, filename
)
262 except (IOError, OSError), err
:
263 # In case of error, try to remove win32 forbidden chars
264 filename
= re
.sub(ur
'[/<>:"\|\?\*]', u
'#', filename
)
266 # An exception here should be caught in the caller
267 stream
= open(filename
, open_mode
)
268 return (stream
, filename
)
def timeconvert(timestr):
	"""Convert RFC 2822 defined time string into system timestamp.

	Returns the POSIX timestamp as a number, or None when *timestr*
	cannot be parsed as an RFC 2822 date.
	"""
	# The mangled source lost the None initializer and the final return;
	# without them an unparsable date raised NameError instead of
	# returning None. Restored.
	timestamp = None
	timetuple = email.utils.parsedate_tz(timestr)
	if timetuple is not None:
		timestamp = email.utils.mktime_tz(timetuple)
	return timestamp
class DownloadError(Exception):
	"""Download Error exception.

	This exception may be thrown by FileDownloader objects if they are not
	configured to continue on errors. They will contain the appropriate
	error message.
	"""
class SameFileError(Exception):
	"""Same File exception.

	This exception will be thrown by FileDownloader objects if they detect
	multiple files would have to be downloaded to the same file on disk.
	"""
class PostProcessingError(Exception):
	"""Post Processing exception.

	This exception may be raised by PostProcessor's .run() method to
	indicate an error in the postprocessing task.
	"""
class UnavailableVideoError(Exception):
	"""Unavailable Format exception.

	This exception will be thrown when a video is requested
	in a format that is not available for that video.
	"""
class ContentTooShortError(Exception):
	"""Content Too Short exception.

	This exception may be raised by FileDownloader objects when a file they
	download is too small for what the server announced first, indicating
	the connection was probably interrupted.
	"""

	def __init__(self, downloaded, expected):
		# downloaded: number of bytes actually received
		# expected:   number of bytes the server announced (Content-Length)
		self.downloaded = downloaded
		self.expected = expected
class YoutubeDLHandler(urllib2.HTTPHandler):
	"""Handler for HTTP requests and responses.

	This class, when installed with an OpenerDirector, automatically adds
	the standard headers to every HTTP request and handles gzipped and
	deflated responses from web servers. If compression is to be avoided in
	a particular request, the original request in the program code only has
	to include the HTTP header "Youtubedl-No-Compression", which will be
	removed before making the real request.

	Part of this code was copied from:

	http://techknack.net/python-urllib2-handlers/

	Andrew Rowls, the author of that code, agreed to release it to the
	public domain.
	"""
354 return zlib
.decompress(data
, -zlib
.MAX_WBITS
)
356 return zlib
.decompress(data
)
359 def addinfourl_wrapper(stream
, headers
, url
, code
):
360 if hasattr(urllib2
.addinfourl
, 'getcode'):
361 return urllib2
.addinfourl(stream
, headers
, url
, code
)
362 ret
= urllib2
.addinfourl(stream
, headers
, url
)
366 def http_request(self
, req
):
367 for h
in std_headers
:
370 req
.add_header(h
, std_headers
[h
])
371 if 'Youtubedl-no-compression' in req
.headers
:
372 if 'Accept-encoding' in req
.headers
:
373 del req
.headers
['Accept-encoding']
374 del req
.headers
['Youtubedl-no-compression']
377 def http_response(self
, req
, resp
):
380 if resp
.headers
.get('Content-encoding', '') == 'gzip':
381 gz
= gzip
.GzipFile(fileobj
=StringIO
.StringIO(resp
.read()), mode
='r')
382 resp
= self
.addinfourl_wrapper(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
383 resp
.msg
= old_resp
.msg
385 if resp
.headers
.get('Content-encoding', '') == 'deflate':
386 gz
= StringIO
.StringIO(self
.deflate(resp
.read()))
387 resp
= self
.addinfourl_wrapper(gz
, old_resp
.headers
, old_resp
.url
, old_resp
.code
)
388 resp
.msg
= old_resp
.msg
class FileDownloader(object):
	"""File Downloader class.

	File downloader objects are the ones responsible of downloading the
	actual video file and writing it to disk if the user has requested
	it, among some other tasks. In most cases there should be one per
	program. As, given a video URL, the downloader doesn't know how to
	extract all the needed information, task that InfoExtractors do, it
	has to pass the URL to one of them.

	For this, file downloader objects have a method that allows
	InfoExtractors to be registered in a given order. When it is passed
	a URL, the file downloader handles it to the first InfoExtractor it
	finds that reports being able to handle it. The InfoExtractor extracts
	all the information about the video or videos the URL refers to, and
	asks the FileDownloader to process the video information, possibly
	downloading the video.

	File downloaders accept a lot of parameters. In order not to saturate
	the object constructor with arguments, it receives a dictionary of
	options instead. These options are available through the params
	attribute for the InfoExtractors to use. The FileDownloader also
	registers itself as the downloader in charge for the InfoExtractors
	that are added to it, so this is a "mutual registration".

	Available options:

	username:         Username for authentication purposes.
	password:         Password for authentication purposes.
	usenetrc:         Use netrc for authentication instead.
	quiet:            Do not print messages to stdout.
	forceurl:         Force printing final URL.
	forcetitle:       Force printing title.
	forcethumbnail:   Force printing thumbnail URL.
	forcedescription: Force printing description.
	forcefilename:    Force printing final filename.
	simulate:         Do not download the video files.
	format:           Video format code.
	format_limit:     Highest quality format to try.
	outtmpl:          Template for output names.
	ignoreerrors:     Do not stop on download errors.
	ratelimit:        Download speed limit, in bytes/sec.
	nooverwrites:     Prevent overwriting files.
	retries:          Number of times to retry for HTTP error 5xx
	continuedl:       Try to continue downloads if possible.
	noprogress:       Do not print the progress bar.
	playliststart:    Playlist item to start at.
	playlistend:      Playlist item to end at.
	logtostderr:      Log messages to stderr instead of stdout.
	consoletitle:     Display progress in console window's titlebar.
	nopart:           Do not use temporary .part files.
	updatetime:       Use the Last-modified header to set output file timestamps.
	writedescription: Write the video description to a .description file
	writeinfojson:    Write the video description to a .info.json file
	"""

	# Per-run state; set to real values in __init__ (outside this span).
	_download_retcode = None   # exit code accumulated across downloads
	_num_downloads = None      # ordinal used by the %(autonumber)s template
455 def __init__(self
, params
):
456 """Create a FileDownloader object with the given options."""
459 self
._download
_retcode
= 0
460 self
._num
_downloads
= 0
461 self
._screen
_file
= [sys
.stdout
, sys
.stderr
][params
.get('logtostderr', False)]
465 def format_bytes(bytes):
468 if type(bytes) is str:
473 exponent
= long(math
.log(bytes, 1024.0))
474 suffix
= 'bkMGTPEZY'[exponent
]
475 converted
= float(bytes) / float(1024 ** exponent
)
476 return '%.2f%s' % (converted
, suffix
)
479 def calc_percent(byte_counter
, data_len
):
482 return '%6s' % ('%3.1f%%' % (float(byte_counter
) / float(data_len
) * 100.0))
485 def calc_eta(start
, now
, total
, current
):
489 if current
== 0 or dif
< 0.001: # One millisecond
491 rate
= float(current
) / dif
492 eta
= long((float(total
) - float(current
)) / rate
)
493 (eta_mins
, eta_secs
) = divmod(eta
, 60)
496 return '%02d:%02d' % (eta_mins
, eta_secs
)
499 def calc_speed(start
, now
, bytes):
501 if bytes == 0 or dif
< 0.001: # One millisecond
502 return '%10s' % '---b/s'
503 return '%10s' % ('%s/s' % FileDownloader
.format_bytes(float(bytes) / dif
))
506 def best_block_size(elapsed_time
, bytes):
507 new_min
= max(bytes / 2.0, 1.0)
508 new_max
= min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
509 if elapsed_time
< 0.001:
511 rate
= bytes / elapsed_time
519 def parse_bytes(bytestr
):
520 """Parse a string indicating a byte quantity into a long integer."""
521 matchobj
= re
.match(r
'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr
)
524 number
= float(matchobj
.group(1))
525 multiplier
= 1024.0 ** 'bkmgtpezy'.index(matchobj
.group(2).lower())
526 return long(round(number
* multiplier
))
528 def add_info_extractor(self
, ie
):
529 """Add an InfoExtractor object to the end of the list."""
531 ie
.set_downloader(self
)
533 def add_post_processor(self
, pp
):
534 """Add a PostProcessor object to the end of the chain."""
536 pp
.set_downloader(self
)
538 def to_screen(self
, message
, skip_eol
=False, ignore_encoding_errors
=False):
539 """Print message to stdout if not in quiet mode."""
541 if not self
.params
.get('quiet', False):
542 terminator
= [u
'\n', u
''][skip_eol
]
543 print >>self
._screen
_file
, (u
'%s%s' % (message
, terminator
)).encode(preferredencoding()),
544 self
._screen
_file
.flush()
545 except (UnicodeEncodeError), err
:
546 if not ignore_encoding_errors
:
549 def to_stderr(self
, message
):
550 """Print message to stderr."""
551 print >>sys
.stderr
, message
.encode(preferredencoding())
553 def to_cons_title(self
, message
):
554 """Set console/terminal window title to message."""
555 if not self
.params
.get('consoletitle', False):
557 if os
.name
== 'nt' and ctypes
.windll
.kernel32
.GetConsoleWindow():
558 # c_wchar_p() might not be necessary if `message` is
559 # already of type unicode()
560 ctypes
.windll
.kernel32
.SetConsoleTitleW(ctypes
.c_wchar_p(message
))
561 elif 'TERM' in os
.environ
:
562 sys
.stderr
.write('\033]0;%s\007' % message
.encode(preferredencoding()))
564 def fixed_template(self
):
565 """Checks if the output template is fixed."""
566 return (re
.search(ur
'(?u)%\(.+?\)s', self
.params
['outtmpl']) is None)
	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		"""
		if message is not None:
			self.to_stderr(message)
		if not self.params.get('ignoreerrors', False):
			raise DownloadError(message)
		# When errors are ignored, remember the failure for the exit code.
		self._download_retcode = 1
581 def slow_down(self
, start_time
, byte_counter
):
582 """Sleep if the download speed is over the rate limit."""
583 rate_limit
= self
.params
.get('ratelimit', None)
584 if rate_limit
is None or byte_counter
== 0:
587 elapsed
= now
- start_time
590 speed
= float(byte_counter
) / elapsed
591 if speed
> rate_limit
:
592 time
.sleep((byte_counter
- rate_limit
* (now
- start_time
)) / rate_limit
)
594 def temp_name(self
, filename
):
595 """Returns a temporary filename for the given filename."""
596 if self
.params
.get('nopart', False) or filename
== u
'-' or \
597 (os
.path
.exists(filename
) and not os
.path
.isfile(filename
)):
599 return filename
+ u
'.part'
601 def undo_temp_name(self
, filename
):
602 if filename
.endswith(u
'.part'):
603 return filename
[:-len(u
'.part')]
606 def try_rename(self
, old_filename
, new_filename
):
608 if old_filename
== new_filename
:
610 os
.rename(old_filename
, new_filename
)
611 except (IOError, OSError), err
:
612 self
.trouble(u
'ERROR: unable to rename file')
614 def try_utime(self
, filename
, last_modified_hdr
):
615 """Try to set the last-modified time of the given file."""
616 if last_modified_hdr
is None:
618 if not os
.path
.isfile(filename
):
620 timestr
= last_modified_hdr
623 filetime
= timeconvert(timestr
)
627 os
.utime(filename
, (time
.time(), filetime
))
631 def report_writedescription(self
, descfn
):
632 """ Report that the description file is being written """
633 self
.to_screen(u
'[info] Writing video description to: %s' % descfn
, ignore_encoding_errors
=True)
635 def report_writeinfojson(self
, infofn
):
636 """ Report that the metadata file has been written """
637 self
.to_screen(u
'[info] Video description metadata as JSON to: %s' % infofn
, ignore_encoding_errors
=True)
639 def report_destination(self
, filename
):
640 """Report destination filename."""
641 self
.to_screen(u
'[download] Destination: %s' % filename
, ignore_encoding_errors
=True)
643 def report_progress(self
, percent_str
, data_len_str
, speed_str
, eta_str
):
644 """Report download progress."""
645 if self
.params
.get('noprogress', False):
647 self
.to_screen(u
'\r[download] %s of %s at %s ETA %s' %
648 (percent_str
, data_len_str
, speed_str
, eta_str
), skip_eol
=True)
649 self
.to_cons_title(u
'youtube-dl - %s of %s at %s ETA %s' %
650 (percent_str
.strip(), data_len_str
.strip(), speed_str
.strip(), eta_str
.strip()))
652 def report_resuming_byte(self
, resume_len
):
653 """Report attempt to resume at given byte."""
654 self
.to_screen(u
'[download] Resuming download at byte %s' % resume_len
)
656 def report_retry(self
, count
, retries
):
657 """Report retry in case of HTTP error 5xx"""
658 self
.to_screen(u
'[download] Got server HTTP error. Retrying (attempt %d of %d)...' % (count
, retries
))
660 def report_file_already_downloaded(self
, file_name
):
661 """Report file has already been fully downloaded."""
663 self
.to_screen(u
'[download] %s has already been downloaded' % file_name
)
664 except (UnicodeEncodeError), err
:
665 self
.to_screen(u
'[download] The file has already been downloaded')
667 def report_unable_to_resume(self
):
668 """Report it was impossible to resume download."""
669 self
.to_screen(u
'[download] Unable to resume')
671 def report_finish(self
):
672 """Report download finished."""
673 if self
.params
.get('noprogress', False):
674 self
.to_screen(u
'[download] Download completed')
678 def increment_downloads(self
):
679 """Increment the ordinal that assigns a number to each file."""
680 self
._num
_downloads
+= 1
682 def prepare_filename(self
, info_dict
):
683 """Generate the output filename."""
685 template_dict
= dict(info_dict
)
686 template_dict
['epoch'] = unicode(long(time
.time()))
687 template_dict
['autonumber'] = unicode('%05d' % self
._num
_downloads
)
688 filename
= self
.params
['outtmpl'] % template_dict
690 except (ValueError, KeyError), err
:
691 self
.trouble(u
'ERROR: invalid system charset or erroneous output template')
694 def process_info(self
, info_dict
):
695 """Process a single dictionary returned by an InfoExtractor."""
696 filename
= self
.prepare_filename(info_dict
)
697 # Do nothing else if in simulate mode
698 if self
.params
.get('simulate', False):
700 if self
.params
.get('forcetitle', False):
701 print info_dict
['title'].encode(preferredencoding(), 'xmlcharrefreplace')
702 if self
.params
.get('forceurl', False):
703 print info_dict
['url'].encode(preferredencoding(), 'xmlcharrefreplace')
704 if self
.params
.get('forcethumbnail', False) and 'thumbnail' in info_dict
:
705 print info_dict
['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
706 if self
.params
.get('forcedescription', False) and 'description' in info_dict
:
707 print info_dict
['description'].encode(preferredencoding(), 'xmlcharrefreplace')
708 if self
.params
.get('forcefilename', False) and filename
is not None:
709 print filename
.encode(preferredencoding(), 'xmlcharrefreplace')
715 if self
.params
.get('nooverwrites', False) and os
.path
.exists(filename
):
716 self
.to_stderr(u
'WARNING: file exists and will be skipped')
720 dn
= os
.path
.dirname(filename
)
721 if dn
!= '' and not os
.path
.exists(dn
):
723 except (OSError, IOError), err
:
724 self
.trouble(u
'ERROR: unable to create directory ' + unicode(err
))
727 if self
.params
.get('writedescription', False):
729 descfn
= filename
+ '.description'
730 self
.report_writedescription(descfn
)
731 descfile
= open(descfn
, 'wb')
733 descfile
.write(info_dict
['description'].encode('utf-8'))
736 except (OSError, IOError):
737 self
.trouble(u
'ERROR: Cannot write description file ' + descfn
)
740 if self
.params
.get('writeinfojson', False):
741 infofn
= filename
+ '.info.json'
742 self
.report_writeinfojson(infofn
)
745 except (NameError,AttributeError):
746 self
.trouble(u
'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.')
749 infof
= open(infofn
, 'wb')
751 json
.dump(info_dict
, infof
)
754 except (OSError, IOError):
755 self
.trouble(u
'ERROR: Cannot write metadata to JSON file ' + infofn
)
759 success
= self
._do
_download
(filename
, info_dict
['url'].encode('utf-8'), info_dict
.get('player_url', None))
760 except (OSError, IOError), err
:
761 raise UnavailableVideoError
762 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
763 self
.trouble(u
'ERROR: unable to download video data: %s' % str(err
))
765 except (ContentTooShortError
, ), err
:
766 self
.trouble(u
'ERROR: content too short (expected %s bytes and served %s)' % (err
.expected
, err
.downloaded
))
771 self
.post_process(filename
, info_dict
)
772 except (PostProcessingError
), err
:
773 self
.trouble(u
'ERROR: postprocessing: %s' % str(err
))
776 def download(self
, url_list
):
777 """Download a given list of URLs."""
778 if len(url_list
) > 1 and self
.fixed_template():
779 raise SameFileError(self
.params
['outtmpl'])
782 suitable_found
= False
784 # Go to next InfoExtractor if not suitable
785 if not ie
.suitable(url
):
788 # Suitable InfoExtractor found
789 suitable_found
= True
791 # Extract information from URL and process it
794 # Suitable InfoExtractor had been found; go to next URL
797 if not suitable_found
:
798 self
.trouble(u
'ERROR: no suitable InfoExtractor: %s' % url
)
800 return self
._download
_retcode
802 def post_process(self
, filename
, ie_info
):
803 """Run the postprocessing chain on the given file."""
805 info
['filepath'] = filename
811 def _download_with_rtmpdump(self
, filename
, url
, player_url
):
812 self
.report_destination(filename
)
813 tmpfilename
= self
.temp_name(filename
)
815 # Check for rtmpdump first
817 subprocess
.call(['rtmpdump', '-h'], stdout
=(file(os
.path
.devnull
, 'w')), stderr
=subprocess
.STDOUT
)
818 except (OSError, IOError):
819 self
.trouble(u
'ERROR: RTMP download detected but "rtmpdump" could not be run')
822 # Download using rtmpdump. rtmpdump returns exit code 2 when
823 # the connection was interrumpted and resuming appears to be
824 # possible. This is part of rtmpdump's normal usage, AFAIK.
825 basic_args
= ['rtmpdump'] + [[], ['-W', player_url
]][player_url
is not None] + ['-r', url
, '-o', tmpfilename
]
826 retval
= subprocess
.call(basic_args
+ [[], ['-e', '-k', '1']][self
.params
.get('continuedl', False)])
827 while retval
== 2 or retval
== 1:
828 prevsize
= os
.path
.getsize(tmpfilename
)
829 self
.to_screen(u
'\r[rtmpdump] %s bytes' % prevsize
, skip_eol
=True)
830 time
.sleep(5.0) # This seems to be needed
831 retval
= subprocess
.call(basic_args
+ ['-e'] + [[], ['-k', '1']][retval
== 1])
832 cursize
= os
.path
.getsize(tmpfilename
)
833 if prevsize
== cursize
and retval
== 1:
836 self
.to_screen(u
'\r[rtmpdump] %s bytes' % os
.path
.getsize(tmpfilename
))
837 self
.try_rename(tmpfilename
, filename
)
840 self
.trouble(u
'\nERROR: rtmpdump exited with code %d' % retval
)
843 def _do_download(self
, filename
, url
, player_url
):
844 # Check file already present
845 if self
.params
.get('continuedl', False) and os
.path
.isfile(filename
) and not self
.params
.get('nopart', False):
846 self
.report_file_already_downloaded(filename
)
849 # Attempt to download using rtmpdump
850 if url
.startswith('rtmp'):
851 return self
._download
_with
_rtmpdump
(filename
, url
, player_url
)
853 tmpfilename
= self
.temp_name(filename
)
857 # Do not include the Accept-Encoding header
858 headers
= {'Youtubedl-no-compression': 'True'}
859 basic_request
= urllib2
.Request(url
, None, headers
)
860 request
= urllib2
.Request(url
, None, headers
)
862 # Establish possible resume length
863 if os
.path
.isfile(tmpfilename
):
864 resume_len
= os
.path
.getsize(tmpfilename
)
868 # Request parameters in case of being able to resume
869 if self
.params
.get('continuedl', False) and resume_len
!= 0:
870 self
.report_resuming_byte(resume_len
)
871 request
.add_header('Range', 'bytes=%d-' % resume_len
)
875 retries
= self
.params
.get('retries', 0)
876 while count
<= retries
:
877 # Establish connection
879 data
= urllib2
.urlopen(request
)
881 except (urllib2
.HTTPError
, ), err
:
882 if (err
.code
< 500 or err
.code
>= 600) and err
.code
!= 416:
883 # Unexpected HTTP error
885 elif err
.code
== 416:
886 # Unable to resume (requested range not satisfiable)
888 # Open the connection again without the range header
889 data
= urllib2
.urlopen(basic_request
)
890 content_length
= data
.info()['Content-Length']
891 except (urllib2
.HTTPError
, ), err
:
892 if err
.code
< 500 or err
.code
>= 600:
895 # Examine the reported length
896 if (content_length
is not None and
897 (resume_len
- 100 < long(content_length
) < resume_len
+ 100)):
898 # The file had already been fully downloaded.
899 # Explanation to the above condition: in issue #175 it was revealed that
900 # YouTube sometimes adds or removes a few bytes from the end of the file,
901 # changing the file size slightly and causing problems for some users. So
902 # I decided to implement a suggested change and consider the file
903 # completely downloaded if the file size differs less than 100 bytes from
904 # the one in the hard drive.
905 self
.report_file_already_downloaded(filename
)
906 self
.try_rename(tmpfilename
, filename
)
909 # The length does not match, we start the download over
910 self
.report_unable_to_resume()
916 self
.report_retry(count
, retries
)
919 self
.trouble(u
'ERROR: giving up after %s retries' % retries
)
922 data_len
= data
.info().get('Content-length', None)
923 if data_len
is not None:
924 data_len
= long(data_len
) + resume_len
925 data_len_str
= self
.format_bytes(data_len
)
926 byte_counter
= 0 + resume_len
932 data_block
= data
.read(block_size
)
934 if len(data_block
) == 0:
936 byte_counter
+= len(data_block
)
938 # Open file just in time
941 (stream
, tmpfilename
) = sanitize_open(tmpfilename
, open_mode
)
942 assert stream
is not None
943 filename
= self
.undo_temp_name(tmpfilename
)
944 self
.report_destination(filename
)
945 except (OSError, IOError), err
:
946 self
.trouble(u
'ERROR: unable to open for writing: %s' % str(err
))
949 stream
.write(data_block
)
950 except (IOError, OSError), err
:
951 self
.trouble(u
'\nERROR: unable to write data: %s' % str(err
))
953 block_size
= self
.best_block_size(after
- before
, len(data_block
))
956 percent_str
= self
.calc_percent(byte_counter
, data_len
)
957 eta_str
= self
.calc_eta(start
, time
.time(), data_len
- resume_len
, byte_counter
- resume_len
)
958 speed_str
= self
.calc_speed(start
, time
.time(), byte_counter
- resume_len
)
959 self
.report_progress(percent_str
, data_len_str
, speed_str
, eta_str
)
962 self
.slow_down(start
, byte_counter
- resume_len
)
965 self
.trouble(u
'\nERROR: Did not get any data blocks')
969 if data_len
is not None and byte_counter
!= data_len
:
970 raise ContentTooShortError(byte_counter
, long(data_len
))
971 self
.try_rename(tmpfilename
, filename
)
973 # Update file modification time
974 if self
.params
.get('updatetime', True):
975 self
.try_utime(filename
, data
.info().get('last-modified', None))
class InfoExtractor(object):
	"""Information Extractor class.

	Information extractors are the classes that, given a URL, extract
	information from the video (or videos) the URL refers to. This
	information includes the real video URL, the video title and simplified
	title, author and others. The information is stored in a dictionary
	which is then passed to the FileDownloader. The FileDownloader
	processes this information possibly downloading the video to the file
	system, among other possible outcomes. The dictionaries must include
	the following fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.
	player_url:     SWF Player URL (may be None).

	The following fields are optional. Their primary purpose is to allow
	youtube-dl to serve as the backend for a video search function, such
	as the one in youtube2mp3. They are only used when their respective
	forced printing functions are called:

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Subclasses of this one should re-define the _real_initialize() and
	_real_extract() methods, as well as the suitable() static method.
	Probably, they should also be instantiated and added to the main
	downloader.
	"""
1018 def __init__(self
, downloader
=None):
1019 """Constructor. Receives an optional downloader."""
1021 self
.set_downloader(downloader
)
1025 """Receives a URL and returns True if suitable for this IE."""
1028 def initialize(self
):
1029 """Initializes an instance (authentication, etc)."""
1031 self
._real
_initialize
()
1034 def extract(self
, url
):
1035 """Extracts URL information and returns it in list of dicts."""
1037 return self
._real
_extract
(url
)
	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		# Called by FileDownloader.add_info_extractor() as part of the
		# "mutual registration" between downloader and extractors.
		self._downloader = downloader
1043 def _real_initialize(self
):
1044 """Real initialization process. Redefine in subclasses."""
1047 def _real_extract(self
, url
):
1048 """Real extraction process. Redefine in subclasses."""
class YoutubeIE(InfoExtractor):
	"""Information extractor for youtube.com."""

	# Matches bare video IDs, youtu.be short links, /v/, /embed/, /e/ and
	# watch-page URLs; group 2 captures the video ID.
	_VALID_URL = r'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
	# hl=en/gl=US keep the site in English so the scraping stays stable.
	_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	# Machine name looked up in ~/.netrc when --netrc is used.
	_NETRC_MACHINE = 'youtube'
	# Listed in order of quality
	_available_formats = ['38', '37', '45', '22', '43', '35', '34', '18', '6', '5', '17', '13']
1062 _video_extensions
= {
1068 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
1075 return (re
.match(YoutubeIE
._VALID
_URL
, url
) is not None)
1077 def report_lang(self
):
1078 """Report attempt to set language."""
1079 self
._downloader
.to_screen(u
'[youtube] Setting language')
1081 def report_login(self
):
1082 """Report attempt to log in."""
1083 self
._downloader
.to_screen(u
'[youtube] Logging in')
1085 def report_age_confirmation(self
):
1086 """Report attempt to confirm age."""
1087 self
._downloader
.to_screen(u
'[youtube] Confirming age')
1089 def report_video_webpage_download(self
, video_id
):
1090 """Report attempt to download video webpage."""
1091 self
._downloader
.to_screen(u
'[youtube] %s: Downloading video webpage' % video_id
)
1093 def report_video_info_webpage_download(self
, video_id
):
1094 """Report attempt to download video info webpage."""
1095 self
._downloader
.to_screen(u
'[youtube] %s: Downloading video info webpage' % video_id
)
1097 def report_information_extraction(self
, video_id
):
1098 """Report attempt to extract video information."""
1099 self
._downloader
.to_screen(u
'[youtube] %s: Extracting video information' % video_id
)
	def report_unavailable_format(self, video_id, format):
		"""Report extracted video URL.

		NOTE(review): the docstring text predates the method's current
		purpose — the message actually reports that a requested format is
		unavailable for video_id.
		"""
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
	def report_rtmp_download(self):
		"""Indicate the download will use the RTMP protocol.

		Progress logging only; writes one line via the downloader.
		"""
		self._downloader.to_screen(u'[youtube] RTMP download detected')
	def _real_initialize(self):
		"""Set the site language and, when credentials are available,
		log in and confirm age before any extraction is attempted.

		All failures are soft: warnings go to stderr and the method
		returns early; only the age-confirmation failure is reported
		as an ERROR via the downloader's trouble().
		"""
		if self._downloader is None:
			return

		username = None
		password = None
		downloader_params = self._downloader.params

		# Attempt to use provided username and password or .netrc data
		if downloader_params.get('username', None) is not None:
			username = downloader_params['username']
			password = downloader_params['password']
		elif downloader_params.get('usenetrc', False):
			try:
				info = netrc.netrc().authenticators(self._NETRC_MACHINE)
				if info is not None:
					username = info[0]
					password = info[2]
				else:
					raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
			except (IOError, netrc.NetrcParseError), err:
				self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
				return

		# Set language so subsequent scraping sees English markup.
		request = urllib2.Request(self._LANG_URL)
		try:
			self.report_lang()
			urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
			return

		# No authentication to be performed
		if username is None:
			return

		# Log in
		login_form = {
				'current_form': 'loginForm',
				'next':		'/',
				'action_login':	'Log In',
				'username':	username,
				'password':	password,
				}
		request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
		try:
			self.report_login()
			login_results = urllib2.urlopen(request).read()
			# If the login form is still present in the response, the
			# credentials were rejected.
			if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
				self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
				return
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
			return

		# Confirm age
		age_form = {
				'next_url':		'/',
				'action_confirm':	'Confirm',
				}
		request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form))
		try:
			self.report_age_confirmation()
			age_results = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return
	def _real_extract(self, url):
		"""Download the watch page and get_video_info data for one video
		URL, pick the format(s) to fetch, and hand each one to the
		downloader via process_info().

		Errors are reported through self._downloader.trouble() followed by
		an early return; nothing is raised to the caller except what
		process_info() itself raises (UnavailableVideoError is caught).
		"""
		# Extract video id from URL (second capture group of _VALID_URL)
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return
		video_id = mobj.group(2)

		# Get video webpage
		self.report_video_webpage_download(video_id)
		request = urllib2.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
		try:
			video_webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
			return

		# Attempt to extract SWF player URL (needed for rtmpdump later on)
		mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
		if mobj is not None:
			# Un-escape the JSON-escaped URL (\/ -> /)
			player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
		else:
			player_url = None

		# Get video info: try several 'el' values, stopping at the first
		# response that carries a 'token' parameter.
		self.report_video_info_webpage_download(video_id)
		for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
			video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
					% (video_id, el_type))
			request = urllib2.Request(video_info_url)
			try:
				video_info_webpage = urllib2.urlopen(request).read()
				video_info = parse_qs(video_info_webpage)
				if 'token' in video_info:
					break
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
				return
		if 'token' not in video_info:
			if 'reason' in video_info:
				self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0].decode('utf-8'))
			else:
				self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
			return

		# Start extracting information
		self.report_information_extraction(video_id)

		# uploader
		if 'author' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = urllib.unquote_plus(video_info['author'][0])

		# title
		if 'title' not in video_info:
			self._downloader.trouble(u'ERROR: unable to extract video title')
			return
		video_title = urllib.unquote_plus(video_info['title'][0])
		video_title = video_title.decode('utf-8')
		video_title = sanitize_title(video_title)

		# simplified title: collapse everything outside simple_title_chars
		# into single underscores, then trim them.
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
		simple_title = simple_title.strip(ur'_')

		# thumbnail image
		if 'thumbnail_url' not in video_info:
			self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
			video_thumbnail = ''
		else:	# don't panic if we can't find it
			video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])

		# upload date: scraped from the watch page, normalized to YYYYMMDD
		upload_date = u'NA'
		mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
		if mobj is not None:
			upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
			format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
			for expression in format_expressions:
				try:
					upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
				except:
					pass

		# video description (only when the user asked for it)
		video_description = u'No description available.'
		if self._downloader.params.get('forcedescription', False) or self._downloader.params.get('writedescription', False):
			mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_webpage)
			if mobj is not None:
				video_description = mobj.group(1).decode('utf-8')
			else:
				# NOTE(review): the branching around this lxml fallback was
				# elided from this excerpt; reconstructed as a fallback for
				# when the <meta> tag is missing — confirm against the full
				# file.
				html_parser = lxml.etree.HTMLParser(encoding='utf-8')
				vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
				video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
				# TODO use another parser

		# token
		video_token = urllib.unquote_plus(video_info['token'][0])

		# Decide which formats to download
		req_format = self._downloader.params.get('format', None)

		if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
			self.report_rtmp_download()
			video_url_list = [(None, video_info['conn'][0])]
		elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
			url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
			url_data = [parse_qs(uds) for uds in url_data_strs]
			url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
			url_map = dict((ud['itag'][0], ud['url'][0]) for ud in url_data)

			# Honor -f/--max-quality by truncating the quality-ordered list
			format_limit = self._downloader.params.get('format_limit', None)
			if format_limit is not None and format_limit in self._available_formats:
				format_list = self._available_formats[self._available_formats.index(format_limit):]
			else:
				format_list = self._available_formats
			existing_formats = [x for x in format_list if x in url_map]
			if len(existing_formats) == 0:
				self._downloader.trouble(u'ERROR: no known formats available for video')
				return
			if req_format is None:
				video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
			elif req_format == '-1':
				video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
			else:
				if req_format not in url_map:
					self._downloader.trouble(u'ERROR: requested format not available')
					return
				video_url_list = [(req_format, url_map[req_format])] # Specific format
		else:
			self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
			return

		for format_param, video_real_url in video_url_list:
			# At this point we have a new video
			self._downloader.increment_downloads()

			# Extension (falls back to flv for unknown itags)
			video_extension = self._video_extensions.get(format_param, 'flv')

			try:
				# Process video information
				self._downloader.process_info({
					'id':		video_id.decode('utf-8'),
					'url':		video_real_url.decode('utf-8'),
					'uploader':	video_uploader.decode('utf-8'),
					'upload_date':	upload_date,
					'title':	video_title,
					'stitle':	simple_title,
					'ext':		video_extension.decode('utf-8'),
					'format':	(format_param is None and u'NA' or format_param.decode('utf-8')),
					'thumbnail':	video_thumbnail.decode('utf-8'),
					'description':	video_description,
					'player_url':	player_url,
				})
			except UnavailableVideoError, err:
				self._downloader.trouble(u'\nERROR: unable to download video')
class MetacafeIE(InfoExtractor):
	"""Information Extractor for metacafe.com."""

	_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
	_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
	_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
	# Delegate extractor for metacafe pages that embed YouTube videos.
	_youtube_ie = None

	def __init__(self, youtube_ie, downloader=None):
		InfoExtractor.__init__(self, downloader)
		self._youtube_ie = youtube_ie

	@staticmethod
	def suitable(url):
		"""Return True if this extractor can handle the given URL."""
		return (re.match(MetacafeIE._VALID_URL, url) is not None)

	def report_disclaimer(self):
		"""Report disclaimer retrieval."""
		self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

	def report_age_confirmation(self):
		"""Report attempt to confirm age."""
		self._downloader.to_screen(u'[metacafe] Confirming age')

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

	def _real_initialize(self):
		"""Fetch the disclaimer page and POST past the family filter."""
		# Retrieve disclaimer
		request = urllib2.Request(self._DISCLAIMER)
		try:
			self.report_disclaimer()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
			return

		# Confirm age
		disclaimer_form = {
			'filters': '0',
			'submit': "Continue - I'm over 18",
			}
		request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form))
		try:
			self.report_age_confirmation()
			disclaimer = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
			return

	def _real_extract(self, url):
		"""Extract the media URL, title and uploader for one metacafe URL."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		video_id = mobj.group(1)

		# Check if video comes from YouTube; if so, delegate entirely.
		mobj2 = re.match(r'^yt-(.*)$', video_id)
		if mobj2 is not None:
			self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
			return

		# At this point we have a new video
		self._downloader.increment_downloads()

		simple_title = mobj.group(2).decode('utf-8')

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
		if mobj is not None:
			mediaURL = urllib.unquote(mobj.group(1))
			video_extension = mediaURL[-3:]

			# Extract gdaKey if available (access key appended to the URL)
			mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
			if mobj is None:
				video_url = mediaURL
			else:
				gdaKey = mobj.group(1)
				video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
		else:
			# Fallback: pull the media URL out of the flashvars blob
			mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			vardict = parse_qs(mobj.group(1))
			if 'mediaData' not in vardict:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract media URL')
				return
			mediaURL = mobj.group(1).replace('\\/', '/')
			video_extension = mediaURL[-3:]
			video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

		mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
class DailymotionIE(InfoExtractor):
	"""Information Extractor for Dailymotion"""

	_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	@staticmethod
	def suitable(url):
		"""Return True if this extractor can handle the given URL."""
		return (re.match(DailymotionIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# No per-site setup needed.
		return

	def _real_extract(self, url):
		"""Extract the SD media URL, title and uploader for one URL."""
		# Extract id and simplified title from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		simple_title = mobj.group(2).decode('utf-8')
		video_extension = 'flv'

		# Retrieve video webpage to extract further information.
		# The cookie disables Dailymotion's family filter.
		request = urllib2.Request(url)
		request.add_header('Cookie', 'family_filter=off')
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		sequence = urllib.unquote(mobj.group(1))
		mobj = re.search(r',\"sdURL\"\:\"([^\"]+?)\",', sequence)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1)).replace('\\', '')

		# if needed add http://www.dailymotion.com/ if relative URL

		video_url = mediaURL

		mobj = re.search(r'(?im)<title>Dailymotion\s*-\s*(.+)\s*-\s*[^<]+?</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)

		mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
			return
		video_uploader = mobj.group(1)

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader.decode('utf-8'),
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
class GoogleIE(InfoExtractor):
	"""Information extractor for video.google.com."""

	_VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	@staticmethod
	def suitable(url):
		"""Return True if this extractor can handle the given URL."""
		return (re.match(GoogleIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# No per-site setup needed.
		return

	def _real_extract(self, url):
		"""Extract the media URL, title and description for one URL."""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'mp4'

		# Retrieve video webpage to extract further information
		request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r"download_url:'([^']+)'", webpage)
		if mobj is None:
			# No direct mp4 download link: fall back to the flv player URL.
			video_extension = 'flv'
			mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))
		# Replace literal backslash escape sequences with the characters
		# they encode ('=' and '&').
		mediaURL = mediaURL.replace('\\x3d', '\x3d')
		mediaURL = mediaURL.replace('\\x26', '\x26')

		video_url = mediaURL

		mobj = re.search(r'<title>(.*)</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		# Extract video description
		mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract video description')
			return
		video_description = mobj.group(1).decode('utf-8')
		if not video_description:
			video_description = 'No description available.'

		# Extract video thumbnail (requires a second page fetch, so it is
		# only done when the user explicitly asked for thumbnails).
		if self._downloader.params.get('forcethumbnail', False):
			request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
			try:
				webpage = urllib2.urlopen(request).read()
			except (urllib2.URLError, httplib.HTTPException, socket.error), err:
				self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
				return
			mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
			if mobj is None:
				self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
				return
			video_thumbnail = mobj.group(1)
		else:	# we need something to pass to process_info
			video_thumbnail = ''

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	u'NA',
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
class PhotobucketIE(InfoExtractor):
	"""Information extractor for photobucket.com."""

	# The video id captured here is the .flv filename from the
	# 'current=' query parameter.
	_VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'

	def __init__(self, downloader=None):
		InfoExtractor.__init__(self, downloader)

	@staticmethod
	def suitable(url):
		"""Return True if this extractor can handle the given URL."""
		return (re.match(PhotobucketIE._VALID_URL, url) is not None)

	def report_download_webpage(self, video_id):
		"""Report webpage download."""
		self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

	def report_extraction(self, video_id):
		"""Report information extraction."""
		self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

	def _real_initialize(self):
		# No per-site setup needed.
		return

	def _real_extract(self, url):
		"""Extract the media URL, title and uploader for one URL."""
		# Extract id from URL
		mobj = re.match(self._VALID_URL, url)
		if mobj is None:
			self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
			return

		# At this point we have a new video
		self._downloader.increment_downloads()
		video_id = mobj.group(1)

		video_extension = 'flv'

		# Retrieve video webpage to extract further information
		request = urllib2.Request(url)
		try:
			self.report_download_webpage(video_id)
			webpage = urllib2.urlopen(request).read()
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
			return

		# Extract URL, uploader, and title from webpage
		self.report_extraction(video_id)
		mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract media URL')
			return
		mediaURL = urllib.unquote(mobj.group(1))

		video_url = mediaURL

		# Title and uploader come from the same <title> match
		mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
		if mobj is None:
			self._downloader.trouble(u'ERROR: unable to extract title')
			return
		video_title = mobj.group(1).decode('utf-8')
		video_title = sanitize_title(video_title)
		simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)

		video_uploader = mobj.group(2).decode('utf-8')

		try:
			# Process video information
			self._downloader.process_info({
				'id':		video_id.decode('utf-8'),
				'url':		video_url.decode('utf-8'),
				'uploader':	video_uploader,
				'upload_date':	u'NA',
				'title':	video_title,
				'stitle':	simple_title,
				'ext':		video_extension.decode('utf-8'),
				'format':	u'NA',
				'player_url':	None,
			})
		except UnavailableVideoError:
			self._downloader.trouble(u'\nERROR: unable to download video')
1773 class YahooIE(InfoExtractor
):
1774 """Information extractor for video.yahoo.com."""
1776 # _VALID_URL matches all Yahoo! Video URLs
1777 # _VPAGE_URL matches only the extractable '/watch/' URLs
1778 _VALID_URL
= r
'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1779 _VPAGE_URL
= r
'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1781 def __init__(self
, downloader
=None):
1782 InfoExtractor
.__init
__(self
, downloader
)
1786 return (re
.match(YahooIE
._VALID
_URL
, url
) is not None)
1788 def report_download_webpage(self
, video_id
):
1789 """Report webpage download."""
1790 self
._downloader
.to_screen(u
'[video.yahoo] %s: Downloading webpage' % video_id
)
1792 def report_extraction(self
, video_id
):
1793 """Report information extraction."""
1794 self
._downloader
.to_screen(u
'[video.yahoo] %s: Extracting information' % video_id
)
1796 def _real_initialize(self
):
1799 def _real_extract(self
, url
, new_video
=True):
1800 # Extract ID from URL
1801 mobj
= re
.match(self
._VALID
_URL
, url
)
1803 self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
)
1806 # At this point we have a new video
1807 self
._downloader
.increment_downloads()
1808 video_id
= mobj
.group(2)
1809 video_extension
= 'flv'
1811 # Rewrite valid but non-extractable URLs as
1812 # extractable English language /watch/ URLs
1813 if re
.match(self
._VPAGE
_URL
, url
) is None:
1814 request
= urllib2
.Request(url
)
1816 webpage
= urllib2
.urlopen(request
).read()
1817 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1818 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1821 mobj
= re
.search(r
'\("id", "([0-9]+)"\);', webpage
)
1823 self
._downloader
.trouble(u
'ERROR: Unable to extract id field')
1825 yahoo_id
= mobj
.group(1)
1827 mobj
= re
.search(r
'\("vid", "([0-9]+)"\);', webpage
)
1829 self
._downloader
.trouble(u
'ERROR: Unable to extract vid field')
1831 yahoo_vid
= mobj
.group(1)
1833 url
= 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid
, yahoo_id
)
1834 return self
._real
_extract
(url
, new_video
=False)
1836 # Retrieve video webpage to extract further information
1837 request
= urllib2
.Request(url
)
1839 self
.report_download_webpage(video_id
)
1840 webpage
= urllib2
.urlopen(request
).read()
1841 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1842 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1845 # Extract uploader and title from webpage
1846 self
.report_extraction(video_id
)
1847 mobj
= re
.search(r
'<meta name="title" content="(.*)" />', webpage
)
1849 self
._downloader
.trouble(u
'ERROR: unable to extract video title')
1851 video_title
= mobj
.group(1).decode('utf-8')
1852 simple_title
= re
.sub(ur
'(?u)([^%s]+)' % simple_title_chars
, ur
'_', video_title
)
1854 mobj
= re
.search(r
'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage
)
1856 self
._downloader
.trouble(u
'ERROR: unable to extract video uploader')
1858 video_uploader
= mobj
.group(1).decode('utf-8')
1860 # Extract video thumbnail
1861 mobj
= re
.search(r
'<link rel="image_src" href="(.*)" />', webpage
)
1863 self
._downloader
.trouble(u
'ERROR: unable to extract video thumbnail')
1865 video_thumbnail
= mobj
.group(1).decode('utf-8')
1867 # Extract video description
1868 mobj
= re
.search(r
'<meta name="description" content="(.*)" />', webpage
)
1870 self
._downloader
.trouble(u
'ERROR: unable to extract video description')
1872 video_description
= mobj
.group(1).decode('utf-8')
1873 if not video_description
:
1874 video_description
= 'No description available.'
1876 # Extract video height and width
1877 mobj
= re
.search(r
'<meta name="video_height" content="([0-9]+)" />', webpage
)
1879 self
._downloader
.trouble(u
'ERROR: unable to extract video height')
1881 yv_video_height
= mobj
.group(1)
1883 mobj
= re
.search(r
'<meta name="video_width" content="([0-9]+)" />', webpage
)
1885 self
._downloader
.trouble(u
'ERROR: unable to extract video width')
1887 yv_video_width
= mobj
.group(1)
1889 # Retrieve video playlist to extract media URL
1890 # I'm not completely sure what all these options are, but we
1891 # seem to need most of them, otherwise the server sends a 401.
1892 yv_lg
= 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1893 yv_bitrate
= '700' # according to Wikipedia this is hard-coded
1894 request
= urllib2
.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id
+
1895 '&tech=flash&mode=playlist&lg=' + yv_lg
+ '&bitrate=' + yv_bitrate
+ '&vidH=' + yv_video_height
+
1896 '&vidW=' + yv_video_width
+ '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1898 self
.report_download_webpage(video_id
)
1899 webpage
= urllib2
.urlopen(request
).read()
1900 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1901 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1904 # Extract media URL from playlist XML
1905 mobj
= re
.search(r
'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage
)
1907 self
._downloader
.trouble(u
'ERROR: Unable to extract media URL')
1909 video_url
= urllib
.unquote(mobj
.group(1) + mobj
.group(2)).decode('utf-8')
1910 video_url
= re
.sub(r
'(?u)&(.+?);', htmlentity_transform
, video_url
)
1913 # Process video information
1914 self
._downloader
.process_info({
1915 'id': video_id
.decode('utf-8'),
1917 'uploader': video_uploader
,
1918 'upload_date': u
'NA',
1919 'title': video_title
,
1920 'stitle': simple_title
,
1921 'ext': video_extension
.decode('utf-8'),
1922 'thumbnail': video_thumbnail
.decode('utf-8'),
1923 'description': video_description
,
1924 'thumbnail': video_thumbnail
,
1927 except UnavailableVideoError
:
1928 self
._downloader
.trouble(u
'\nERROR: unable to download video')
1931 class VimeoIE(InfoExtractor
):
1932 """Information extractor for vimeo.com."""
1934 # _VALID_URL matches Vimeo URLs
1935 _VALID_URL
= r
'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
1937 def __init__(self
, downloader
=None):
1938 InfoExtractor
.__init
__(self
, downloader
)
1942 return (re
.match(VimeoIE
._VALID
_URL
, url
) is not None)
1944 def report_download_webpage(self
, video_id
):
1945 """Report webpage download."""
1946 self
._downloader
.to_screen(u
'[vimeo] %s: Downloading webpage' % video_id
)
1948 def report_extraction(self
, video_id
):
1949 """Report information extraction."""
1950 self
._downloader
.to_screen(u
'[vimeo] %s: Extracting information' % video_id
)
1952 def _real_initialize(self
):
1955 def _real_extract(self
, url
, new_video
=True):
1956 # Extract ID from URL
1957 mobj
= re
.match(self
._VALID
_URL
, url
)
1959 self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
)
1962 # At this point we have a new video
1963 self
._downloader
.increment_downloads()
1964 video_id
= mobj
.group(1)
1966 # Retrieve video webpage to extract further information
1967 request
= urllib2
.Request("http://vimeo.com/moogaloop/load/clip:%s" % video_id
, None, std_headers
)
1969 self
.report_download_webpage(video_id
)
1970 webpage
= urllib2
.urlopen(request
).read()
1971 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
1972 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
1975 # Now we begin extracting as much information as we can from what we
1976 # retrieved. First we extract the information common to all extractors,
1977 # and latter we extract those that are Vimeo specific.
1978 self
.report_extraction(video_id
)
1981 mobj
= re
.search(r
'<caption>(.*?)</caption>', webpage
)
1983 self
._downloader
.trouble(u
'ERROR: unable to extract video title')
1985 video_title
= mobj
.group(1).decode('utf-8')
1986 simple_title
= re
.sub(ur
'(?u)([^%s]+)' % simple_title_chars
, ur
'_', video_title
)
1989 mobj
= re
.search(r
'<uploader_url>http://vimeo.com/(.*?)</uploader_url>', webpage
)
1991 self
._downloader
.trouble(u
'ERROR: unable to extract video uploader')
1993 video_uploader
= mobj
.group(1).decode('utf-8')
1995 # Extract video thumbnail
1996 mobj
= re
.search(r
'<thumbnail>(.*?)</thumbnail>', webpage
)
1998 self
._downloader
.trouble(u
'ERROR: unable to extract video thumbnail')
2000 video_thumbnail
= mobj
.group(1).decode('utf-8')
2002 # # Extract video description
2003 # mobj = re.search(r'<meta property="og:description" content="(.*)" />', webpage)
2005 # self._downloader.trouble(u'ERROR: unable to extract video description')
2007 # video_description = mobj.group(1).decode('utf-8')
2008 # if not video_description: video_description = 'No description available.'
2009 video_description
= 'Foo.'
2011 # Vimeo specific: extract request signature
2012 mobj
= re
.search(r
'<request_signature>(.*?)</request_signature>', webpage
)
2014 self
._downloader
.trouble(u
'ERROR: unable to extract request signature')
2016 sig
= mobj
.group(1).decode('utf-8')
2018 # Vimeo specific: Extract request signature expiration
2019 mobj
= re
.search(r
'<request_signature_expires>(.*?)</request_signature_expires>', webpage
)
2021 self
._downloader
.trouble(u
'ERROR: unable to extract request signature expiration')
2023 sig_exp
= mobj
.group(1).decode('utf-8')
2025 video_url
= "http://vimeo.com/moogaloop/play/clip:%s/%s/%s" % (video_id
, sig
, sig_exp
)
2028 # Process video information
2029 self
._downloader
.process_info({
2030 'id': video_id
.decode('utf-8'),
2032 'uploader': video_uploader
,
2033 'upload_date': u
'NA',
2034 'title': video_title
,
2035 'stitle': simple_title
,
2037 'thumbnail': video_thumbnail
.decode('utf-8'),
2038 'description': video_description
,
2039 'thumbnail': video_thumbnail
,
2040 'description': video_description
,
2043 except UnavailableVideoError
:
2044 self
._downloader
.trouble(u
'ERROR: unable to download video')
2047 class GenericIE(InfoExtractor
):
2048 """Generic last-resort information extractor."""
2050 def __init__(self
, downloader
=None):
2051 InfoExtractor
.__init
__(self
, downloader
)
2057 def report_download_webpage(self
, video_id
):
2058 """Report webpage download."""
2059 self
._downloader
.to_screen(u
'WARNING: Falling back on generic information extractor.')
2060 self
._downloader
.to_screen(u
'[generic] %s: Downloading webpage' % video_id
)
2062 def report_extraction(self
, video_id
):
2063 """Report information extraction."""
2064 self
._downloader
.to_screen(u
'[generic] %s: Extracting information' % video_id
)
2066 def _real_initialize(self
):
2069 def _real_extract(self
, url
):
2070 # At this point we have a new video
2071 self
._downloader
.increment_downloads()
2073 video_id
= url
.split('/')[-1]
2074 request
= urllib2
.Request(url
)
2076 self
.report_download_webpage(video_id
)
2077 webpage
= urllib2
.urlopen(request
).read()
2078 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
2079 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
2081 except ValueError, err
:
2082 # since this is the last-resort InfoExtractor, if
2083 # this error is thrown, it'll be thrown here
2084 self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
)
2087 self
.report_extraction(video_id
)
2088 # Start with something easy: JW Player in SWFObject
2089 mobj
= re
.search(r
'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
2091 # Broaden the search a little bit
2092 mobj = re.search(r'[^A
-Za
-z0
-9]?
(?
:file|source
)=(http
[^
\'"&]*)', webpage)
2094 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2097 # It's possible that one of the regexes
2098 # matched, but returned an empty group:
2099 if mobj.group(1) is None:
2100 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
2103 video_url = urllib.unquote(mobj.group(1))
2104 video_id = os.path.basename(video_url)
2106 # here's a fun little line of code for you:
2107 video_extension = os.path.splitext(video_id)[1][1:]
2108 video_id = os.path.splitext(video_id)[0]
2110 # it's tempting to parse this further, but you would
2111 # have to take into account all the variations like
2112 # Video Title - Site Name
2113 # Site Name | Video Title
2114 # Video Title - Tagline | Site Name
2115 # and so on and so forth; it's just not practical
2116 mobj = re.search(r'<title>(.*)</title>', webpage)
2118 self._downloader.trouble(u'ERROR: unable to extract title')
2120 video_title = mobj.group(1).decode('utf-8')
2121 video_title = sanitize_title(video_title)
2122 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
2124 # video uploader is domain name
2125 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
2127 self._downloader.trouble(u'ERROR: unable to extract title')
2129 video_uploader = mobj.group(1).decode('utf-8')
2132 # Process video information
2133 self._downloader.process_info({
2134 'id': video_id.decode('utf-8'),
2135 'url': video_url.decode('utf-8'),
2136 'uploader': video_uploader,
2137 'upload_date': u'NA',
2138 'title': video_title,
2139 'stitle': simple_title,
2140 'ext': video_extension.decode('utf-8'),
2144 except UnavailableVideoError, err:
2145 self._downloader.trouble(u'\nERROR: unable to download video')
2148 class YoutubeSearchIE(InfoExtractor):
2149 """Information Extractor for YouTube search queries."""
2150 _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
2151 _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
2152 _VIDEO_INDICATOR = r'href="/watch
\?v
=.+?
"'
2153 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2155 _max_youtube_results = 1000
2157 def __init__(self, youtube_ie, downloader=None):
2158 InfoExtractor.__init__(self, downloader)
2159 self._youtube_ie = youtube_ie
2163 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
2165 def report_download_page(self, query, pagenum):
2166 """Report attempt to download playlist page with given number."""
2167 query = query.decode(preferredencoding())
2168 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
2170 def _real_initialize(self):
2171 self._youtube_ie.initialize()
2173 def _real_extract(self, query):
2174 mobj = re.match(self._VALID_QUERY, query)
2176 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2179 prefix, query = query.split(':')
2181 query = query.encode('utf-8')
2183 self._download_n_results(query, 1)
2185 elif prefix == 'all':
2186 self._download_n_results(query, self._max_youtube_results)
2192 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2194 elif n > self._max_youtube_results:
2195 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
2196 n = self._max_youtube_results
2197 self._download_n_results(query, n)
2199 except ValueError: # parsing prefix as integer fails
2200 self._download_n_results(query, 1)
2203 def _download_n_results(self, query, n):
2204 """Downloads a specified number of results for a query"""
2207 already_seen = set()
2211 self.report_download_page(query, pagenum)
2212 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2213 request = urllib2.Request(result_url)
2215 page = urllib2.urlopen(request).read()
2216 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2217 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2220 # Extract video identifiers
2221 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2222 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
2223 if video_id not in already_seen:
2224 video_ids.append(video_id)
2225 already_seen.add(video_id)
2226 if len(video_ids) == n:
2227 # Specified n videos reached
2228 for id in video_ids:
2229 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2232 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2233 for id in video_ids:
2234 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2237 pagenum = pagenum + 1
2240 class GoogleSearchIE(InfoExtractor):
2241 """Information Extractor for Google Video search queries."""
2242 _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
2243 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
2244 _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
2245 _MORE_PAGES_INDICATOR = r'<span>Next</span>'
2247 _max_google_results = 1000
2249 def __init__(self, google_ie, downloader=None):
2250 InfoExtractor.__init__(self, downloader)
2251 self._google_ie = google_ie
2255 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
2257 def report_download_page(self, query, pagenum):
2258 """Report attempt to download playlist page with given number."""
2259 query = query.decode(preferredencoding())
2260 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
2262 def _real_initialize(self):
2263 self._google_ie.initialize()
2265 def _real_extract(self, query):
2266 mobj = re.match(self._VALID_QUERY, query)
2268 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2271 prefix, query = query.split(':')
2273 query = query.encode('utf-8')
2275 self._download_n_results(query, 1)
2277 elif prefix == 'all':
2278 self._download_n_results(query, self._max_google_results)
2284 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2286 elif n > self._max_google_results:
2287 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
2288 n = self._max_google_results
2289 self._download_n_results(query, n)
2291 except ValueError: # parsing prefix as integer fails
2292 self._download_n_results(query, 1)
2295 def _download_n_results(self, query, n):
2296 """Downloads a specified number of results for a query"""
2299 already_seen = set()
2303 self.report_download_page(query, pagenum)
2304 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2305 request = urllib2.Request(result_url)
2307 page = urllib2.urlopen(request).read()
2308 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2309 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2312 # Extract video identifiers
2313 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2314 video_id = mobj.group(1)
2315 if video_id not in already_seen:
2316 video_ids.append(video_id)
2317 already_seen.add(video_id)
2318 if len(video_ids) == n:
2319 # Specified n videos reached
2320 for id in video_ids:
2321 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2324 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2325 for id in video_ids:
2326 self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
2329 pagenum = pagenum + 1
2332 class YahooSearchIE(InfoExtractor):
2333 """Information Extractor for Yahoo! Video search queries."""
2334 _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
2335 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
2336 _VIDEO_INDICATOR = r'href="http
://video\
.yahoo\
.com
/watch
/([0-9]+/[0-9]+)"'
2337 _MORE_PAGES_INDICATOR = r'\s*Next'
2339 _max_yahoo_results = 1000
2341 def __init__(self, yahoo_ie, downloader=None):
2342 InfoExtractor.__init__(self, downloader)
2343 self._yahoo_ie = yahoo_ie
2347 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
2349 def report_download_page(self, query, pagenum):
2350 """Report attempt to download playlist page with given number."""
2351 query = query.decode(preferredencoding())
2352 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
2354 def _real_initialize(self):
2355 self._yahoo_ie.initialize()
2357 def _real_extract(self, query):
2358 mobj = re.match(self._VALID_QUERY, query)
2360 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
2363 prefix, query = query.split(':')
2365 query = query.encode('utf-8')
2367 self._download_n_results(query, 1)
2369 elif prefix == 'all':
2370 self._download_n_results(query, self._max_yahoo_results)
2376 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
2378 elif n > self._max_yahoo_results:
2379 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
2380 n = self._max_yahoo_results
2381 self._download_n_results(query, n)
2383 except ValueError: # parsing prefix as integer fails
2384 self._download_n_results(query, 1)
2387 def _download_n_results(self, query, n):
2388 """Downloads a specified number of results for a query"""
2391 already_seen = set()
2395 self.report_download_page(query, pagenum)
2396 result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
2397 request = urllib2.Request(result_url)
2399 page = urllib2.urlopen(request).read()
2400 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2401 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2404 # Extract video identifiers
2405 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2406 video_id = mobj.group(1)
2407 if video_id not in already_seen:
2408 video_ids.append(video_id)
2409 already_seen.add(video_id)
2410 if len(video_ids) == n:
2411 # Specified n videos reached
2412 for id in video_ids:
2413 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2416 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2417 for id in video_ids:
2418 self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
2421 pagenum = pagenum + 1
2424 class YoutubePlaylistIE(InfoExtractor):
2425 """Information Extractor for YouTube playlists."""
2427 _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)([0-9A-Za-z]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
2428 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
2429 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2430 _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
2433 def __init__(self, youtube_ie, downloader=None):
2434 InfoExtractor.__init__(self, downloader)
2435 self._youtube_ie = youtube_ie
2439 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
2441 def report_download_page(self, playlist_id, pagenum):
2442 """Report attempt to download playlist page with given number."""
2443 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
2445 def _real_initialize(self):
2446 self._youtube_ie.initialize()
2448 def _real_extract(self, url):
2449 # Extract playlist id
2450 mobj = re.match(self._VALID_URL, url)
2452 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2456 if mobj.group(3) is not None:
2457 self._youtube_ie.extract(mobj.group(3))
2460 # Download playlist pages
2461 # prefix is 'p' as default for playlists but there are other types that need extra care
2462 playlist_prefix = mobj.group(1)
2463 if playlist_prefix == 'a':
2464 playlist_access = 'artist'
2466 playlist_prefix = 'p'
2467 playlist_access = 'view_play_list'
2468 playlist_id = mobj.group(2)
2473 self.report_download_page(playlist_id, pagenum)
2474 request = urllib2.Request(self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum))
2476 page = urllib2.urlopen(request).read()
2477 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2478 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2481 # Extract video identifiers
2483 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2484 if mobj.group(1) not in ids_in_page:
2485 ids_in_page.append(mobj.group(1))
2486 video_ids.extend(ids_in_page)
2488 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
2490 pagenum = pagenum + 1
2492 playliststart = self._downloader.params.get('playliststart', 1) - 1
2493 playlistend = self._downloader.params.get('playlistend', -1)
2494 video_ids = video_ids[playliststart:playlistend]
2496 for id in video_ids:
2497 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
2501 class YoutubeUserIE(InfoExtractor):
2502 """Information Extractor for YouTube users."""
2504 _VALID_URL = r'(?:(?:(?:http://)?(?:\w+\.)?youtube.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
2505 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
2506 _GDATA_PAGE_SIZE = 50
2507 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
2508 _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
2511 def __init__(self, youtube_ie, downloader=None):
2512 InfoExtractor.__init__(self, downloader)
2513 self._youtube_ie = youtube_ie
2517 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
2519 def report_download_page(self, username, start_index):
2520 """Report attempt to download user page."""
2521 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
2522 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
2524 def _real_initialize(self):
2525 self._youtube_ie.initialize()
2527 def _real_extract(self, url):
2529 mobj = re.match(self._VALID_URL, url)
2531 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
2534 username = mobj.group(1)
2536 # Download video ids using YouTube Data API. Result size per
2537 # query is limited (currently to 50 videos) so we need to query
2538 # page by page until there are no video ids - it means we got
2545 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
2546 self.report_download_page(username, start_index)
2548 request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
2551 page = urllib2.urlopen(request).read()
2552 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2553 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
2556 # Extract video identifiers
2559 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
2560 if mobj.group(1) not in ids_in_page:
2561 ids_in_page.append(mobj.group(1))
2563 video_ids.extend(ids_in_page)
2565 # A little optimization - if current page is not
2566 # "full
", ie. does not contain PAGE_SIZE video ids then
2567 # we can assume that this page is the last one - there
2568 # are no more ids on further pages - no need to query
2571 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
2576 all_ids_count = len(video_ids)
2577 playliststart = self._downloader.params.get('playliststart', 1) - 1
2578 playlistend = self._downloader.params.get('playlistend', -1)
2580 if playlistend == -1:
2581 video_ids = video_ids[playliststart:]
2583 video_ids = video_ids[playliststart:playlistend]
2585 self._downloader.to_screen("[youtube
] user
%s: Collected
%d video
ids (downloading
%d of them
)" %
2586 (username, all_ids_count, len(video_ids)))
2588 for video_id in video_ids:
2589 self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % video_id)
2592 class DepositFilesIE(InfoExtractor):
2593 """Information extractor for depositfiles.com"""
2595 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)'
2597 def __init__(self, downloader=None):
2598 InfoExtractor.__init__(self, downloader)
2602 return (re.match(DepositFilesIE._VALID_URL, url) is not None)
2604 def report_download_webpage(self, file_id):
2605 """Report webpage download."""
2606 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2608 def report_extraction(self, file_id):
2609 """Report information extraction."""
2610 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2612 def _real_initialize(self):
2615 def _real_extract(self, url):
2616 # At this point we have a new file
2617 self._downloader.increment_downloads()
2619 file_id = url.split('/')[-1]
2620 # Rebuild url in english locale
2621 url = 'http://depositfiles.com/en/files/' + file_id
2623 # Retrieve file webpage with 'Free download' button pressed
2624 free_download_indication = { 'gateway_result' : '1' }
2625 request = urllib2.Request(url, urllib.urlencode(free_download_indication))
2627 self.report_download_webpage(file_id)
2628 webpage = urllib2.urlopen(request).read()
2629 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2630 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
2633 # Search for the real file URL
2634 mobj = re.search(r'<form action="(http
://fileshare
.+?
)"', webpage)
2635 if (mobj is None) or (mobj.group(1) is None):
2636 # Try to figure out reason of the error.
2637 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2638 if (mobj is not None) and (mobj.group(1) is not None):
2639 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2640 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2642 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2645 file_url = mobj.group(1)
2646 file_extension = os.path.splitext(file_url)[1][1:]
2648 # Search for file title
2649 mobj = re.search(r'<b title="(.*?
)">', webpage)
2651 self._downloader.trouble(u'ERROR: unable to extract title')
2653 file_title = mobj.group(1).decode('utf-8')
2656 # Process file information
2657 self._downloader.process_info({
2658 'id': file_id.decode('utf-8'),
2659 'url': file_url.decode('utf-8'),
2661 'upload_date': u'NA',
2662 'title': file_title,
2663 'stitle': file_title,
2664 'ext': file_extension.decode('utf-8'),
2668 except UnavailableVideoError, err:
2669 self._downloader.trouble(u'ERROR: unable to download file')
2672 class FacebookIE(InfoExtractor):
2673 """Information Extractor for Facebook"""
2675 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook.com/video/video.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2676 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2677 _NETRC_MACHINE = 'facebook'
2678 _available_formats = ['highqual', 'lowqual']
2679 _video_extensions = {
2684 def __init__(self, downloader=None):
2685 InfoExtractor.__init__(self, downloader)
2689 return (re.match(FacebookIE._VALID_URL, url) is not None)
2691 def _reporter(self, message):
2692 """Add header and report message."""
2693 self._downloader.to_screen(u'[facebook] %s' % message)
2695 def report_login(self):
2696 """Report attempt to log in."""
2697 self._reporter(u'Logging in')
2699 def report_video_webpage_download(self, video_id):
2700 """Report attempt to download video webpage."""
2701 self._reporter(u'%s: Downloading video webpage' % video_id)
2703 def report_information_extraction(self, video_id):
2704 """Report attempt to extract video information."""
2705 self._reporter(u'%s: Extracting video information' % video_id)
2707 def _parse_page(self, video_webpage):
2708 """Extract video information from page"""
2710 data = {'title': r'class="video_title datawrap
">(.*?)</',
2711 'description': r'<div class="datawrap
">(.*?)</div>',
2712 'owner': r'\("video_owner_name
", "(.*?
)"\)',
2713 'upload_date': r'data-date="(.*?
)"',
2714 'thumbnail': r'\("thumb_url
", "(?P
<THUMB
>.*?
)"\)',
2717 for piece in data.keys():
2718 mobj = re.search(data[piece], video_webpage)
2719 if mobj is not None:
2720 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape
"))
2724 for fmt in self._available_formats:
2725 mobj = re.search(r'\("%s_src
\", "(.+?)"\
)' % fmt, video_webpage)
2726 if mobj is not None:
2727 # URL is in a Javascript segment inside an escaped Unicode format within
2728 # the generally utf-8 page
2729 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
2730 video_info['video_urls
'] = video_urls
2734 def _real_initialize(self):
2735 if self._downloader is None:
2740 downloader_params = self._downloader.params
2742 # Attempt to use provided username and password or .netrc data
2743 if downloader_params.get('username
', None) is not None:
2744 useremail = downloader_params['username
']
2745 password = downloader_params['password
']
2746 elif downloader_params.get('usenetrc
', False):
2748 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2749 if info is not None:
2753 raise netrc.NetrcParseError('No authenticators
for %s' % self._NETRC_MACHINE)
2754 except (IOError, netrc.NetrcParseError), err:
2755 self._downloader.to_stderr(u'WARNING
: parsing
.netrc
: %s' % str(err))
2758 if useremail is None:
2767 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
2770 login_results = urllib2.urlopen(request).read()
2771 if re.search(r'<form(.*)name
="login"(.*)</form
>', login_results) is not None:
2772 self._downloader.to_stderr(u'WARNING
: unable to log
in: bad username
/password
, or exceded login rate
limit (~
3/min). Check credentials
or wait
.')
2774 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2775 self._downloader.to_stderr(u'WARNING
: unable to log
in: %s' % str(err))
2778 def _real_extract(self, url):
2779 mobj = re.match(self._VALID_URL, url)
2781 self._downloader.trouble(u'ERROR
: invalid URL
: %s' % url)
2783 video_id = mobj.group('ID
')
2786 self.report_video_webpage_download(video_id)
2787 request = urllib2.Request('https
://www
.facebook
.com
/video
/video
.php?v
=%s' % video_id)
2789 page = urllib2.urlopen(request)
2790 video_webpage = page.read()
2791 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2792 self._downloader.trouble(u'ERROR
: unable to download video webpage
: %s' % str(err))
2795 # Start extracting information
2796 self.report_information_extraction(video_id)
2798 # Extract information
2799 video_info = self._parse_page(video_webpage)
2802 if 'owner
' not in video_info:
2803 self._downloader.trouble(u'ERROR
: unable to extract uploader nickname
')
2805 video_uploader = video_info['owner
']
2808 if 'title
' not in video_info:
2809 self._downloader.trouble(u'ERROR
: unable to extract video title
')
2811 video_title = video_info['title
']
2812 video_title = video_title.decode('utf
-8')
2813 video_title = sanitize_title(video_title)
2816 simple_title = re.sub(ur'(?u
)([^
%s]+)' % simple_title_chars, ur'_
', video_title)
2817 simple_title = simple_title.strip(ur'_
')
2820 if 'thumbnail
' not in video_info:
2821 self._downloader.trouble(u'WARNING
: unable to extract video thumbnail
')
2822 video_thumbnail = ''
2824 video_thumbnail = video_info['thumbnail
']
2828 if 'upload_date
' in video_info:
2829 upload_time = video_info['upload_date
']
2830 timetuple = email.utils.parsedate_tz(upload_time)
2831 if timetuple is not None:
2833 upload_date = time.strftime('%Y
%m
%d', timetuple[0:9])
2838 video_description = video_info.get('description
', 'No description available
.')
2840 url_map = video_info['video_urls
']
2841 if len(url_map.keys()) > 0:
2842 # Decide which formats to download
2843 req_format = self._downloader.params.get('format
', None)
2844 format_limit = self._downloader.params.get('format_limit
', None)
2846 if format_limit is not None and format_limit in self._available_formats:
2847 format_list = self._available_formats[self._available_formats.index(format_limit):]
2849 format_list = self._available_formats
2850 existing_formats = [x for x in format_list if x in url_map]
2851 if len(existing_formats) == 0:
2852 self._downloader.trouble(u'ERROR
: no known formats available
for video
')
2854 if req_format is None:
2855 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
2856 elif req_format == '-1':
2857 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
2860 if req_format not in url_map:
2861 self._downloader.trouble(u'ERROR
: requested format
not available
')
2863 video_url_list = [(req_format, url_map[req_format])] # Specific format
2865 for format_param, video_real_url in video_url_list:
2867 # At this point we have a new video
2868 self._downloader.increment_downloads()
2871 video_extension = self._video_extensions.get(format_param, 'mp4
')
2874 # Process video information
2875 self._downloader.process_info({
2876 'id': video_id.decode('utf
-8'),
2877 'url
': video_real_url.decode('utf
-8'),
2878 'uploader
': video_uploader.decode('utf
-8'),
2879 'upload_date
': upload_date,
2880 'title
': video_title,
2881 'stitle
': simple_title,
2882 'ext
': video_extension.decode('utf
-8'),
2883 'format
': (format_param is None and u'NA
' or format_param.decode('utf
-8')),
2884 'thumbnail
': video_thumbnail.decode('utf
-8'),
2885 'description
': video_description.decode('utf
-8'),
2888 except UnavailableVideoError, err:
2889 self._downloader.trouble(u'\nERROR
: unable to download video
')
2891 class BlipTVIE(InfoExtractor):
2892 """Information extractor for blip.tv"""
2894 _VALID_URL = r'^
(?
:https?
://)?
(?
:\w
+\
.)?blip\
.tv(/.+)$
'
2895 _URL_EXT = r'^
.*\
.([a
-z0
-9]+)$
'
2899 return (re.match(BlipTVIE._VALID_URL, url) is not None)
2901 def report_extraction(self, file_id):
2902 """Report information extraction."""
2903 self._downloader.to_screen(u'[blip
.tv
] %s: Extracting information
' % file_id)
2905 def _simplify_title(self, title):
2906 res = re.sub(ur'(?u
)([^
%s]+)' % simple_title_chars, ur'_
', title)
2907 res = res.strip(ur'_
')
2910 def _real_extract(self, url):
2911 mobj = re.match(self._VALID_URL, url)
2913 self._downloader.trouble(u'ERROR
: invalid URL
: %s' % url)
2920 json_url = url + cchar + 'skin
=json
&version
=2&no_wrap
=1'
2921 request = urllib2.Request(json_url)
2922 self.report_extraction(mobj.group(1))
2924 json_code = urllib2.urlopen(request).read()
2925 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2926 self._downloader.trouble(u'ERROR
: unable to download video info webpage
: %s' % str(err))
2929 json_data = json.loads(json_code)
2930 if 'Post
' in json_data:
2931 data = json_data['Post
']
2935 upload_date = datetime.datetime.strptime(data['datestamp
'], '%m
-%d-%y
%H
:%M
%p
').strftime('%Y
%m
%d')
2936 video_url = data['media
']['url
']
2937 umobj = re.match(self._URL_EXT, video_url)
2939 raise ValueError('Can
not determine filename extension
')
2940 ext = umobj.group(1)
2942 self._downloader.increment_downloads()
2945 'id': data['item_id
'],
2947 'uploader
': data['display_name
'],
2948 'upload_date
': upload_date,
2949 'title
': data['title
'],
2950 'stitle
': self._simplify_title(data['title
']),
2952 'format
': data['media
']['mimeType
'],
2953 'thumbnail
': data['thumbnailUrl
'],
2954 'description
': data['description
'],
2955 'player_url
': data['embedUrl
']
2957 except (ValueError,KeyError), err:
2958 self._downloader.trouble(u'ERROR
: unable to parse video information
: %s' % repr(err))
2962 self._downloader.process_info(info)
2963 except UnavailableVideoError, err:
2964 self._downloader.trouble(u'\nERROR
: unable to download video
')
2967 class MyVideoIE(InfoExtractor):
2968 """Information Extractor for myvideo.de."""
2970 _VALID_URL = r'(?
:http
://)?
(?
:www\
.)?myvideo\
.de
/watch
/([0-9]+)/([^?
/]+).*'
2972 def __init__(self, downloader=None):
2973 InfoExtractor.__init__(self, downloader)
2977 return (re.match(MyVideoIE._VALID_URL, url) is not None)
2979 def report_download_webpage(self, video_id):
2980 """Report webpage download."""
2981 self._downloader.to_screen(u'[myvideo
] %s: Downloading webpage
' % video_id)
2983 def report_extraction(self, video_id):
2984 """Report information extraction."""
2985 self._downloader.to_screen(u'[myvideo
] %s: Extracting information
' % video_id)
2987 def _real_initialize(self):
2990 def _real_extract(self,url):
2991 mobj = re.match(self._VALID_URL, url)
2993 self._download.trouble(u'ERROR
: invalid URL
: %s' % url)
2996 video_id = mobj.group(1)
2997 simple_title = mobj.group(2).decode('utf
-8')
2998 # should actually not be necessary
2999 simple_title = sanitize_title(simple_title)
3000 simple_title = re.sub(ur'(?u
)([^
%s]+)' % simple_title_chars, ur'_
', simple_title)
3003 request = urllib2.Request('http
://www
.myvideo
.de
/watch
/%s' % video_id)
3005 self.report_download_webpage(video_id)
3006 webpage = urllib2.urlopen(request).read()
3007 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3008 self._downloader.trouble(u'ERROR
: Unable to retrieve video webpage
: %s' % str(err))
3011 self.report_extraction(video_id)
3012 mobj = re.search(r'<link rel
=\'image_src
\' href
=\'(http
://is[0-9].myvideo\
.de
/de
/movie
[0-9]+/[a
-f0
-9]+)/thumbs
/[^
.]+\
.jpg
\' />',
3015 self._downloader.trouble(u'ERROR
: unable to extract media URL
')
3017 video_url = mobj.group(1) + ('/%s.flv
' % video_id)
3019 mobj = re.search('<title
>([^
<]+)</title
>', webpage)
3021 self._downloader.trouble(u'ERROR
: unable to extract title
')
3024 video_title = mobj.group(1)
3025 video_title = sanitize_title(video_title)
3029 self._downloader.process_info({
3033 'upload_date
': u'NA
',
3034 'title
': video_title,
3035 'stitle
': simple_title,
3040 except UnavailableVideoError:
3041 self._downloader.trouble(u'\nERROR
: Unable to download video
')
3043 class ComedyCentralIE(InfoExtractor):
3044 """Information extractor for The Daily Show and Colbert Report """
3046 _VALID_URL = r'^
(:(?P
<shortname
>tds|thedailyshow|cr|colbert|colbertnation|colbertreport
))|
(https?
://)?
(www\
.)(?P
<showname
>thedailyshow|colbertnation
)\
.com
/full
-episodes
/(?P
<episode
>.*)$
'
3050 return (re.match(ComedyCentralIE._VALID_URL, url) is not None)
3052 def report_extraction(self, episode_id):
3053 self._downloader.to_screen(u'[comedycentral
] %s: Extracting information
' % episode_id)
3055 def report_config_download(self, episode_id):
3056 self._downloader.to_screen(u'[comedycentral
] %s: Downloading configuration
' % episode_id)
3058 def report_player_url(self, episode_id):
3059 self._downloader.to_screen(u'[comedycentral
] %s: Determining player URL
' % episode_id)
3061 def _simplify_title(self, title):
3062 res = re.sub(ur'(?u
)([^
%s]+)' % simple_title_chars, ur'_
', title)
3063 res = res.strip(ur'_
')
3066 def _real_extract(self, url):
3067 mobj = re.match(self._VALID_URL, url)
3069 self._downloader.trouble(u'ERROR
: invalid URL
: %s' % url)
3072 if mobj.group('shortname
'):
3073 if mobj.group('shortname
') in ('tds
', 'thedailyshow
'):
3074 url = 'http
://www
.thedailyshow
.com
/full
-episodes
/'
3076 url = 'http
://www
.colbertnation
.com
/full
-episodes
/'
3077 mobj = re.match(self._VALID_URL, url)
3078 assert mobj is not None
3080 dlNewest = not mobj.group('episode
')
3082 epTitle = mobj.group('showname
')
3084 epTitle = mobj.group('episode
')
3086 req = urllib2.Request(url)
3087 self.report_extraction(epTitle)
3089 htmlHandle = urllib2.urlopen(req)
3090 html = htmlHandle.read()
3091 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3092 self._downloader.trouble(u'ERROR
: unable to download webpage
: %s' % unicode(err))
3095 url = htmlHandle.geturl()
3096 mobj = re.match(self._VALID_URL, url)
3098 self._downloader.trouble(u'ERROR
: Invalid redirected URL
: ' + url)
3100 if mobj.group('episode
') == '':
3101 self._downloader.trouble(u'ERROR
: Redirected URL
is still
not specific
: ' + url)
3103 epTitle = mobj.group('episode
')
3105 mMovieParams = re.findall('<param name
="movie" value
="(http://media.mtvnservices.com/(.*?:episode:([^:]*):)(.*?))"/>', html)
3106 if len(mMovieParams) == 0:
3107 self._downloader.trouble(u'ERROR
: unable to find Flash URL
in webpage
' + url)
3109 show_id = mMovieParams[0][2]
3110 ACT_COUNT = { # TODO: Detect this dynamically
3111 'thedailyshow
.com
': 4,
3112 'colbertnation
.com
': 3,
3115 'thedailyshow
.com
': 1,
3116 'colbertnation
.com
': 1,
3119 first_player_url = mMovieParams[0][0]
3120 startMediaNum = int(mMovieParams[0][3]) + OFFSET
3121 movieId = mMovieParams[0][1]
3123 playerReq = urllib2.Request(first_player_url)
3124 self.report_player_url(epTitle)
3126 playerResponse = urllib2.urlopen(playerReq)
3127 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3128 self._downloader.trouble(u'ERROR
: unable to download player
: %s' % unicode(err))
3130 player_url = playerResponse.geturl()
3132 for actNum in range(ACT_COUNT):
3133 mediaNum = startMediaNum + actNum
3134 mediaId = movieId + str(mediaNum)
3135 configUrl = ('http
://www
.comedycentral
.com
/global/feeds
/entertainment
/media
/mediaGenEntertainment
.jhtml?
' +
3136 urllib.urlencode({'uri
': mediaId}))
3137 configReq = urllib2.Request(configUrl)
3138 self.report_config_download(epTitle)
3140 configXml = urllib2.urlopen(configReq).read()
3141 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
3142 self._downloader.trouble(u'ERROR
: unable to download webpage
: %s' % unicode(err))
3145 cdoc = xml.etree.ElementTree.fromstring(configXml)
3147 for rendition in cdoc.findall('.//rendition
'):
3148 finfo = (rendition.attrib['bitrate
'], rendition.findall('./src
')[0].text)
3152 self._downloader.trouble(u'\nERROR
: unable to download
' + str(mediaNum) + ': No videos found
')
3155 # For now, just pick the highest bitrate
3156 format,video_url = turls[-1]
3158 self._downloader.increment_downloads()
3160 effTitle = show_id.replace('.com
', '') + '-' + epTitle
3162 'id': str(mediaNum),
3164 'uploader
': show_id,
3165 'upload_date
': 'NA
',
3167 'stitle
': self._simplify_title(effTitle),
3171 'description
': 'TODO
: Not yet supported
',
3172 'player_url
': player_url
3176 self._downloader.process_info(info)
3177 except UnavailableVideoError, err:
3178 self._downloader.trouble(u'\nERROR
: unable to download
' + str(mediaNum))
class PostProcessor(object):
	"""Post Processor class.

	PostProcessor objects can be added to downloaders with their
	add_post_processor() method. When the downloader has finished a
	successful download, it will take its internal chain of PostProcessors
	and start calling the run() method on each one of them, first with
	an initial argument and then with the returned value of the previous
	PostProcessor.

	The chain will be stopped if one of them ever returns None or the end
	of the chain is reached.

	PostProcessor objects follow a "mutual registration" process similar
	to InfoExtractor objects.
	"""

	# Reference back to the owning FileDownloader; set either at
	# construction time or later via set_downloader().
	_downloader = None

	def __init__(self, downloader=None):
		self._downloader = downloader

	def set_downloader(self, downloader):
		"""Sets the downloader for this PP."""
		self._downloader = downloader

	def run(self, information):
		"""Run the PostProcessor.

		The "information" argument is a dictionary like the ones
		composed by InfoExtractors. The only difference is that this
		one has an extra field called "filepath" that points to the
		downloaded file.

		When this method returns None, the postprocessing chain is
		stopped. However, this method may return an information
		dictionary that will be passed to the next postprocessing
		object in the chain. It can be the one it received after
		changing some fields.

		In addition, this method may raise a PostProcessingError
		exception that will be taken into account by the downloader
		it was called from.
		"""
		return information # by default, do nothing
class FFmpegExtractAudioPP(PostProcessor):
	"""Post-processor that converts a downloaded video into an
	audio-only file using the external ffmpeg/ffprobe binaries.
	"""

	def __init__(self, downloader=None, preferredcodec=None):
		PostProcessor.__init__(self, downloader)
		if preferredcodec is None:
			preferredcodec = 'best'
		# 'best' (keep/copy when possible), 'aac' or 'mp3'.
		self._preferredcodec = preferredcodec

	@staticmethod
	def get_audio_codec(path):
		"""Return the audio codec name of the file at path via ffprobe,
		or None if ffprobe is unavailable or no audio stream is found."""
		try:
			cmd = ['ffprobe', '-show_streams', '--', path]
			handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE)
			output = handle.communicate()[0]
			if handle.wait() != 0:
				return None
		except (IOError, OSError):
			return None
		audio_codec = None
		# ffprobe prints codec_name before codec_type within each stream block.
		for line in output.split('\n'):
			if line.startswith('codec_name='):
				audio_codec = line.split('=')[1].strip()
			elif line.strip() == 'codec_type=audio' and audio_codec is not None:
				return audio_codec
		return None

	@staticmethod
	def run_ffmpeg(path, out_path, codec, more_opts):
		"""Transcode path to out_path with the given audio codec.
		Returns True on success, False otherwise."""
		try:
			cmd = ['ffmpeg', '-y', '-i', path, '-vn', '-acodec', codec] + more_opts + ['--', out_path]
			ret = subprocess.call(cmd, stdout=file(os.path.devnull, 'w'), stderr=subprocess.STDOUT)
			return (ret == 0)
		except (IOError, OSError):
			return False

	def run(self, information):
		path = information['filepath']

		filecodec = self.get_audio_codec(path)
		if filecodec is None:
			self._downloader.to_stderr(u'WARNING: unable to obtain file audio codec with ffprobe')
			return None

		more_opts = []
		if self._preferredcodec == 'best' or self._preferredcodec == filecodec:
			if filecodec == 'aac' or filecodec == 'mp3':
				# Lossless if possible
				acodec = 'copy'
				extension = filecodec
				if filecodec == 'aac':
					# Raw AAC needs an ADTS container to be playable.
					more_opts = ['-f', 'adts']
			else:
				# MP3 otherwise.
				acodec = 'libmp3lame'
				extension = 'mp3'
				more_opts = ['-ab', '128k']
		else:
			# We convert the audio (lossy)
			acodec = {'mp3': 'libmp3lame', 'aac': 'aac'}[self._preferredcodec]
			extension = self._preferredcodec
			more_opts = ['-ab', '128k']
			if self._preferredcodec == 'aac':
				more_opts += ['-f', 'adts']

		(prefix, ext) = os.path.splitext(path)
		new_path = prefix + '.' + extension
		self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path)
		status = self.run_ffmpeg(path, new_path, acodec, more_opts)

		if not status:
			self._downloader.to_stderr(u'WARNING: error running ffmpeg')
			return None

		try:
			os.remove(path)
		except (IOError, OSError):
			self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file')
			return None

		information['filepath'] = new_path
		return information
3312 def updateSelf(downloader, filename):
3313 ''' Update the program file with the latest version from the repository '''
3314 # Note: downloader only used for options
3315 if not os.access(filename, os.W_OK):
3316 sys.exit('ERROR
: no write permissions on
%s' % filename)
3318 downloader.to_screen('Updating to latest version
...')
3322 urlh = urllib.urlopen(UPDATE_URL)
3323 newcontent = urlh.read()
3326 except (IOError, OSError), err:
3327 sys.exit('ERROR
: unable to download latest version
')
3330 outf = open(filename, 'wb
')
3332 outf.write(newcontent)
3335 except (IOError, OSError), err:
3336 sys.exit('ERROR
: unable to overwrite current version
')
3338 downloader.to_screen('Updated youtube
-dl
. Restart to use the new version
.')
def parseOpts():
	''' Build the optparse parser and parse sys.argv.

	Returns (parser, opts, args).
	'''

	def _format_option_string(option):
		''' ('-o', '--option') -> -o, --format METAVAR'''

		opts = []

		if option._short_opts: opts.append(option._short_opts[0])
		if option._long_opts: opts.append(option._long_opts[0])
		if len(opts) > 1: opts.insert(1, ', ')

		if option.takes_value(): opts.append(' %s' % option.metavar)

		return "".join(opts)

	def _find_term_columns():
		''' Best-effort terminal width: $COLUMNS first, then `stty size`;
		None when neither is available. '''
		columns = os.environ.get('COLUMNS', None)
		if columns:
			return int(columns)

		try:
			sp = subprocess.Popen(['stty', 'size'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
			out,err = sp.communicate()
			return int(out.split()[1])
		# Narrowed from a bare except: stty may be missing (OSError) or
		# print nothing/garbage (IndexError/ValueError).
		except (OSError, ValueError, IndexError):
			pass
		return None

	max_width = 80
	max_help_position = 80

	# No need to wrap help messages if we're on a wide console
	columns = _find_term_columns()
	if columns: max_width = columns

	fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position)
	fmt.format_option_strings = _format_option_string

	kw = {
		'version'   : __version__,
		'formatter' : fmt,
		'usage' : '%prog [options] url...',
		'conflict_handler' : 'resolve',
	}

	parser = optparse.OptionParser(**kw)

	# option groups
	general        = optparse.OptionGroup(parser, 'General Options')
	authentication = optparse.OptionGroup(parser, 'Authentication Options')
	video_format   = optparse.OptionGroup(parser, 'Video Format Options')
	postproc       = optparse.OptionGroup(parser, 'Post-processing Options')
	filesystem     = optparse.OptionGroup(parser, 'Filesystem Options')
	verbosity      = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')

	general.add_option('-h', '--help',
			action='help', help='print this help text and exit')
	general.add_option('-v', '--version',
			action='version', help='print program version and exit')
	general.add_option('-U', '--update',
			action='store_true', dest='update_self', help='update this program to latest version')
	general.add_option('-i', '--ignore-errors',
			action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
	general.add_option('-r', '--rate-limit',
			dest='ratelimit', metavar='LIMIT', help='download rate limit (e.g. 50k or 44.6m)')
	general.add_option('-R', '--retries',
			dest='retries', metavar='RETRIES', help='number of retries (default is 10)', default=10)
	general.add_option('--playlist-start',
			dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1)
	general.add_option('--playlist-end',
			dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
	general.add_option('--dump-user-agent',
			action='store_true', dest='dump_user_agent',
			help='display the current browser identification', default=False)

	authentication.add_option('-u', '--username',
			dest='username', metavar='USERNAME', help='account username')
	authentication.add_option('-p', '--password',
			dest='password', metavar='PASSWORD', help='account password')
	authentication.add_option('-n', '--netrc',
			action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)

	video_format.add_option('-f', '--format',
			action='store', dest='format', metavar='FORMAT', help='video format code')
	video_format.add_option('--all-formats',
			action='store_const', dest='format', help='download all available video formats', const='-1')
	video_format.add_option('--max-quality',
			action='store', dest='format_limit', metavar='FORMAT', help='highest quality format to download')

	verbosity.add_option('-q', '--quiet',
			action='store_true', dest='quiet', help='activates quiet mode', default=False)
	verbosity.add_option('-s', '--simulate',
			action='store_true', dest='simulate', help='do not download video', default=False)
	verbosity.add_option('-g', '--get-url',
			action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
	verbosity.add_option('-e', '--get-title',
			action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
	verbosity.add_option('--get-thumbnail',
			action='store_true', dest='getthumbnail',
			help='simulate, quiet but print thumbnail URL', default=False)
	verbosity.add_option('--get-description',
			action='store_true', dest='getdescription',
			help='simulate, quiet but print video description', default=False)
	verbosity.add_option('--get-filename',
			action='store_true', dest='getfilename',
			help='simulate, quiet but print output filename', default=False)
	verbosity.add_option('--no-progress',
			action='store_true', dest='noprogress', help='do not print progress bar', default=False)
	verbosity.add_option('--console-title',
			action='store_true', dest='consoletitle',
			help='display progress in console titlebar', default=False)

	filesystem.add_option('-t', '--title',
			action='store_true', dest='usetitle', help='use title in file name', default=False)
	filesystem.add_option('-l', '--literal',
			action='store_true', dest='useliteral', help='use literal title in file name', default=False)
	filesystem.add_option('-A', '--auto-number',
			action='store_true', dest='autonumber',
			help='number downloaded files starting from 00000', default=False)
	filesystem.add_option('-o', '--output',
			dest='outtmpl', metavar='TEMPLATE', help='output filename template')
	filesystem.add_option('-a', '--batch-file',
			dest='batchfile', metavar='FILE', help='file containing URLs to download (\'-\' for stdin)')
	filesystem.add_option('-w', '--no-overwrites',
			action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
	filesystem.add_option('-c', '--continue',
			action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
	filesystem.add_option('--cookies',
			dest='cookiefile', metavar='FILE', help='file to dump cookie jar to')
	filesystem.add_option('--no-part',
			action='store_true', dest='nopart', help='do not use .part files', default=False)
	filesystem.add_option('--no-mtime',
			action='store_false', dest='updatetime',
			help='do not use the Last-modified header to set the file modification time', default=True)
	filesystem.add_option('--write-description',
			action='store_true', dest='writedescription',
			help='write video description to a .description file', default=False)
	filesystem.add_option('--write-info-json',
			action='store_true', dest='writeinfojson',
			help='write video metadata to a .info.json file', default=False)

	postproc.add_option('--extract-audio', action='store_true', dest='extractaudio', default=False,
			help='convert video files to audio-only files (requires ffmpeg and ffprobe)')
	postproc.add_option('--audio-format', metavar='FORMAT', dest='audioformat', default='best',
			help='"best", "aac" or "mp3"; best by default')

	parser.add_option_group(general)
	parser.add_option_group(filesystem)
	parser.add_option_group(verbosity)
	parser.add_option_group(video_format)
	parser.add_option_group(authentication)
	parser.add_option_group(postproc)

	(opts, args) = parser.parse_args()

	return (parser, opts, args)
3506 parser
, opts
, args
= parseOpts()
3508 # Open appropriate CookieJar
3509 if opts
.cookiefile
is None:
3510 jar
= cookielib
.CookieJar()
3513 jar
= cookielib
.MozillaCookieJar(opts
.cookiefile
)
3514 if os
.path
.isfile(opts
.cookiefile
) and os
.access(opts
.cookiefile
, os
.R_OK
):
3516 except (IOError, OSError), err
:
3517 sys
.exit(u
'ERROR: unable to open cookie file')
3520 if opts
.dump_user_agent
:
3521 print std_headers
['User-Agent']
3524 # General configuration
3525 cookie_processor
= urllib2
.HTTPCookieProcessor(jar
)
3526 opener
= urllib2
.build_opener(urllib2
.ProxyHandler(), cookie_processor
, YoutubeDLHandler())
3527 urllib2
.install_opener(opener
)
3528 socket
.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
3530 # Batch file verification
3532 if opts
.batchfile
is not None:
3534 if opts
.batchfile
== '-':
3537 batchfd
= open(opts
.batchfile
, 'r')
3538 batchurls
= batchfd
.readlines()
3539 batchurls
= [x
.strip() for x
in batchurls
]
3540 batchurls
= [x
for x
in batchurls
if len(x
) > 0 and not re
.search(r
'^[#/;]', x
)]
3542 sys
.exit(u
'ERROR: batch file could not be read')
3543 all_urls
= batchurls
+ args
3545 # Conflicting, missing and erroneous options
3546 if opts
.usenetrc
and (opts
.username
is not None or opts
.password
is not None):
3547 parser
.error(u
'using .netrc conflicts with giving username/password')
3548 if opts
.password
is not None and opts
.username
is None:
3549 parser
.error(u
'account username missing')
3550 if opts
.outtmpl
is not None and (opts
.useliteral
or opts
.usetitle
or opts
.autonumber
):
3551 parser
.error(u
'using output template conflicts with using title, literal title or auto number')
3552 if opts
.usetitle
and opts
.useliteral
:
3553 parser
.error(u
'using title conflicts with using literal title')
3554 if opts
.username
is not None and opts
.password
is None:
3555 opts
.password
= getpass
.getpass(u
'Type account password and press return:')
3556 if opts
.ratelimit
is not None:
3557 numeric_limit
= FileDownloader
.parse_bytes(opts
.ratelimit
)
3558 if numeric_limit
is None:
3559 parser
.error(u
'invalid rate limit specified')
3560 opts
.ratelimit
= numeric_limit
3561 if opts
.retries
is not None:
3563 opts
.retries
= long(opts
.retries
)
3564 except (TypeError, ValueError), err
:
3565 parser
.error(u
'invalid retry count specified')
3567 opts
.playliststart
= int(opts
.playliststart
)
3568 if opts
.playliststart
<= 0:
3569 raise ValueError(u
'Playlist start must be positive')
3570 except (TypeError, ValueError), err
:
3571 parser
.error(u
'invalid playlist start number specified')
3573 opts
.playlistend
= int(opts
.playlistend
)
3574 if opts
.playlistend
!= -1 and (opts
.playlistend
<= 0 or opts
.playlistend
< opts
.playliststart
):
3575 raise ValueError(u
'Playlist end must be greater than playlist start')
3576 except (TypeError, ValueError), err
:
3577 parser
.error(u
'invalid playlist end number specified')
3578 if opts
.extractaudio
:
3579 if opts
.audioformat
not in ['best', 'aac', 'mp3']:
3580 parser
.error(u
'invalid audio format specified')
3582 # Information extractors
3583 youtube_ie
= YoutubeIE()
3584 metacafe_ie
= MetacafeIE(youtube_ie
)
3585 dailymotion_ie
= DailymotionIE()
3586 youtube_pl_ie
= YoutubePlaylistIE(youtube_ie
)
3587 youtube_user_ie
= YoutubeUserIE(youtube_ie
)
3588 youtube_search_ie
= YoutubeSearchIE(youtube_ie
)
3589 google_ie
= GoogleIE()
3590 google_search_ie
= GoogleSearchIE(google_ie
)
3591 photobucket_ie
= PhotobucketIE()
3592 yahoo_ie
= YahooIE()
3593 yahoo_search_ie
= YahooSearchIE(yahoo_ie
)
3594 deposit_files_ie
= DepositFilesIE()
3595 facebook_ie
= FacebookIE()
3596 bliptv_ie
= BlipTVIE()
3597 vimeo_ie
= VimeoIE()
3598 myvideo_ie
= MyVideoIE()
3599 comedycentral_ie
= ComedyCentralIE()
3601 generic_ie
= GenericIE()
3604 fd
= FileDownloader({
3605 'usenetrc': opts
.usenetrc
,
3606 'username': opts
.username
,
3607 'password': opts
.password
,
3608 'quiet': (opts
.quiet
or opts
.geturl
or opts
.gettitle
or opts
.getthumbnail
or opts
.getdescription
or opts
.getfilename
),
3609 'forceurl': opts
.geturl
,
3610 'forcetitle': opts
.gettitle
,
3611 'forcethumbnail': opts
.getthumbnail
,
3612 'forcedescription': opts
.getdescription
,
3613 'forcefilename': opts
.getfilename
,
3614 'simulate': (opts
.simulate
or opts
.geturl
or opts
.gettitle
or opts
.getthumbnail
or opts
.getdescription
or opts
.getfilename
),
3615 'format': opts
.format
,
3616 'format_limit': opts
.format_limit
,
3617 'outtmpl': ((opts
.outtmpl
is not None and opts
.outtmpl
.decode(preferredencoding()))
3618 or (opts
.format
== '-1' and opts
.usetitle
and u
'%(stitle)s-%(id)s-%(format)s.%(ext)s')
3619 or (opts
.format
== '-1' and opts
.useliteral
and u
'%(title)s-%(id)s-%(format)s.%(ext)s')
3620 or (opts
.format
== '-1' and u
'%(id)s-%(format)s.%(ext)s')
3621 or (opts
.usetitle
and opts
.autonumber
and u
'%(autonumber)s-%(stitle)s-%(id)s.%(ext)s')
3622 or (opts
.useliteral
and opts
.autonumber
and u
'%(autonumber)s-%(title)s-%(id)s.%(ext)s')
3623 or (opts
.usetitle
and u
'%(stitle)s-%(id)s.%(ext)s')
3624 or (opts
.useliteral
and u
'%(title)s-%(id)s.%(ext)s')
3625 or (opts
.autonumber
and u
'%(autonumber)s-%(id)s.%(ext)s')
3626 or u
'%(id)s.%(ext)s'),
3627 'ignoreerrors': opts
.ignoreerrors
,
3628 'ratelimit': opts
.ratelimit
,
3629 'nooverwrites': opts
.nooverwrites
,
3630 'retries': opts
.retries
,
3631 'continuedl': opts
.continue_dl
,
3632 'noprogress': opts
.noprogress
,
3633 'playliststart': opts
.playliststart
,
3634 'playlistend': opts
.playlistend
,
3635 'logtostderr': opts
.outtmpl
== '-',
3636 'consoletitle': opts
.consoletitle
,
3637 'nopart': opts
.nopart
,
3638 'updatetime': opts
.updatetime
,
3639 'writedescription': opts
.writedescription
,
3640 'writeinfojson': opts
.writeinfojson
,
3642 fd
.add_info_extractor(youtube_search_ie
)
3643 fd
.add_info_extractor(youtube_pl_ie
)
3644 fd
.add_info_extractor(youtube_user_ie
)
3645 fd
.add_info_extractor(metacafe_ie
)
3646 fd
.add_info_extractor(dailymotion_ie
)
3647 fd
.add_info_extractor(youtube_ie
)
3648 fd
.add_info_extractor(google_ie
)
3649 fd
.add_info_extractor(google_search_ie
)
3650 fd
.add_info_extractor(photobucket_ie
)
3651 fd
.add_info_extractor(yahoo_ie
)
3652 fd
.add_info_extractor(yahoo_search_ie
)
3653 fd
.add_info_extractor(deposit_files_ie
)
3654 fd
.add_info_extractor(facebook_ie
)
3655 fd
.add_info_extractor(bliptv_ie
)
3656 fd
.add_info_extractor(vimeo_ie
)
3657 fd
.add_info_extractor(myvideo_ie
)
3658 fd
.add_info_extractor(comedycentral_ie
)
3660 # This must come last since it's the
3661 # fallback if none of the others work
3662 fd
.add_info_extractor(generic_ie
)
3665 if opts
.extractaudio
:
3666 fd
.add_post_processor(FFmpegExtractAudioPP(preferredcodec
=opts
.audioformat
))
3669 if opts
.update_self
:
3670 updateSelf(fd
, sys
.argv
[0])
3673 if len(all_urls
) < 1:
3674 if not opts
.update_self
:
3675 parser
.error(u
'you must provide at least one URL')
3678 retcode
= fd
.download(all_urls
)
3680 # Dump cookie jar if requested
3681 if opts
.cookiefile
is not None:
3684 except (IOError, OSError), err
:
3685 sys
.exit(u
'ERROR: unable to save cookie jar')
if __name__ == '__main__':
	try:
		main()
	except DownloadError:
		# Individual download failures were already reported; just signal
		# an unsuccessful exit status.
		sys.exit(1)
	except SameFileError:
		sys.exit(u'ERROR: fixed output name but more than one file to download')
	except KeyboardInterrupt:
		sys.exit(u'\nERROR: Interrupted by user')
3700 # vim: set ts=4 sw=4 sts=4 noet ai si filetype=python: