2 # -*- coding: utf-8 -*-
15 import xml
.etree
.ElementTree
16 from urlparse
import parse_qs
19 import cStringIO
as StringIO
26 class InfoExtractor(object):
27 """Information Extractor class.
29 Information extractors are the classes that, given a URL, extract
30 information from the video (or videos) the URL refers to. This
31 information includes the real video URL, the video title and simplified
32 title, author and others. The information is stored in a dictionary
33 which is then passed to the FileDownloader. The FileDownloader
34 processes this information possibly downloading the video to the file
35 system, among other possible outcomes. The dictionaries must include
40 uploader: Nickname of the video uploader.
42 ext: Video filename extension.
44 player_url: SWF Player URL (may be None).
46 The following fields are optional. Their primary purpose is to allow
47 youtube-dl to serve as the backend for a video search function, such
48 as the one in youtube2mp3. They are only used when their respective
49 forced printing functions are called:
51 thumbnail: Full URL to a video thumbnail image.
52 description: One-line video description.
54 Subclasses of this one should re-define the _real_initialize() and
55 _real_extract() methods and define a _VALID_URL regexp.
56 Probably, they should also be added to the list of extractors.
62 def __init__(self
, downloader
=None):
63 """Constructor. Receives an optional downloader."""
65 self
.set_downloader(downloader
)
def suitable(self, url):
    """Tell whether this extractor can handle the given URL.

    Args:
        url: URL string to test.

    Returns:
        True if ``url`` matches ``self._VALID_URL`` from its first
        character (``re.match`` semantics), False otherwise.
    """
    return re.match(self._VALID_URL, url) is not None
72 """Initializes an instance (authentication, etc)."""
74 self
._real
_initialize
()
77 def extract(self
, url
):
78 """Extracts URL information and returns it in list of dicts."""
80 return self
._real
_extract
(url
)
def set_downloader(self, downloader):
    """Set the downloader for this IE.

    Args:
        downloader: Object stored on ``self._downloader``; the reporting
            methods in this file call ``to_screen`` on it.
    """
    self._downloader = downloader
def _real_initialize(self):
    """Real initialization process. Redefine in subclasses."""
    # Intentionally a no-op in the base class.
    pass
def _real_extract(self, url):
    """Real extraction process. Redefine in subclasses.

    Args:
        url: URL to extract information from (unused in the base class).
    """
    # Intentionally a no-op in the base class.
    pass
95 class YoutubeIE(InfoExtractor
):
96 """Information extractor for youtube.com."""
98 _VALID_URL
= r
'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|tube.majestyc.net/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$'
99 _LANG_URL
= r
'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
100 _LOGIN_URL
= 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
101 _AGE_URL
= 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
102 _NEXT_URL_RE
= r
'[\?&]next_url=([^&]+)'
103 _NETRC_MACHINE
= 'youtube'
104 # Listed in order of quality
105 _available_formats
= ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
106 _available_formats_prefer_free
= ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
107 _video_extensions
= {
113 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
119 _video_dimensions
= {
def report_lang(self):
    """Report attempt to set language (message goes to the downloader's to_screen)."""
    self._downloader.to_screen(u'[youtube] Setting language')
def report_login(self):
    """Report attempt to log in (message goes to the downloader's to_screen)."""
    self._downloader.to_screen(u'[youtube] Logging in')
def report_age_confirmation(self):
    """Report attempt to confirm age (message goes to the downloader's to_screen)."""
    self._downloader.to_screen(u'[youtube] Confirming age')
def report_video_webpage_download(self, video_id):
    """Report attempt to download video webpage.

    Args:
        video_id: Identifier interpolated into the progress message.
    """
    self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
def report_video_info_webpage_download(self, video_id):
    """Report attempt to download video info webpage.

    Args:
        video_id: Identifier interpolated into the progress message.
    """
    self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
def report_video_subtitles_download(self, video_id):
    """Report attempt to download video subtitles.

    Args:
        video_id: Identifier interpolated into the progress message.
    """
    self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
def report_information_extraction(self, video_id):
    """Report attempt to extract video information.

    Args:
        video_id: Identifier interpolated into the progress message.
    """
    self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
def report_unavailable_format(self, video_id, format):
    """Report that a format is not available for a video.

    Args:
        video_id: Identifier interpolated into the message.
        format: Format code interpolated into the message. The name
            shadows the builtin but is kept for keyword-argument callers.
    """
    self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
def report_rtmp_download(self):
    """Indicate the download will use the RTMP protocol."""
    self._downloader.to_screen(u'[youtube] RTMP download detected')
def _closed_captions_xml_to_srt(self, xml_string):
    """Convert timed-text XML into SubRip (SRT) formatted text.

    Every ``<text start="..." dur="...">caption</text>`` element found by
    the regular expression below becomes one numbered SRT cue. A missing
    ``dur`` attribute is treated as 4 seconds.

    Args:
        xml_string: The captions XML document as a string.

    Returns:
        The concatenated SRT cues; an empty string when nothing matches.
    """
    # TODO parse xml instead of regex
    texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)

    def timestamp(seconds):
        # Work in whole milliseconds so float representation error
        # (e.g. 1.15 % 1 * 1000 == 149.99...) cannot drop a millisecond.
        total_seconds, millis = divmod(int(round(seconds * 1000)), 1000)
        minutes, secs = divmod(total_seconds, 60)
        hours, minutes = divmod(minutes, 60)
        return "%02i:%02i:%02i,%03i" % (hours, minutes, secs, millis)

    cues = []
    for n, (start, dur_tag, dur, caption) in enumerate(texts):
        if not dur:
            dur = '4'
        # The regex yields strings; convert before doing arithmetic.
        begin = float(start)
        end = begin + float(dur)
        caption = unescapeHTML(caption)
        caption = unescapeHTML(caption)  # double cycle, intentional
        cues.append(str(n + 1) + '\n')
        cues.append(timestamp(begin) + ' --> ' + timestamp(end) + '\n')
        cues.append(caption + '\n\n')
    return ''.join(cues)
def _print_formats(self, formats):
    """Print one line per format code with its extension and dimensions.

    Args:
        formats: Iterable of format codes. Codes missing from
            ``self._video_extensions`` print as 'flv'; codes missing from
            ``self._video_dimensions`` print as '???'.
    """
    # Parenthesised single-argument print behaves the same as the Python 2
    # print statement and also parses under Python 3.
    print('Available formats:')
    for x in formats:
        print('%s\t:\t%s\t[%s]' % (x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
195 def _real_initialize(self
):
196 if self
._downloader
is None:
201 downloader_params
= self
._downloader
.params
203 # Attempt to use provided username and password or .netrc data
204 if downloader_params
.get('username', None) is not None:
205 username
= downloader_params
['username']
206 password
= downloader_params
['password']
207 elif downloader_params
.get('usenetrc', False):
209 info
= netrc
.netrc().authenticators(self
._NETRC
_MACHINE
)
214 raise netrc
.NetrcParseError('No authenticators for %s' % self
._NETRC
_MACHINE
)
215 except (IOError, netrc
.NetrcParseError
), err
:
216 self
._downloader
.to_stderr(u
'WARNING: parsing .netrc: %s' % str(err
))
220 request
= urllib2
.Request(self
._LANG
_URL
)
223 urllib2
.urlopen(request
).read()
224 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
225 self
._downloader
.to_stderr(u
'WARNING: unable to set language: %s' % str(err
))
228 # No authentication to be performed
234 'current_form': 'loginForm',
236 'action_login': 'Log In',
237 'username': username
,
238 'password': password
,
240 request
= urllib2
.Request(self
._LOGIN
_URL
, urllib
.urlencode(login_form
))
243 login_results
= urllib2
.urlopen(request
).read()
244 if re
.search(r
'(?i)<form[^>]* name="loginForm"', login_results
) is not None:
245 self
._downloader
.to_stderr(u
'WARNING: unable to log in: bad username or password')
247 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
248 self
._downloader
.to_stderr(u
'WARNING: unable to log in: %s' % str(err
))
254 'action_confirm': 'Confirm',
256 request
= urllib2
.Request(self
._AGE
_URL
, urllib
.urlencode(age_form
))
258 self
.report_age_confirmation()
259 age_results
= urllib2
.urlopen(request
).read()
260 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
261 self
._downloader
.trouble(u
'ERROR: unable to confirm age: %s' % str(err
))
264 def _real_extract(self
, url
):
265 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
266 mobj
= re
.search(self
._NEXT
_URL
_RE
, url
)
268 url
= 'http://www.youtube.com/' + urllib
.unquote(mobj
.group(1)).lstrip('/')
270 # Extract video id from URL
271 mobj
= re
.match(self
._VALID
_URL
, url
)
273 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
275 video_id
= mobj
.group(2)
278 self
.report_video_webpage_download(video_id
)
279 request
= urllib2
.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
)
281 video_webpage
= urllib2
.urlopen(request
).read()
282 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
283 self
._downloader
.trouble(u
'ERROR: unable to download video webpage: %s' % str(err
))
286 # Attempt to extract SWF player URL
287 mobj
= re
.search(r
'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage
)
289 player_url
= re
.sub(r
'\\(.)', r
'\1', mobj
.group(1))
294 self
.report_video_info_webpage_download(video_id
)
295 for el_type
in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
296 video_info_url
= ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
297 % (video_id
, el_type
))
298 request
= urllib2
.Request(video_info_url
)
300 video_info_webpage
= urllib2
.urlopen(request
).read()
301 video_info
= parse_qs(video_info_webpage
)
302 if 'token' in video_info
:
304 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
305 self
._downloader
.trouble(u
'ERROR: unable to download video info webpage: %s' % str(err
))
307 if 'token' not in video_info
:
308 if 'reason' in video_info
:
309 self
._downloader
.trouble(u
'ERROR: YouTube said: %s' % video_info
['reason'][0].decode('utf-8'))
311 self
._downloader
.trouble(u
'ERROR: "token" parameter not in video info for unknown reason')
314 # Check for "rental" videos
315 if 'ypc_video_rental_bar_text' in video_info
and 'author' not in video_info
:
316 self
._downloader
.trouble(u
'ERROR: "rental" videos not supported')
319 # Start extracting information
320 self
.report_information_extraction(video_id
)
323 if 'author' not in video_info
:
324 self
._downloader
.trouble(u
'ERROR: unable to extract uploader nickname')
326 video_uploader
= urllib
.unquote_plus(video_info
['author'][0])
329 if 'title' not in video_info
:
330 self
._downloader
.trouble(u
'ERROR: unable to extract video title')
332 video_title
= urllib
.unquote_plus(video_info
['title'][0])
333 video_title
= video_title
.decode('utf-8')
336 if 'thumbnail_url' not in video_info
:
337 self
._downloader
.trouble(u
'WARNING: unable to extract video thumbnail')
339 else: # don't panic if we can't find it
340 video_thumbnail
= urllib
.unquote_plus(video_info
['thumbnail_url'][0])
344 mobj
= re
.search(r
'id="eow-date.*?>(.*?)</span>', video_webpage
, re
.DOTALL
)
346 upload_date
= ' '.join(re
.sub(r
'[/,-]', r
' ', mobj
.group(1)).split())
347 format_expressions
= ['%d %B %Y', '%B %d %Y', '%b %d %Y']
348 for expression
in format_expressions
:
350 upload_date
= datetime
.datetime
.strptime(upload_date
, expression
).strftime('%Y%m%d')
355 video_description
= get_element_by_id("eow-description", video_webpage
.decode('utf8'))
356 if video_description
: video_description
= clean_html(video_description
)
357 else: video_description
= ''
360 video_subtitles
= None
361 if self
._downloader
.params
.get('writesubtitles', False):
363 self
.report_video_subtitles_download(video_id
)
364 request
= urllib2
.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
)
366 srt_list
= urllib2
.urlopen(request
).read()
367 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
368 raise Trouble(u
'WARNING: unable to download video subtitles: %s' % str(err
))
369 srt_lang_list
= re
.findall(r
'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list
)
370 srt_lang_list
= dict((l
[1], l
[0]) for l
in srt_lang_list
)
371 if not srt_lang_list
:
372 raise Trouble(u
'WARNING: video has no closed captions')
373 if self
._downloader
.params
.get('subtitleslang', False):
374 srt_lang
= self
._downloader
.params
.get('subtitleslang')
375 elif 'en' in srt_lang_list
:
378 srt_lang
= srt_lang_list
.keys()[0]
379 if not srt_lang
in srt_lang_list
:
380 raise Trouble(u
'WARNING: no closed captions found in the specified language')
381 request
= urllib2
.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang
, srt_lang_list
[srt_lang
], video_id
))
383 srt_xml
= urllib2
.urlopen(request
).read()
384 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
385 raise Trouble(u
'WARNING: unable to download video subtitles: %s' % str(err
))
387 raise Trouble(u
'WARNING: unable to download video subtitles')
388 video_subtitles
= self
._closed
_captions
_xml
_to
_srt
(srt_xml
.decode('utf-8'))
389 except Trouble
as trouble
:
390 self
._downloader
.trouble(trouble
[0])
393 video_token
= urllib
.unquote_plus(video_info
['token'][0])
395 # Decide which formats to download
396 req_format
= self
._downloader
.params
.get('format', None)
398 if 'conn' in video_info
and video_info
['conn'][0].startswith('rtmp'):
399 self
.report_rtmp_download()
400 video_url_list
= [(None, video_info
['conn'][0])]
401 elif 'url_encoded_fmt_stream_map' in video_info
and len(video_info
['url_encoded_fmt_stream_map']) >= 1:
402 url_data_strs
= video_info
['url_encoded_fmt_stream_map'][0].split(',')
403 url_data
= [parse_qs(uds
) for uds
in url_data_strs
]
404 url_data
= filter(lambda ud
: 'itag' in ud
and 'url' in ud
, url_data
)
405 url_map
= dict((ud
['itag'][0], ud
['url'][0] + '&signature=' + ud
['sig'][0]) for ud
in url_data
)
407 format_limit
= self
._downloader
.params
.get('format_limit', None)
408 available_formats
= self
._available
_formats
_prefer
_free
if self
._downloader
.params
.get('prefer_free_formats', False) else self
._available
_formats
409 if format_limit
is not None and format_limit
in available_formats
:
410 format_list
= available_formats
[available_formats
.index(format_limit
):]
412 format_list
= available_formats
413 existing_formats
= [x
for x
in format_list
if x
in url_map
]
414 if len(existing_formats
) == 0:
415 self
._downloader
.trouble(u
'ERROR: no known formats available for video')
417 if self
._downloader
.params
.get('listformats', None):
418 self
._print
_formats
(existing_formats
)
420 if req_format
is None or req_format
== 'best':
421 video_url_list
= [(existing_formats
[0], url_map
[existing_formats
[0]])] # Best quality
422 elif req_format
== 'worst':
423 video_url_list
= [(existing_formats
[len(existing_formats
)-1], url_map
[existing_formats
[len(existing_formats
)-1]])] # worst quality
424 elif req_format
in ('-1', 'all'):
425 video_url_list
= [(f
, url_map
[f
]) for f
in existing_formats
] # All formats
427 # Specific formats. We pick the first in a slash-delimited sequence.
428 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
429 req_formats
= req_format
.split('/')
430 video_url_list
= None
431 for rf
in req_formats
:
433 video_url_list
= [(rf
, url_map
[rf
])]
435 if video_url_list
is None:
436 self
._downloader
.trouble(u
'ERROR: requested format not available')
439 self
._downloader
.trouble(u
'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
443 for format_param
, video_real_url
in video_url_list
:
445 video_extension
= self
._video
_extensions
.get(format_param
, 'flv')
448 'id': video_id
.decode('utf-8'),
449 'url': video_real_url
.decode('utf-8'),
450 'uploader': video_uploader
.decode('utf-8'),
451 'upload_date': upload_date
,
452 'title': video_title
,
453 'ext': video_extension
.decode('utf-8'),
454 'format': (format_param
is None and u
'NA' or format_param
.decode('utf-8')),
455 'thumbnail': video_thumbnail
.decode('utf-8'),
456 'description': video_description
,
457 'player_url': player_url
,
458 'subtitles': video_subtitles
463 class MetacafeIE(InfoExtractor
):
464 """Information Extractor for metacafe.com."""
466 _VALID_URL
= r
'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
467 _DISCLAIMER
= 'http://www.metacafe.com/family_filter/'
468 _FILTER_POST
= 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
469 IE_NAME
= u
'metacafe'
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader and passes it to InfoExtractor.__init__."""
    InfoExtractor.__init__(self, downloader)
def report_disclaimer(self):
    """Report disclaimer retrieval (message goes to the downloader's to_screen)."""
    self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
def report_age_confirmation(self):
    """Report attempt to confirm age (message goes to the downloader's to_screen)."""
    self._downloader.to_screen(u'[metacafe] Confirming age')
def report_download_webpage(self, video_id):
    """Report webpage download.

    Args:
        video_id: Identifier interpolated into the progress message.
    """
    self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
def report_extraction(self, video_id):
    """Report information extraction.

    Args:
        video_id: Identifier interpolated into the progress message.
    """
    self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
490 def _real_initialize(self
):
491 # Retrieve disclaimer
492 request
= urllib2
.Request(self
._DISCLAIMER
)
494 self
.report_disclaimer()
495 disclaimer
= urllib2
.urlopen(request
).read()
496 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
497 self
._downloader
.trouble(u
'ERROR: unable to retrieve disclaimer: %s' % str(err
))
503 'submit': "Continue - I'm over 18",
505 request
= urllib2
.Request(self
._FILTER
_POST
, urllib
.urlencode(disclaimer_form
))
507 self
.report_age_confirmation()
508 disclaimer
= urllib2
.urlopen(request
).read()
509 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
510 self
._downloader
.trouble(u
'ERROR: unable to confirm age: %s' % str(err
))
513 def _real_extract(self
, url
):
514 # Extract id and simplified title from URL
515 mobj
= re
.match(self
._VALID
_URL
, url
)
517 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
520 video_id
= mobj
.group(1)
522 # Check if video comes from YouTube
523 mobj2
= re
.match(r
'^yt-(.*)$', video_id
)
524 if mobj2
is not None:
525 self
._downloader
.download(['http://www.youtube.com/watch?v=%s' % mobj2
.group(1)])
528 # Retrieve video webpage to extract further information
529 request
= urllib2
.Request('http://www.metacafe.com/watch/%s/' % video_id
)
531 self
.report_download_webpage(video_id
)
532 webpage
= urllib2
.urlopen(request
).read()
533 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
534 self
._downloader
.trouble(u
'ERROR: unable retrieve video webpage: %s' % str(err
))
537 # Extract URL, uploader and title from webpage
538 self
.report_extraction(video_id
)
539 mobj
= re
.search(r
'(?m)&mediaURL=([^&]+)', webpage
)
541 mediaURL
= urllib
.unquote(mobj
.group(1))
542 video_extension
= mediaURL
[-3:]
544 # Extract gdaKey if available
545 mobj
= re
.search(r
'(?m)&gdaKey=(.*?)&', webpage
)
549 gdaKey
= mobj
.group(1)
550 video_url
= '%s?__gda__=%s' % (mediaURL
, gdaKey
)
552 mobj
= re
.search(r
' name="flashvars" value="(.*?)"', webpage
)
554 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
556 vardict
= parse_qs(mobj
.group(1))
557 if 'mediaData' not in vardict
:
558 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
560 mobj
= re
.search(r
'"mediaURL":"(http.*?)","key":"(.*?)"', vardict
['mediaData'][0])
562 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
564 mediaURL
= mobj
.group(1).replace('\\/', '/')
565 video_extension
= mediaURL
[-3:]
566 video_url
= '%s?__gda__=%s' % (mediaURL
, mobj
.group(2))
568 mobj
= re
.search(r
'(?im)<title>(.*) - Video</title>', webpage
)
570 self
._downloader
.trouble(u
'ERROR: unable to extract title')
572 video_title
= mobj
.group(1).decode('utf-8')
574 mobj
= re
.search(r
'(?ms)By:\s*<a .*?>(.+?)<', webpage
)
576 self
._downloader
.trouble(u
'ERROR: unable to extract uploader nickname')
578 video_uploader
= mobj
.group(1)
581 'id': video_id
.decode('utf-8'),
582 'url': video_url
.decode('utf-8'),
583 'uploader': video_uploader
.decode('utf-8'),
584 'upload_date': u
'NA',
585 'title': video_title
,
586 'ext': video_extension
.decode('utf-8'),
592 class DailymotionIE(InfoExtractor
):
593 """Information Extractor for Dailymotion"""
595 _VALID_URL
= r
'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)'
596 IE_NAME
= u
'dailymotion'
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader and passes it to InfoExtractor.__init__."""
    InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Report webpage download.

    Args:
        video_id: Identifier interpolated into the progress message.
    """
    self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
def report_extraction(self, video_id):
    """Report information extraction.

    Args:
        video_id: Identifier interpolated into the progress message.
    """
    self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
609 def _real_extract(self
, url
):
610 # Extract id and simplified title from URL
611 mobj
= re
.match(self
._VALID
_URL
, url
)
613 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
616 video_id
= mobj
.group(1)
618 video_extension
= 'flv'
620 # Retrieve video webpage to extract further information
621 request
= urllib2
.Request(url
)
622 request
.add_header('Cookie', 'family_filter=off')
624 self
.report_download_webpage(video_id
)
625 webpage
= urllib2
.urlopen(request
).read()
626 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
627 self
._downloader
.trouble(u
'ERROR: unable retrieve video webpage: %s' % str(err
))
630 # Extract URL, uploader and title from webpage
631 self
.report_extraction(video_id
)
632 mobj
= re
.search(r
'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage
)
634 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
636 sequence
= urllib
.unquote(mobj
.group(1))
637 mobj
= re
.search(r
',\"sdURL\"\:\"([^\"]+?)\",', sequence
)
639 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
641 mediaURL
= urllib
.unquote(mobj
.group(1)).replace('\\', '')
643 # if needed add http://www.dailymotion.com/ if relative URL
647 mobj
= re
.search(r
'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage
)
649 self
._downloader
.trouble(u
'ERROR: unable to extract title')
651 video_title
= unescapeHTML(mobj
.group('title').decode('utf-8'))
653 mobj
= re
.search(r
'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage
)
655 self
._downloader
.trouble(u
'ERROR: unable to extract uploader nickname')
657 video_uploader
= mobj
.group(1)
660 'id': video_id
.decode('utf-8'),
661 'url': video_url
.decode('utf-8'),
662 'uploader': video_uploader
.decode('utf-8'),
663 'upload_date': u
'NA',
664 'title': video_title
,
665 'ext': video_extension
.decode('utf-8'),
671 class GoogleIE(InfoExtractor
):
672 """Information extractor for video.google.com."""
674 _VALID_URL
= r
'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
675 IE_NAME
= u
'video.google'
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader and passes it to InfoExtractor.__init__."""
    InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Report webpage download.

    Args:
        video_id: Identifier interpolated into the progress message.
    """
    self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
def report_extraction(self, video_id):
    """Report information extraction.

    Args:
        video_id: Identifier interpolated into the progress message.
    """
    self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
688 def _real_extract(self
, url
):
689 # Extract id from URL
690 mobj
= re
.match(self
._VALID
_URL
, url
)
692 self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
)
695 video_id
= mobj
.group(1)
697 video_extension
= 'mp4'
699 # Retrieve video webpage to extract further information
700 request
= urllib2
.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id
)
702 self
.report_download_webpage(video_id
)
703 webpage
= urllib2
.urlopen(request
).read()
704 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
705 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
708 # Extract URL, uploader, and title from webpage
709 self
.report_extraction(video_id
)
710 mobj
= re
.search(r
"download_url:'([^']+)'", webpage
)
712 video_extension
= 'flv'
713 mobj
= re
.search(r
"(?i)videoUrl\\x3d(.+?)\\x26", webpage
)
715 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
717 mediaURL
= urllib
.unquote(mobj
.group(1))
718 mediaURL
= mediaURL
.replace('\\x3d', '\x3d')
719 mediaURL
= mediaURL
.replace('\\x26', '\x26')
723 mobj
= re
.search(r
'<title>(.*)</title>', webpage
)
725 self
._downloader
.trouble(u
'ERROR: unable to extract title')
727 video_title
= mobj
.group(1).decode('utf-8')
729 # Extract video description
730 mobj
= re
.search(r
'<span id=short-desc-content>([^<]*)</span>', webpage
)
732 self
._downloader
.trouble(u
'ERROR: unable to extract video description')
734 video_description
= mobj
.group(1).decode('utf-8')
735 if not video_description
:
736 video_description
= 'No description available.'
738 # Extract video thumbnail
739 if self
._downloader
.params
.get('forcethumbnail', False):
740 request
= urllib2
.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id
)))
742 webpage
= urllib2
.urlopen(request
).read()
743 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
744 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
746 mobj
= re
.search(r
'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage
)
748 self
._downloader
.trouble(u
'ERROR: unable to extract video thumbnail')
750 video_thumbnail
= mobj
.group(1)
751 else: # we need something to pass to process_info
755 'id': video_id
.decode('utf-8'),
756 'url': video_url
.decode('utf-8'),
758 'upload_date': u
'NA',
759 'title': video_title
,
760 'ext': video_extension
.decode('utf-8'),
766 class PhotobucketIE(InfoExtractor
):
767 """Information extractor for photobucket.com."""
769 _VALID_URL
= r
'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
770 IE_NAME
= u
'photobucket'
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader and passes it to InfoExtractor.__init__."""
    InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Report webpage download.

    Args:
        video_id: Identifier interpolated into the progress message.
    """
    self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
def report_extraction(self, video_id):
    """Report information extraction.

    Args:
        video_id: Identifier interpolated into the progress message.
    """
    self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
783 def _real_extract(self
, url
):
784 # Extract id from URL
785 mobj
= re
.match(self
._VALID
_URL
, url
)
787 self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
)
790 video_id
= mobj
.group(1)
792 video_extension
= 'flv'
794 # Retrieve video webpage to extract further information
795 request
= urllib2
.Request(url
)
797 self
.report_download_webpage(video_id
)
798 webpage
= urllib2
.urlopen(request
).read()
799 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
800 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
803 # Extract URL, uploader, and title from webpage
804 self
.report_extraction(video_id
)
805 mobj
= re
.search(r
'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage
)
807 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
809 mediaURL
= urllib
.unquote(mobj
.group(1))
813 mobj
= re
.search(r
'<title>(.*) video by (.*) - Photobucket</title>', webpage
)
815 self
._downloader
.trouble(u
'ERROR: unable to extract title')
817 video_title
= mobj
.group(1).decode('utf-8')
819 video_uploader
= mobj
.group(2).decode('utf-8')
822 'id': video_id
.decode('utf-8'),
823 'url': video_url
.decode('utf-8'),
824 'uploader': video_uploader
,
825 'upload_date': u
'NA',
826 'title': video_title
,
827 'ext': video_extension
.decode('utf-8'),
833 class YahooIE(InfoExtractor
):
834 """Information extractor for video.yahoo.com."""
836 # _VALID_URL matches all Yahoo! Video URLs
837 # _VPAGE_URL matches only the extractable '/watch/' URLs
838 _VALID_URL
= r
'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
839 _VPAGE_URL
= r
'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
840 IE_NAME
= u
'video.yahoo'
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader and passes it to InfoExtractor.__init__."""
    InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, video_id):
    """Report webpage download.

    Args:
        video_id: Identifier interpolated into the progress message.
    """
    self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
def report_extraction(self, video_id):
    """Report information extraction.

    Args:
        video_id: Identifier interpolated into the progress message.
    """
    self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
853 def _real_extract(self
, url
, new_video
=True):
854 # Extract ID from URL
855 mobj
= re
.match(self
._VALID
_URL
, url
)
857 self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
)
860 video_id
= mobj
.group(2)
861 video_extension
= 'flv'
863 # Rewrite valid but non-extractable URLs as
864 # extractable English language /watch/ URLs
865 if re
.match(self
._VPAGE
_URL
, url
) is None:
866 request
= urllib2
.Request(url
)
868 webpage
= urllib2
.urlopen(request
).read()
869 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
870 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
873 mobj
= re
.search(r
'\("id", "([0-9]+)"\);', webpage
)
875 self
._downloader
.trouble(u
'ERROR: Unable to extract id field')
877 yahoo_id
= mobj
.group(1)
879 mobj
= re
.search(r
'\("vid", "([0-9]+)"\);', webpage
)
881 self
._downloader
.trouble(u
'ERROR: Unable to extract vid field')
883 yahoo_vid
= mobj
.group(1)
885 url
= 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid
, yahoo_id
)
886 return self
._real
_extract
(url
, new_video
=False)
888 # Retrieve video webpage to extract further information
889 request
= urllib2
.Request(url
)
891 self
.report_download_webpage(video_id
)
892 webpage
= urllib2
.urlopen(request
).read()
893 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
894 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
897 # Extract uploader and title from webpage
898 self
.report_extraction(video_id
)
899 mobj
= re
.search(r
'<meta name="title" content="(.*)" />', webpage
)
901 self
._downloader
.trouble(u
'ERROR: unable to extract video title')
903 video_title
= mobj
.group(1).decode('utf-8')
905 mobj
= re
.search(r
'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage
)
907 self
._downloader
.trouble(u
'ERROR: unable to extract video uploader')
909 video_uploader
= mobj
.group(1).decode('utf-8')
911 # Extract video thumbnail
912 mobj
= re
.search(r
'<link rel="image_src" href="(.*)" />', webpage
)
914 self
._downloader
.trouble(u
'ERROR: unable to extract video thumbnail')
916 video_thumbnail
= mobj
.group(1).decode('utf-8')
918 # Extract video description
919 mobj
= re
.search(r
'<meta name="description" content="(.*)" />', webpage
)
921 self
._downloader
.trouble(u
'ERROR: unable to extract video description')
923 video_description
= mobj
.group(1).decode('utf-8')
924 if not video_description
:
925 video_description
= 'No description available.'
927 # Extract video height and width
928 mobj
= re
.search(r
'<meta name="video_height" content="([0-9]+)" />', webpage
)
930 self
._downloader
.trouble(u
'ERROR: unable to extract video height')
932 yv_video_height
= mobj
.group(1)
934 mobj
= re
.search(r
'<meta name="video_width" content="([0-9]+)" />', webpage
)
936 self
._downloader
.trouble(u
'ERROR: unable to extract video width')
938 yv_video_width
= mobj
.group(1)
940 # Retrieve video playlist to extract media URL
941 # I'm not completely sure what all these options are, but we
942 # seem to need most of them, otherwise the server sends a 401.
943 yv_lg
= 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
944 yv_bitrate
= '700' # according to Wikipedia this is hard-coded
945 request
= urllib2
.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id
+
946 '&tech=flash&mode=playlist&lg=' + yv_lg
+ '&bitrate=' + yv_bitrate
+ '&vidH=' + yv_video_height
+
947 '&vidW=' + yv_video_width
+ '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
949 self
.report_download_webpage(video_id
)
950 webpage
= urllib2
.urlopen(request
).read()
951 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
952 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
))
955 # Extract media URL from playlist XML
956 mobj
= re
.search(r
'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage
)
958 self
._downloader
.trouble(u
'ERROR: Unable to extract media URL')
960 video_url
= urllib
.unquote(mobj
.group(1) + mobj
.group(2)).decode('utf-8')
961 video_url
= unescapeHTML(video_url
)
964 'id': video_id
.decode('utf-8'),
966 'uploader': video_uploader
,
967 'upload_date': u
'NA',
968 'title': video_title
,
969 'ext': video_extension
.decode('utf-8'),
970 'thumbnail': video_thumbnail
.decode('utf-8'),
971 'description': video_description
,
972 'thumbnail': video_thumbnail
,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Fetches the clip page, parses the player ``config`` JSON embedded in it
    and builds a ``play_redirect`` URL from the request signature, timestamp,
    codec and quality found there.
    """

    # _VALID_URL matches Vimeo URLs; group 1 is the numeric clip id.
    # The dot after the optional "www"/"player" prefix is escaped so it only
    # matches a literal '.' (it used to match any character).
    _VALID_URL = r'(?:https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Return a one-element list with the info dict for ``url``.

        Returns None after reporting the problem through the downloader when
        the URL is invalid, the page cannot be fetched, the config section
        cannot be parsed or no known codec is offered.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = urllib2.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and later we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON. The split is inside the try block so that
        # a page without the ' = {config:' marker is reported as an
        # extraction error instead of raising IndexError.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader
        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_id("description", webpage.decode('utf8'))
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date
        video_upload_date = u'NA'
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # TODO bind to format param
        # Codecs are tried in this order; the first one listed in the config
        # wins, and 'hd' is preferred over 'sd' when available.
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = config["video"]["files"]
        for codec_name, codec_ext in codecs:
            if codec_name in files:
                video_codec = codec_name
                video_extension = codec_ext
                if 'hd' in files[codec_name]:
                    quality = 'hd'
                else:
                    quality = 'sd'
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    % (video_id, sig, timestamp, quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'player_url': None,
        }]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain.

        Issues a HEAD request for ``url``. If the final URL differs, the new
        URL is handed back to the downloader and True is returned; otherwise
        False is returned.
        """
        class HeadRequest(urllib2.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(urllib2.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Body-related headers are dropped for the new request.
                    newheaders = dict((k, v) for k, v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise urllib2.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(urllib2.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k, v) for k, v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(urllib2.Request(req.get_full_url(),
                                                        headers=newheaders,
                                                        origin_req_host=req.get_origin_req_host(),
                                                        unverifiable=True))

        # Build our opener
        opener = urllib2.OpenerDirector()
        for handler in [urllib2.HTTPHandler, urllib2.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        urllib2.HTTPErrorProcessor, urllib2.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        """Return a one-element list with the info dict guessed from ``url``.

        Returns None when the URL was a redirect (the target has already been
        queued) or after reporting an error through the downloader.
        """
        if self._test_redirect(url):
            return

        video_id = url.split('/')[-1]
        request = urllib2.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
            return
        except ValueError:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = urllib.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # Split the media file name into id (stem) and extension (no dot).
        video_id, video_extension = os.path.splitext(video_id)
        video_extension = video_extension[1:]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # This used to report a title failure (copy-paste slip).
            self._downloader.trouble(u'ERROR: unable to extract uploader')
            return
        video_uploader = mobj.group(1).decode('utf-8')

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': u'NA',
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'format': u'NA',
            'player_url': None,
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Queries have the form ``ytsearch:terms`` (first result),
    ``ytsearchN:terms`` (first N results) or ``ytsearchall:terms``.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the search prefix and download the requested number of results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first colon only: the search terms may contain colons.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # drop the leading 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return

        # Only the integer conversion is guarded, so a ValueError raised while
        # downloading results is not mistaken for a bad prefix.
        try:
            n = long(prefix)
        except ValueError:  # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

        if n <= 0:
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            return
        if n > self._max_youtube_results:
            self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
        self._download_n_results(query, n)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # Each API page holds up to 50 results; 'limit' shrinks to the total
        # number of hits once the first response reports it.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (urllib.quote_plus(query), (50 * pagenum) + 1)
            request = urllib2.Request(result_url)
            try:
                data = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err))
                return
            api_response = json.loads(data)['data']

            video_ids += [video['id'] for video in api_response['items']]

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Queries have the form ``gvsearch:terms`` (first result),
    ``gvsearchN:terms`` (first N results) or ``gvsearchall:terms``.
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the search prefix and download the requested number of results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first colon only: the search terms may contain colons.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # drop the leading 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return

        # Only the integer conversion is guarded, so a ValueError raised while
        # downloading results is not mistaken for a bad prefix.
        try:
            n = long(prefix)
        except ValueError:  # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

        if n <= 0:
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            return
        if n > self._max_google_results:
            self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
        self._download_n_results(query, n)

    def _download_ids(self, video_ids):
        """Hand every collected video id to the downloader, in order."""
        for video_id in video_ids:
            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % video_id])

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()  # O(1) duplicate check; video_ids keeps the order
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum * 10)
            request = urllib2.Request(result_url)
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id in already_seen:
                    continue
                already_seen.add(video_id)
                video_ids.append(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    self._download_ids(video_ids)
                    return

            # No "next page" link: download whatever was collected.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                self._download_ids(video_ids)
                return

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Queries have the form ``yvsearch:terms`` (first result),
    ``yvsearchN:terms`` (first N results) or ``yvsearchall:terms``.
    """
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the search prefix and download the requested number of results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split on the first colon only: the search terms may contain colons.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # drop the leading 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return

        # Only the integer conversion is guarded, so a ValueError raised while
        # downloading results is not mistaken for a bad prefix.
        try:
            n = long(prefix)
        except ValueError:  # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

        if n <= 0:
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
            return
        if n > self._max_yahoo_results:
            self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
        self._download_n_results(query, n)

    def _download_ids(self, video_ids):
        """Hand every collected video id to the downloader, in order."""
        for video_id in video_ids:
            self._downloader.download(['http://video.yahoo.com/watch/%s' % video_id])

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        # NOTE(review): the initial page number is elided in the source this
        # was rebuilt from; 1 is assumed here -- confirm.
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
            request = urllib2.Request(result_url)
            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id in already_seen:
                    continue
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    self._download_ids(video_ids)
                    return

            # No "next page" marker: download whatever was collected.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                self._download_ids(video_ids)
                return

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Walks the playlist pages, collects the video ids in order of first
    appearance and passes each one to the downloader as a watch URL.
    """

    _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=(PL)?%s&'
    _MORE_PAGES_INDICATOR = r'yt-uix-pager-next'
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Queue every video of the playlist at ``url`` for download."""
        # Extract playlist id
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case
        single_video = match.group(3)
        if single_video is not None:
            self._downloader.download([single_video])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        if match.group(1) == 'a':
            playlist_prefix, playlist_access = 'a', 'artist'
        else:
            playlist_prefix, playlist_access = 'p', 'view_play_list'
        playlist_id = match.group(2)
        indicator = self._VIDEO_INDICATOR_TEMPLATE % playlist_id

        video_ids = []
        pagenum = 1
        while True:
            self.report_download_page(playlist_id, pagenum)
            page_url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            try:
                page = urllib2.urlopen(urllib2.Request(page_url)).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers (duplicates dropped per page only)
            ids_in_page = []
            for found in re.finditer(indicator, page):
                candidate = found.group(1)
                if candidate not in ids_in_page:
                    ids_in_page.append(candidate)
            video_ids.extend(ids_in_page)

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break
            pagenum = pagenum + 1

        # Apply the user-selected playlist window (playliststart is 1-based).
        params = self._downloader.params
        first = params.get('playliststart', 1) - 1
        last = params.get('playlistend', -1)
        if last == -1:
            video_ids = video_ids[first:]
        else:
            video_ids = video_ids[first:last]

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
        return
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Collects the ids of a user's uploads page by page through the GData
    feed and passes each one to the downloader as a watch URL.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        # A page covers start_index .. start_index + PAGE_SIZE - 1 inclusive;
        # the upper bound used to be reported one too high.
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                                   (username, start_index, start_index + self._GDATA_PAGE_SIZE - 1))

    def _real_extract(self, url):
        """Queue every upload of the user named in ``url`` for download."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = urllib2.urlopen(request).read()
            except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Apply the user-selected window (playliststart is 1-based).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                                   (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Looks up the numeric user id on the user's page, then walks the mobile
    episode-list pages and passes every episode path to the downloader.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                                   (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Queue every episode of the user named in ``url`` for download."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = urllib2.Request(url)

        try:
            page = urllib2.urlopen(request).read().decode('utf-8')
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
            return

        # The episode-list endpoint needs the numeric user id from the page.
        # A page without it is reported instead of raising AttributeError.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract user id from: %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        # NOTE(review): the initial page number is elided in the source this
        # was rebuilt from; 1 is assumed here -- confirm.
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = urllib2.Request(page_base + "&page=" + str(pagenum))

            try:
                page = urllib2.urlopen(request).read().decode('utf-8')
            except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # Unescape before the duplicate test so it compares the same
                # form that is stored in the list.
                episode_path = unescapeHTML(mobj.group(1))
                if episode_path not in ids_in_page:
                    ids_in_page.append(episode_path)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Apply the user-selected window (playliststart is 1-based).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                                   (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/' + video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the file at ``url``.

        Returns None after reporting the problem through the downloader when
        the page, the download link or the title cannot be obtained.
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        post_data = urllib.urlencode({'gateway_result': '1'})
        request = urllib2.Request(url, post_data)
        try:
            self.report_download_webpage(file_id)
            webpage = urllib2.urlopen(request).read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err))
            return

        # Search for the real file URL
        link = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if link is None or link.group(1) is None:
            # Try to figure out reason of the error.
            notice = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if notice is None or notice.group(1) is None:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            else:
                # Collapse whitespace runs so the site's message fits one line.
                restriction_message = re.sub(r'\s+', ' ', notice.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            return

        file_url = link.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        title = re.search(r'<b title="(.*?)">', webpage)
        if title is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = title.group(1).decode('utf-8')

        info = {
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': u'NA',
            'upload_date': u'NA',
            'title': file_title,
            'ext': file_extension.decode('utf-8'),
            'format': u'NA',
            'player_url': None,
        }
        return [info]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Listed from best to worst quality; the order drives format selection.
    _available_formats = ['video', 'highqual', 'lowqual']
    # NOTE(review): the entries of this map are elided in the source this was
    # rebuilt from; every format is assumed to be mp4 (which is also the
    # fallback used in _real_extract) -- confirm.
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page

        Returns a dict holding whichever of 'title', 'description', 'owner'
        and 'thumbnail' could be found, plus 'video_urls', a dict mapping
        each available format name to its URL (possibly empty).
        """
        # General data
        data = {
            'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
        }
        video_info = {}
        for piece, pattern in data.items():
            mobj = re.search(pattern, video_webpage)
            if mobj is not None:
                video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in when credentials are available (parameters or .netrc).

        Failures are reported as warnings only.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                return

        if useremail is None:
            return

        # Log in
        # NOTE(review): the form fields are elided in the source this was
        # rebuilt from; these names are assumed -- confirm.
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In',
        }
        request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form))
        try:
            self.report_login()
            login_results = urllib2.urlopen(request).read()
            # Getting the login form back means the login was rejected.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
            return

    def _real_extract(self, url):
        """Return a list with one info dict per selected format.

        Returns None after reporting the problem through the downloader when
        the URL is invalid, the page cannot be fetched, or the uploader,
        title or a usable video URL cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = urllib2.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = urllib2.urlopen(request)
            video_webpage = page.read()
        except (urllib2.URLError, httplib.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date
        upload_date = u'NA'
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except (TypeError, ValueError):
                    # Keep the u'NA' placeholder when the date cannot be formatted.
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if not url_map:
            # Without this guard video_url_list would be unbound below.
            self._downloader.trouble(u'ERROR: no video URLs found in webpage')
            return

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)
        format_limit = self._downloader.params.get('format_limit', None)

        if format_limit is not None and format_limit in self._available_formats:
            format_list = self._available_formats[self._available_formats.index(format_limit):]
        else:
            format_list = self._available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            self._downloader.trouble(u'ERROR: no known formats available for video')
            return
        if req_format is None:
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])]  # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])]  # worst quality
        elif req_format == '-1':
            video_url_list = [(f, url_map[f]) for f in existing_formats]  # All formats
        else:
            # Specific format
            if req_format not in url_map:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            video_url_list = [(req_format, url_map[req_format])]

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            results.append({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (u'NA' if format_param is None else format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
                'player_url': None,
            })
        return results
1989 class BlipTVIE(InfoExtractor):
1990 """Information extractor for blip.tv"""
1992 _VALID_URL = r'^
(?
:https?
://)?
(?
:\w
+\
.)?blip\
.tv(/.+)$
'
1993 _URL_EXT = r'^
.*\
.([a
-z0
-9]+)$
'
1994 IE_NAME = u'blip
.tv
'
1996 def report_extraction(self, file_id):
1997 """Report information extraction."""
1998 self._downloader.to_screen(u'[%s] %s: Extracting information
' % (self.IE_NAME, file_id))
2000 def report_direct_download(self, title):
2001 """Report information extraction."""
2002 self._downloader.to_screen(u'[%s] %s: Direct download detected
' % (self.IE_NAME, title))
2004 def _real_extract(self, url):
2005 mobj = re.match(self._VALID_URL, url)
2007 self._downloader.trouble(u'ERROR
: invalid URL
: %s' % url)
2014 json_url = url + cchar + 'skin
=json
&version
=2&no_wrap
=1'
2015 request = urllib2.Request(json_url.encode('utf
-8'))
2016 self.report_extraction(mobj.group(1))
2019 urlh = urllib2.urlopen(request)
2020 if urlh.headers.get('Content
-Type
', '').startswith('video
/'): # Direct download
2021 basename = url.split('/')[-1]
2022 title,ext = os.path.splitext(basename)
2023 title = title.decode('UTF
-8')
2024 ext = ext.replace('.', '')
2025 self.report_direct_download(title)
2033 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2034 self._downloader.trouble(u'ERROR
: unable to download video info webpage
: %s' % str(err))
2036 if info is None: # Regular URL
2038 json_code = urlh.read()
2039 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2040 self._downloader.trouble(u'ERROR
: unable to read video info webpage
: %s' % str(err))
2044 json_data = json.loads(json_code)
2045 if 'Post
' in json_data:
2046 data = json_data['Post
']
2050 upload_date = datetime.datetime.strptime(data['datestamp
'], '%m
-%d-%y
%H
:%M
%p
').strftime('%Y
%m
%d')
2051 video_url = data['media
']['url
']
2052 umobj = re.match(self._URL_EXT, video_url)
2054 raise ValueError('Can
not determine filename extension
')
2055 ext = umobj.group(1)
2058 'id': data['item_id
'],
2060 'uploader
': data['display_name
'],
2061 'upload_date
': upload_date,
2062 'title
': data['title
'],
2064 'format
': data['media
']['mimeType
'],
2065 'thumbnail
': data['thumbnailUrl
'],
2066 'description
': data['description
'],
2067 'player_url
': data['embedUrl
']
2069 except (ValueError,KeyError), err:
2070 self._downloader.trouble(u'ERROR
: unable to parse video information
: %s' % repr(err))
2073 std_headers['User
-Agent
'] = 'iTunes
/10.6.1'
2077 class MyVideoIE(InfoExtractor):
2078 """Information Extractor for myvideo.de."""
2080 _VALID_URL = r'(?
:http
://)?
(?
:www\
.)?myvideo\
.de
/watch
/([0-9]+)/([^?
/]+).*'
2081 IE_NAME = u'myvideo
'
2083 def __init__(self, downloader=None):
2084 InfoExtractor.__init__(self, downloader)
2086 def report_download_webpage(self, video_id):
2087 """Report webpage download."""
2088 self._downloader.to_screen(u'[myvideo
] %s: Downloading webpage
' % video_id)
2090 def report_extraction(self, video_id):
2091 """Report information extraction."""
2092 self._downloader.to_screen(u'[myvideo
] %s: Extracting information
' % video_id)
2094 def _real_extract(self,url):
2095 mobj = re.match(self._VALID_URL, url)
2097 self._download.trouble(u'ERROR
: invalid URL
: %s' % url)
2100 video_id = mobj.group(1)
2103 request = urllib2.Request('http
://www
.myvideo
.de
/watch
/%s' % video_id)
2105 self.report_download_webpage(video_id)
2106 webpage = urllib2.urlopen(request).read()
2107 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2108 self._downloader.trouble(u'ERROR
: Unable to retrieve video webpage
: %s' % str(err))
2111 self.report_extraction(video_id)
2112 mobj = re.search(r'<link rel
=\'image_src
\' href
=\'(http
://is[0-9].myvideo\
.de
/de
/movie
[0-9]+/[a
-f0
-9]+)/thumbs
/[^
.]+\
.jpg
\' />',
2115 self._downloader.trouble(u'ERROR
: unable to extract media URL
')
2117 video_url = mobj.group(1) + ('/%s.flv
' % video_id)
2119 mobj = re.search('<title
>([^
<]+)</title
>', webpage)
2121 self._downloader.trouble(u'ERROR
: unable to extract title
')
2124 video_title = mobj.group(1)
2130 'upload_date
': u'NA
',
2131 'title
': video_title,
2137 class ComedyCentralIE(InfoExtractor):
2138 """Information extractor for The Daily Show and Colbert Report """
2140 _VALID_URL = r'^
(:(?P
<shortname
>tds|thedailyshow|cr|colbert|colbertnation|colbertreport
))|
(https?
://)?
(www\
.)?
(?P
<showname
>thedailyshow|colbertnation
)\
.com
/full
-episodes
/(?P
<episode
>.*)$
'
2141 IE_NAME = u'comedycentral
'
2143 def report_extraction(self, episode_id):
2144 self._downloader.to_screen(u'[comedycentral
] %s: Extracting information
' % episode_id)
2146 def report_config_download(self, episode_id):
2147 self._downloader.to_screen(u'[comedycentral
] %s: Downloading configuration
' % episode_id)
2149 def report_index_download(self, episode_id):
2150 self._downloader.to_screen(u'[comedycentral
] %s: Downloading show index
' % episode_id)
2152 def report_player_url(self, episode_id):
2153 self._downloader.to_screen(u'[comedycentral
] %s: Determining player URL
' % episode_id)
2155 def _real_extract(self, url):
2156 mobj = re.match(self._VALID_URL, url)
2158 self._downloader.trouble(u'ERROR
: invalid URL
: %s' % url)
2161 if mobj.group('shortname
'):
2162 if mobj.group('shortname
') in ('tds
', 'thedailyshow
'):
2163 url = u'http
://www
.thedailyshow
.com
/full
-episodes
/'
2165 url = u'http
://www
.colbertnation
.com
/full
-episodes
/'
2166 mobj = re.match(self._VALID_URL, url)
2167 assert mobj is not None
2169 dlNewest = not mobj.group('episode
')
2171 epTitle = mobj.group('showname
')
2173 epTitle = mobj.group('episode
')
2175 req = urllib2.Request(url)
2176 self.report_extraction(epTitle)
2178 htmlHandle = urllib2.urlopen(req)
2179 html = htmlHandle.read()
2180 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2181 self._downloader.trouble(u'ERROR
: unable to download webpage
: %s' % unicode(err))
2184 url = htmlHandle.geturl()
2185 mobj = re.match(self._VALID_URL, url)
2187 self._downloader.trouble(u'ERROR
: Invalid redirected URL
: ' + url)
2189 if mobj.group('episode
') == '':
2190 self._downloader.trouble(u'ERROR
: Redirected URL
is still
not specific
: ' + url)
2192 epTitle = mobj.group('episode
')
2194 mMovieParams = re.findall('(?
:<param name
="movie" value
="|var url = ")(http
://media
.mtvnservices
.com
/([^
"]*episode.*?:.*?))"', html)
2195 if len(mMovieParams) == 0:
2196 self._downloader.trouble(u'ERROR
: unable to find Flash URL
in webpage
' + url)
2199 playerUrl_raw = mMovieParams[0][0]
2200 self.report_player_url(epTitle)
2202 urlHandle = urllib2.urlopen(playerUrl_raw)
2203 playerUrl = urlHandle.geturl()
2204 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2205 self._downloader.trouble(u'ERROR
: unable to find out player URL
: ' + unicode(err))
2208 uri = mMovieParams[0][1]
2209 indexUrl = 'http
://shadow
.comedycentral
.com
/feeds
/video_player
/mrss
/?
' + urllib.urlencode({'uri
': uri})
2210 self.report_index_download(epTitle)
2212 indexXml = urllib2.urlopen(indexUrl).read()
2213 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2214 self._downloader.trouble(u'ERROR
: unable to download episode index
: ' + unicode(err))
2219 idoc = xml.etree.ElementTree.fromstring(indexXml)
2220 itemEls = idoc.findall('.//item
')
2221 for itemEl in itemEls:
2222 mediaId = itemEl.findall('./guid
')[0].text
2223 shortMediaId = mediaId.split(':')[-1]
2224 showId = mediaId.split(':')[-2].replace('.com
', '')
2225 officialTitle = itemEl.findall('./title
')[0].text
2226 officialDate = itemEl.findall('./pubDate
')[0].text
2228 configUrl = ('http
://www
.comedycentral
.com
/global/feeds
/entertainment
/media
/mediaGenEntertainment
.jhtml?
' +
2229 urllib.urlencode({'uri
': mediaId}))
2230 configReq = urllib2.Request(configUrl)
2231 self.report_config_download(epTitle)
2233 configXml = urllib2.urlopen(configReq).read()
2234 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2235 self._downloader.trouble(u'ERROR
: unable to download webpage
: %s' % unicode(err))
2238 cdoc = xml.etree.ElementTree.fromstring(configXml)
2240 for rendition in cdoc.findall('.//rendition
'):
2241 finfo = (rendition.attrib['bitrate
'], rendition.findall('./src
')[0].text)
2245 self._downloader.trouble(u'\nERROR
: unable to download
' + mediaId + ': No videos found
')
2248 # For now, just pick the highest bitrate
2249 format,video_url = turls[-1]
2251 effTitle = showId + u'-' + epTitle
2256 'upload_date
': officialDate,
2261 'description
': officialTitle,
2262 'player_url
': playerUrl
2265 results.append(info)
2270 class EscapistIE(InfoExtractor):
2271 """Information extractor for The Escapist """
2273 _VALID_URL = r'^
(https?
://)?
(www\
.)?escapistmagazine\
.com
/videos
/view
/(?P
<showname
>[^
/]+)/(?P
<episode
>[^
/?
]+)[/?
]?
.*$
'
2274 IE_NAME = u'escapist
'
2276 def report_extraction(self, showName):
2277 self._downloader.to_screen(u'[escapist
] %s: Extracting information
' % showName)
2279 def report_config_download(self, showName):
2280 self._downloader.to_screen(u'[escapist
] %s: Downloading configuration
' % showName)
2282 def _real_extract(self, url):
2283 mobj = re.match(self._VALID_URL, url)
2285 self._downloader.trouble(u'ERROR
: invalid URL
: %s' % url)
2287 showName = mobj.group('showname
')
2288 videoId = mobj.group('episode
')
2290 self.report_extraction(showName)
2292 webPage = urllib2.urlopen(url)
2293 webPageBytes = webPage.read()
2294 m = re.match(r'text
/html
; charset
="?([^"]+)"?', webPage.headers['Content-Type'])
2295 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2296 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2297 self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err))
2300 descMatch = re.search('<meta name="description
" content="([^
"]*)"', webPage)
2301 description = unescapeHTML(descMatch.group(1))
2302 imgMatch = re.search('<meta
property="og:image" content
="([^"]*)"', webPage)
2303 imgUrl = unescapeHTML(imgMatch.group(1))
2304 playerUrlMatch = re.search('<meta property="og
:video
" content="([^
"]*)"', webPage)
2305 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2306 configUrlMatch = re.search('config
=(.*)$
', playerUrl)
2307 configUrl = urllib2.unquote(configUrlMatch.group(1))
2309 self.report_config_download(showName)
2311 configJSON = urllib2.urlopen(configUrl).read()
2312 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
2313 self._downloader.trouble(u'ERROR
: unable to download configuration
: ' + unicode(err))
2316 # Technically, it's JavaScript
, not JSON
2317 configJSON
= configJSON
.replace("'", '"')
2320 config
= json
.loads(configJSON
)
2321 except (ValueError,), err
:
2322 self
._downloader
.trouble(u
'ERROR: Invalid JSON in configuration file: ' + unicode(err
))
2325 playlist
= config
['playlist']
2326 videoUrl
= playlist
[1]['url']
2331 'uploader': showName
,
2332 'upload_date': None,
2336 'thumbnail': imgUrl
,
2337 'description': description
,
2338 'player_url': playerUrl
,
2344 class CollegeHumorIE(InfoExtractor
):
2345 """Information extractor for collegehumor.com"""
2347 _VALID_URL
= r
'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2348 IE_NAME
= u
'collegehumor'
2350 def report_webpage(self
, video_id
):
2351 """Report information extraction."""
2352 self
._downloader
.to_screen(u
'[%s] %s: Downloading webpage' % (self
.IE_NAME
, video_id
))
2354 def report_extraction(self
, video_id
):
2355 """Report information extraction."""
2356 self
._downloader
.to_screen(u
'[%s] %s: Extracting information' % (self
.IE_NAME
, video_id
))
2358 def _real_extract(self
, url
):
2359 mobj
= re
.match(self
._VALID
_URL
, url
)
2361 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
2363 video_id
= mobj
.group('videoid')
2365 self
.report_webpage(video_id
)
2366 request
= urllib2
.Request(url
)
2368 webpage
= urllib2
.urlopen(request
).read()
2369 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
2370 self
._downloader
.trouble(u
'ERROR: unable to download video webpage: %s' % str(err
))
2373 m
= re
.search(r
'id="video:(?P<internalvideoid>[0-9]+)"', webpage
)
2375 self
._downloader
.trouble(u
'ERROR: Cannot extract internal video ID')
2377 internal_video_id
= m
.group('internalvideoid')
2381 'internal_id': internal_video_id
,
2384 self
.report_extraction(video_id
)
2385 xmlUrl
= 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
2387 metaXml
= urllib2
.urlopen(xmlUrl
).read()
2388 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
2389 self
._downloader
.trouble(u
'ERROR: unable to download video info XML: %s' % str(err
))
2392 mdoc
= xml
.etree
.ElementTree
.fromstring(metaXml
)
2394 videoNode
= mdoc
.findall('./video')[0]
2395 info
['description'] = videoNode
.findall('./description')[0].text
2396 info
['title'] = videoNode
.findall('./caption')[0].text
2397 info
['url'] = videoNode
.findall('./file')[0].text
2398 info
['thumbnail'] = videoNode
.findall('./thumbnail')[0].text
2399 info
['ext'] = info
['url'].rpartition('.')[2]
2400 info
['format'] = info
['ext']
2402 self
._downloader
.trouble(u
'\nERROR: Invalid metadata XML file')
2408 class XVideosIE(InfoExtractor
):
2409 """Information extractor for xvideos.com"""
2411 _VALID_URL
= r
'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2412 IE_NAME
= u
'xvideos'
2414 def report_webpage(self
, video_id
):
2415 """Report information extraction."""
2416 self
._downloader
.to_screen(u
'[%s] %s: Downloading webpage' % (self
.IE_NAME
, video_id
))
2418 def report_extraction(self
, video_id
):
2419 """Report information extraction."""
2420 self
._downloader
.to_screen(u
'[%s] %s: Extracting information' % (self
.IE_NAME
, video_id
))
2422 def _real_extract(self
, url
):
2423 mobj
= re
.match(self
._VALID
_URL
, url
)
2425 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
2427 video_id
= mobj
.group(1).decode('utf-8')
2429 self
.report_webpage(video_id
)
2431 request
= urllib2
.Request(r
'http://www.xvideos.com/video' + video_id
)
2433 webpage
= urllib2
.urlopen(request
).read()
2434 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
2435 self
._downloader
.trouble(u
'ERROR: unable to download video webpage: %s' % str(err
))
2438 self
.report_extraction(video_id
)
2442 mobj
= re
.search(r
'flv_url=(.+?)&', webpage
)
2444 self
._downloader
.trouble(u
'ERROR: unable to extract video url')
2446 video_url
= urllib2
.unquote(mobj
.group(1).decode('utf-8'))
2450 mobj
= re
.search(r
'<title>(.*?)\s+-\s+XVID', webpage
)
2452 self
._downloader
.trouble(u
'ERROR: unable to extract video title')
2454 video_title
= mobj
.group(1).decode('utf-8')
2457 # Extract video thumbnail
2458 mobj
= re
.search(r
'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage
)
2460 self
._downloader
.trouble(u
'ERROR: unable to extract video thumbnail')
2462 video_thumbnail
= mobj
.group(0).decode('utf-8')
2468 'upload_date': None,
2469 'title': video_title
,
2472 'thumbnail': video_thumbnail
,
2473 'description': None,
2480 class SoundcloudIE(InfoExtractor
):
2481 """Information extractor for soundcloud.com
2482 To access the media, the uid of the song and a stream token
2483 must be extracted from the page source and the script must make
2484 a request to media.soundcloud.com/crossdomain.xml. Then
2485 the media can be grabbed by requesting from an url composed
2486 of the stream token and uid
2489 _VALID_URL
= r
'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2490 IE_NAME
= u
'soundcloud'
2492 def __init__(self
, downloader
=None):
2493 InfoExtractor
.__init
__(self
, downloader
)
2495 def report_webpage(self
, video_id
):
2496 """Report information extraction."""
2497 self
._downloader
.to_screen(u
'[%s] %s: Downloading webpage' % (self
.IE_NAME
, video_id
))
2499 def report_extraction(self
, video_id
):
2500 """Report information extraction."""
2501 self
._downloader
.to_screen(u
'[%s] %s: Extracting information' % (self
.IE_NAME
, video_id
))
2503 def _real_extract(self
, url
):
2504 mobj
= re
.match(self
._VALID
_URL
, url
)
2506 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
2509 # extract uploader (which is in the url)
2510 uploader
= mobj
.group(1).decode('utf-8')
2511 # extract simple title (uploader + slug of song title)
2512 slug_title
= mobj
.group(2).decode('utf-8')
2513 simple_title
= uploader
+ u
'-' + slug_title
2515 self
.report_webpage('%s/%s' % (uploader
, slug_title
))
2517 request
= urllib2
.Request('http://soundcloud.com/%s/%s' % (uploader
, slug_title
))
2519 webpage
= urllib2
.urlopen(request
).read()
2520 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
2521 self
._downloader
.trouble(u
'ERROR: unable to download video webpage: %s' % str(err
))
2524 self
.report_extraction('%s/%s' % (uploader
, slug_title
))
2526 # extract uid and stream token that soundcloud hands out for access
2527 mobj
= re
.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage
)
2529 video_id
= mobj
.group(1)
2530 stream_token
= mobj
.group(2)
2532 # extract unsimplified title
2533 mobj
= re
.search('"title":"(.*?)",', webpage
)
2535 title
= mobj
.group(1).decode('utf-8')
2537 title
= simple_title
2539 # construct media url (with uid/token)
2540 mediaURL
= "http://media.soundcloud.com/stream/%s?stream_token=%s"
2541 mediaURL
= mediaURL
% (video_id
, stream_token
)
2544 description
= u
'No description available'
2545 mobj
= re
.search('track-description-value"><p>(.*?)</p>', webpage
)
2547 description
= mobj
.group(1)
2551 mobj
= re
.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage
)
2554 upload_date
= datetime
.datetime
.strptime(mobj
.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d')
2555 except Exception, e
:
2556 self
._downloader
.to_stderr(str(e
))
2558 # for soundcloud, a request to a cross domain is required for cookies
2559 request
= urllib2
.Request('http://media.soundcloud.com/crossdomain.xml', std_headers
)
2562 'id': video_id
.decode('utf-8'),
2564 'uploader': uploader
.decode('utf-8'),
2565 'upload_date': upload_date
,
2570 'description': description
.decode('utf-8')
2574 class InfoQIE(InfoExtractor
):
2575 """Information extractor for infoq.com"""
2577 _VALID_URL
= r
'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2580 def report_webpage(self
, video_id
):
2581 """Report information extraction."""
2582 self
._downloader
.to_screen(u
'[%s] %s: Downloading webpage' % (self
.IE_NAME
, video_id
))
2584 def report_extraction(self
, video_id
):
2585 """Report information extraction."""
2586 self
._downloader
.to_screen(u
'[%s] %s: Extracting information' % (self
.IE_NAME
, video_id
))
2588 def _real_extract(self
, url
):
2589 mobj
= re
.match(self
._VALID
_URL
, url
)
2591 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
2594 self
.report_webpage(url
)
2596 request
= urllib2
.Request(url
)
2598 webpage
= urllib2
.urlopen(request
).read()
2599 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
2600 self
._downloader
.trouble(u
'ERROR: unable to download video webpage: %s' % str(err
))
2603 self
.report_extraction(url
)
2607 mobj
= re
.search(r
"jsclassref='([^']*)'", webpage
)
2609 self
._downloader
.trouble(u
'ERROR: unable to extract video url')
2611 video_url
= 'rtmpe://video.infoq.com/cfx/st/' + urllib2
.unquote(mobj
.group(1).decode('base64'))
2615 mobj
= re
.search(r
'contentTitle = "(.*?)";', webpage
)
2617 self
._downloader
.trouble(u
'ERROR: unable to extract video title')
2619 video_title
= mobj
.group(1).decode('utf-8')
2621 # Extract description
2622 video_description
= u
'No description available.'
2623 mobj
= re
.search(r
'<meta name="description" content="(.*)"(?:\s*/)?>', webpage
)
2624 if mobj
is not None:
2625 video_description
= mobj
.group(1).decode('utf-8')
2627 video_filename
= video_url
.split('/')[-1]
2628 video_id
, extension
= video_filename
.split('.')
2634 'upload_date': None,
2635 'title': video_title
,
2637 'format': extension
, # Extension is always(?) mp4, but seems to be flv
2639 'description': video_description
,
2645 class MixcloudIE(InfoExtractor
):
2646 """Information extractor for www.mixcloud.com"""
2647 _VALID_URL
= r
'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2648 IE_NAME
= u
'mixcloud'
2650 def __init__(self
, downloader
=None):
2651 InfoExtractor
.__init
__(self
, downloader
)
2653 def report_download_json(self
, file_id
):
2654 """Report JSON download."""
2655 self
._downloader
.to_screen(u
'[%s] Downloading json' % self
.IE_NAME
)
2657 def report_extraction(self
, file_id
):
2658 """Report information extraction."""
2659 self
._downloader
.to_screen(u
'[%s] %s: Extracting information' % (self
.IE_NAME
, file_id
))
2661 def get_urls(self
, jsonData
, fmt
, bitrate
='best'):
2662 """Get urls from 'audio_formats' section in json"""
2665 bitrate_list
= jsonData
[fmt
]
2666 if bitrate
is None or bitrate
== 'best' or bitrate
not in bitrate_list
:
2667 bitrate
= max(bitrate_list
) # select highest
2669 url_list
= jsonData
[fmt
][bitrate
]
2670 except TypeError: # we have no bitrate info.
2671 url_list
= jsonData
[fmt
]
2674 def check_urls(self
, url_list
):
2675 """Returns 1st active url from list"""
2676 for url
in url_list
:
2678 urllib2
.urlopen(url
)
2680 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
2685 def _print_formats(self
, formats
):
2686 print 'Available formats:'
2687 for fmt
in formats
.keys():
2688 for b
in formats
[fmt
]:
2690 ext
= formats
[fmt
][b
][0]
2691 print '%s\t%s\t[%s]' % (fmt
, b
, ext
.split('.')[-1])
2692 except TypeError: # we have no bitrate info
2693 ext
= formats
[fmt
][0]
2694 print '%s\t%s\t[%s]' % (fmt
, '??', ext
.split('.')[-1])
2697 def _real_extract(self
, url
):
2698 mobj
= re
.match(self
._VALID
_URL
, url
)
2700 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
2702 # extract uploader & filename from url
2703 uploader
= mobj
.group(1).decode('utf-8')
2704 file_id
= uploader
+ "-" + mobj
.group(2).decode('utf-8')
2706 # construct API request
2707 file_url
= 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url
.split('/')[-3:-1]) + '.json'
2708 # retrieve .json file with links to files
2709 request
= urllib2
.Request(file_url
)
2711 self
.report_download_json(file_url
)
2712 jsonData
= urllib2
.urlopen(request
).read()
2713 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
2714 self
._downloader
.trouble(u
'ERROR: Unable to retrieve file: %s' % str(err
))
2718 json_data
= json
.loads(jsonData
)
2719 player_url
= json_data
['player_swf_url']
2720 formats
= dict(json_data
['audio_formats'])
2722 req_format
= self
._downloader
.params
.get('format', None)
2725 if self
._downloader
.params
.get('listformats', None):
2726 self
._print
_formats
(formats
)
2729 if req_format
is None or req_format
== 'best':
2730 for format_param
in formats
.keys():
2731 url_list
= self
.get_urls(formats
, format_param
)
2733 file_url
= self
.check_urls(url_list
)
2734 if file_url
is not None:
2737 if req_format
not in formats
.keys():
2738 self
._downloader
.trouble(u
'ERROR: format is not available')
2741 url_list
= self
.get_urls(formats
, req_format
)
2742 file_url
= self
.check_urls(url_list
)
2743 format_param
= req_format
2746 'id': file_id
.decode('utf-8'),
2747 'url': file_url
.decode('utf-8'),
2748 'uploader': uploader
.decode('utf-8'),
2749 'upload_date': u
'NA',
2750 'title': json_data
['name'],
2751 'ext': file_url
.split('.')[-1].decode('utf-8'),
2752 'format': (format_param
is None and u
'NA' or format_param
.decode('utf-8')),
2753 'thumbnail': json_data
['thumbnail_url'],
2754 'description': json_data
['description'],
2755 'player_url': player_url
.decode('utf-8'),
2758 class StanfordOpenClassroomIE(InfoExtractor
):
2759 """Information extractor for Stanford's Open ClassRoom"""
2761 _VALID_URL
= r
'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2762 IE_NAME
= u
'stanfordoc'
2764 def report_download_webpage(self
, objid
):
2765 """Report information extraction."""
2766 self
._downloader
.to_screen(u
'[%s] %s: Downloading webpage' % (self
.IE_NAME
, objid
))
2768 def report_extraction(self
, video_id
):
2769 """Report information extraction."""
2770 self
._downloader
.to_screen(u
'[%s] %s: Extracting information' % (self
.IE_NAME
, video_id
))
2772 def _real_extract(self
, url
):
2773 mobj
= re
.match(self
._VALID
_URL
, url
)
2775 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
2778 if mobj
.group('course') and mobj
.group('video'): # A specific video
2779 course
= mobj
.group('course')
2780 video
= mobj
.group('video')
2782 'id': course
+ '_' + video
,
2785 self
.report_extraction(info
['id'])
2786 baseUrl
= 'http://openclassroom.stanford.edu/MainFolder/courses/' + course
+ '/videos/'
2787 xmlUrl
= baseUrl
+ video
+ '.xml'
2789 metaXml
= urllib2
.urlopen(xmlUrl
).read()
2790 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
2791 self
._downloader
.trouble(u
'ERROR: unable to download video info XML: %s' % unicode(err
))
2793 mdoc
= xml
.etree
.ElementTree
.fromstring(metaXml
)
2795 info
['title'] = mdoc
.findall('./title')[0].text
2796 info
['url'] = baseUrl
+ mdoc
.findall('./videoFile')[0].text
2798 self
._downloader
.trouble(u
'\nERROR: Invalid metadata XML file')
2800 info
['ext'] = info
['url'].rpartition('.')[2]
2801 info
['format'] = info
['ext']
2803 elif mobj
.group('course'): # A course page
2804 course
= mobj
.group('course')
2810 self
.report_download_webpage(info
['id'])
2812 coursepage
= urllib2
.urlopen(url
).read()
2813 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
2814 self
._downloader
.trouble(u
'ERROR: unable to download course info page: ' + unicode(err
))
2817 m
= re
.search('<h1>([^<]+)</h1>', coursepage
)
2819 info
['title'] = unescapeHTML(m
.group(1))
2821 info
['title'] = info
['id']
2823 m
= re
.search('<description>([^<]+)</description>', coursepage
)
2825 info
['description'] = unescapeHTML(m
.group(1))
2827 links
= orderedSet(re
.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage
))
2830 'type': 'reference',
2831 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage
),
2835 for entry
in info
['list']:
2836 assert entry
['type'] == 'reference'
2837 results
+= self
.extract(entry
['url'])
2842 'id': 'Stanford OpenClassroom',
2846 self
.report_download_webpage(info
['id'])
2847 rootURL
= 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2849 rootpage
= urllib2
.urlopen(rootURL
).read()
2850 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
2851 self
._downloader
.trouble(u
'ERROR: unable to download course info page: ' + unicode(err
))
2854 info
['title'] = info
['id']
2856 links
= orderedSet(re
.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage
))
2859 'type': 'reference',
2860 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage
),
2865 for entry
in info
['list']:
2866 assert entry
['type'] == 'reference'
2867 results
+= self
.extract(entry
['url'])
2870 class MTVIE(InfoExtractor
):
2871 """Information extractor for MTV.com"""
2873 _VALID_URL
= r
'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
2876 def report_webpage(self
, video_id
):
2877 """Report information extraction."""
2878 self
._downloader
.to_screen(u
'[%s] %s: Downloading webpage' % (self
.IE_NAME
, video_id
))
2880 def report_extraction(self
, video_id
):
2881 """Report information extraction."""
2882 self
._downloader
.to_screen(u
'[%s] %s: Extracting information' % (self
.IE_NAME
, video_id
))
2884 def _real_extract(self
, url
):
2885 mobj
= re
.match(self
._VALID
_URL
, url
)
2887 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
2889 if not mobj
.group('proto'):
2890 url
= 'http://' + url
2891 video_id
= mobj
.group('videoid')
2892 self
.report_webpage(video_id
)
2894 request
= urllib2
.Request(url
)
2896 webpage
= urllib2
.urlopen(request
).read()
2897 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
2898 self
._downloader
.trouble(u
'ERROR: unable to download video webpage: %s' % str(err
))
2901 mobj
= re
.search(r
'<meta name="mtv_vt" content="([^"]+)"/>', webpage
)
2903 self
._downloader
.trouble(u
'ERROR: unable to extract song name')
2905 song_name
= unescapeHTML(mobj
.group(1).decode('iso-8859-1'))
2906 mobj
= re
.search(r
'<meta name="mtv_an" content="([^"]+)"/>', webpage
)
2908 self
._downloader
.trouble(u
'ERROR: unable to extract performer')
2910 performer
= unescapeHTML(mobj
.group(1).decode('iso-8859-1'))
2911 video_title
= performer
+ ' - ' + song_name
2913 mobj
= re
.search(r
'<meta name="mtvn_uri" content="([^"]+)"/>', webpage
)
2915 self
._downloader
.trouble(u
'ERROR: unable to mtvn_uri')
2917 mtvn_uri
= mobj
.group(1)
2919 mobj
= re
.search(r
'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage
)
2921 self
._downloader
.trouble(u
'ERROR: unable to extract content id')
2923 content_id
= mobj
.group(1)
2925 videogen_url
= 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri
+ '&id=' + content_id
+ '&vid=' + video_id
+ '&ref=www.mtvn.com&viewUri=' + mtvn_uri
2926 self
.report_extraction(video_id
)
2927 request
= urllib2
.Request(videogen_url
)
2929 metadataXml
= urllib2
.urlopen(request
).read()
2930 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
:
2931 self
._downloader
.trouble(u
'ERROR: unable to download video metadata: %s' % str(err
))
2934 mdoc
= xml
.etree
.ElementTree
.fromstring(metadataXml
)
2935 renditions
= mdoc
.findall('.//rendition')
2937 # For now, always pick the highest quality.
2938 rendition
= renditions
[-1]
2941 _
,_
,ext
= rendition
.attrib
['type'].partition('/')
2942 format
= ext
+ '-' + rendition
.attrib
['width'] + 'x' + rendition
.attrib
['height'] + '_' + rendition
.attrib
['bitrate']
2943 video_url
= rendition
.find('./src').text
2945 self
._downloader
.trouble('Invalid rendition field.')
2951 'uploader': performer
,
2952 'title': video_title
,