2 # -*- coding: utf-8 -*-
4 from __future__
import absolute_import
13 import xml
.etree
.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    uploader:       Nickname of the video uploader, unescaped.
    upload_date:    Video upload date (YYYYMMDD).
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False       # becomes True after initialize() has run once
    _downloader = None   # FileDownloader instance, set via set_downloader()
    _WORKING = True      # subclasses set this to False for broken IEs

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc). Runs at most once."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
104 class YoutubeIE(InfoExtractor
):
105 """Information extractor for youtube.com."""
109 (?:https?://)? # http(s):// (optional)
110 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
111 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
112 (?:.*?\#/)? # handle anchor (#/) redirect urls
113 (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
114 (?: # the various things that can precede the ID:
115 (?:(?:v|embed|e)/) # v/ or embed/ or e/
116 |(?: # or the v= param in all its forms
117 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
118 (?:\?|\#!?) # the params delimiter ? or # or #!
119 (?:.+&)? # any other preceding param (like /?s=tuff&v=xxxx)
122 )? # optional -> youtube.com/xxxx is OK
123 )? # all until now is optional -> you can pass the naked ID
124 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
125 (?(1).+)? # if we found the ID, everything can follow
127 _LANG_URL
= r
'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
128 _LOGIN_URL
= 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
129 _AGE_URL
= 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
130 _NEXT_URL_RE
= r
'[\?&]next_url=([^&]+)'
131 _NETRC_MACHINE
= 'youtube'
132 # Listed in order of quality
133 _available_formats
= ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
134 _available_formats_prefer_free
= ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
135 _video_extensions
= {
141 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
147 _video_dimensions
= {
165 def suitable(self
, url
):
166 """Receives a URL and returns True if suitable for this IE."""
167 return re
.match(self
._VALID
_URL
, url
, re
.VERBOSE
) is not None
169 def report_lang(self
):
170 """Report attempt to set language."""
171 self
._downloader
.to_screen(u
'[youtube] Setting language')
173 def report_login(self
):
174 """Report attempt to log in."""
175 self
._downloader
.to_screen(u
'[youtube] Logging in')
177 def report_age_confirmation(self
):
178 """Report attempt to confirm age."""
179 self
._downloader
.to_screen(u
'[youtube] Confirming age')
181 def report_video_webpage_download(self
, video_id
):
182 """Report attempt to download video webpage."""
183 self
._downloader
.to_screen(u
'[youtube] %s: Downloading video webpage' % video_id
)
185 def report_video_info_webpage_download(self
, video_id
):
186 """Report attempt to download video info webpage."""
187 self
._downloader
.to_screen(u
'[youtube] %s: Downloading video info webpage' % video_id
)
189 def report_video_subtitles_download(self
, video_id
):
190 """Report attempt to download video info webpage."""
191 self
._downloader
.to_screen(u
'[youtube] %s: Downloading video subtitles' % video_id
)
193 def report_information_extraction(self
, video_id
):
194 """Report attempt to extract video information."""
195 self
._downloader
.to_screen(u
'[youtube] %s: Extracting video information' % video_id
)
197 def report_unavailable_format(self
, video_id
, format
):
198 """Report extracted video URL."""
199 self
._downloader
.to_screen(u
'[youtube] %s: Format %s not available' % (video_id
, format
))
201 def report_rtmp_download(self
):
202 """Indicate the download will use the RTMP protocol."""
203 self
._downloader
.to_screen(u
'[youtube] RTMP download detected')
205 def _closed_captions_xml_to_srt(self
, xml_string
):
207 texts
= re
.findall(r
'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string
, re
.MULTILINE
)
208 # TODO parse xml instead of regex
209 for n
, (start
, dur_tag
, dur
, caption
) in enumerate(texts
):
210 if not dur
: dur
= '4'
212 end
= start
+ float(dur
)
213 start
= "%02i:%02i:%02i,%03i" %(start
/(60*60), start
/60%60, start
%60, start
%1*1000)
214 end
= "%02i:%02i:%02i,%03i" %(end
/(60*60), end
/60%60, end
%60, end
%1*1000)
215 caption
= unescapeHTML(caption
)
216 caption
= unescapeHTML(caption
) # double cycle, intentional
217 srt
+= str(n
+1) + '\n'
218 srt
+= start
+ ' --> ' + end
+ '\n'
219 srt
+= caption
+ '\n\n'
222 def _print_formats(self
, formats
):
223 print('Available formats:')
225 print('%s\t:\t%s\t[%s]' %(x
, self
._video
_extensions
.get(x
, 'flv'), self
._video
_dimensions
.get(x
, '???')))
227 def _real_initialize(self
):
228 if self
._downloader
is None:
233 downloader_params
= self
._downloader
.params
235 # Attempt to use provided username and password or .netrc data
236 if downloader_params
.get('username', None) is not None:
237 username
= downloader_params
['username']
238 password
= downloader_params
['password']
239 elif downloader_params
.get('usenetrc', False):
241 info
= netrc
.netrc().authenticators(self
._NETRC
_MACHINE
)
246 raise netrc
.NetrcParseError('No authenticators for %s' % self
._NETRC
_MACHINE
)
247 except (IOError, netrc
.NetrcParseError
) as err
:
248 self
._downloader
.to_stderr(u
'WARNING: parsing .netrc: %s' % compat_str(err
))
252 request
= compat_urllib_request
.Request(self
._LANG
_URL
)
255 compat_urllib_request
.urlopen(request
).read()
256 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
257 self
._downloader
.to_stderr(u
'WARNING: unable to set language: %s' % compat_str(err
))
260 # No authentication to be performed
266 'current_form': 'loginForm',
268 'action_login': 'Log In',
269 'username': username
,
270 'password': password
,
272 request
= compat_urllib_request
.Request(self
._LOGIN
_URL
, compat_urllib_parse
.urlencode(login_form
))
275 login_results
= compat_urllib_request
.urlopen(request
).read()
276 if re
.search(r
'(?i)<form[^>]* name="loginForm"', login_results
) is not None:
277 self
._downloader
.to_stderr(u
'WARNING: unable to log in: bad username or password')
279 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
280 self
._downloader
.to_stderr(u
'WARNING: unable to log in: %s' % compat_str(err
))
286 'action_confirm': 'Confirm',
288 request
= compat_urllib_request
.Request(self
._AGE
_URL
, compat_urllib_parse
.urlencode(age_form
))
290 self
.report_age_confirmation()
291 age_results
= compat_urllib_request
.urlopen(request
).read()
292 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
293 self
._downloader
.trouble(u
'ERROR: unable to confirm age: %s' % compat_str(err
))
296 def _real_extract(self
, url
):
297 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
298 mobj
= re
.search(self
._NEXT
_URL
_RE
, url
)
300 url
= 'http://www.youtube.com/' + compat_urllib_parse
.unquote(mobj
.group(1)).lstrip('/')
302 # Extract video id from URL
303 mobj
= re
.match(self
._VALID
_URL
, url
, re
.VERBOSE
)
305 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
307 video_id
= mobj
.group(2)
310 self
.report_video_webpage_download(video_id
)
311 request
= compat_urllib_request
.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
)
313 video_webpage_bytes
= compat_urllib_request
.urlopen(request
).read()
314 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
315 self
._downloader
.trouble(u
'ERROR: unable to download video webpage: %s' % compat_str(err
))
318 video_webpage
= video_webpage_bytes
.decode('utf-8', 'ignore')
320 # Attempt to extract SWF player URL
321 mobj
= re
.search(r
'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage
)
323 player_url
= re
.sub(r
'\\(.)', r
'\1', mobj
.group(1))
328 self
.report_video_info_webpage_download(video_id
)
329 for el_type
in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
330 video_info_url
= ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
331 % (video_id
, el_type
))
332 request
= compat_urllib_request
.Request(video_info_url
)
334 video_info_webpage_bytes
= compat_urllib_request
.urlopen(request
).read()
335 video_info_webpage
= video_info_webpage_bytes
.decode('utf-8', 'ignore')
336 video_info
= compat_parse_qs(video_info_webpage
)
337 if 'token' in video_info
:
339 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
340 self
._downloader
.trouble(u
'ERROR: unable to download video info webpage: %s' % compat_str(err
))
342 if 'token' not in video_info
:
343 if 'reason' in video_info
:
344 self
._downloader
.trouble(u
'ERROR: YouTube said: %s' % video_info
['reason'][0])
346 self
._downloader
.trouble(u
'ERROR: "token" parameter not in video info for unknown reason')
349 # Check for "rental" videos
350 if 'ypc_video_rental_bar_text' in video_info
and 'author' not in video_info
:
351 self
._downloader
.trouble(u
'ERROR: "rental" videos not supported')
354 # Start extracting information
355 self
.report_information_extraction(video_id
)
358 if 'author' not in video_info
:
359 self
._downloader
.trouble(u
'ERROR: unable to extract uploader nickname')
361 video_uploader
= compat_urllib_parse
.unquote_plus(video_info
['author'][0])
364 if 'title' not in video_info
:
365 self
._downloader
.trouble(u
'ERROR: unable to extract video title')
367 video_title
= compat_urllib_parse
.unquote_plus(video_info
['title'][0])
370 if 'thumbnail_url' not in video_info
:
371 self
._downloader
.trouble(u
'WARNING: unable to extract video thumbnail')
373 else: # don't panic if we can't find it
374 video_thumbnail
= compat_urllib_parse
.unquote_plus(video_info
['thumbnail_url'][0])
378 mobj
= re
.search(r
'id="eow-date.*?>(.*?)</span>', video_webpage
, re
.DOTALL
)
380 upload_date
= ' '.join(re
.sub(r
'[/,-]', r
' ', mobj
.group(1)).split())
381 format_expressions
= ['%d %B %Y', '%B %d %Y', '%b %d %Y']
382 for expression
in format_expressions
:
384 upload_date
= datetime
.datetime
.strptime(upload_date
, expression
).strftime('%Y%m%d')
389 video_description
= get_element_by_id("eow-description", video_webpage
)
390 if video_description
:
391 video_description
= clean_html(video_description
)
393 video_description
= ''
396 video_subtitles
= None
397 if self
._downloader
.params
.get('writesubtitles', False):
399 self
.report_video_subtitles_download(video_id
)
400 request
= compat_urllib_request
.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
)
402 srt_list
= compat_urllib_request
.urlopen(request
).read()
403 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
404 raise Trouble(u
'WARNING: unable to download video subtitles: %s' % compat_str(err
))
405 srt_lang_list
= re
.findall(r
'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list
)
406 srt_lang_list
= dict((l
[1], l
[0]) for l
in srt_lang_list
)
407 if not srt_lang_list
:
408 raise Trouble(u
'WARNING: video has no closed captions')
409 if self
._downloader
.params
.get('subtitleslang', False):
410 srt_lang
= self
._downloader
.params
.get('subtitleslang')
411 elif 'en' in srt_lang_list
:
414 srt_lang
= srt_lang_list
.keys()[0]
415 if not srt_lang
in srt_lang_list
:
416 raise Trouble(u
'WARNING: no closed captions found in the specified language')
417 request
= compat_urllib_request
.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang
, srt_lang_list
[srt_lang
], video_id
))
419 srt_xml
= compat_urllib_request
.urlopen(request
).read()
420 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
421 raise Trouble(u
'WARNING: unable to download video subtitles: %s' % compat_str(err
))
423 raise Trouble(u
'WARNING: unable to download video subtitles')
424 video_subtitles
= self
._closed
_captions
_xml
_to
_srt
(srt_xml
.decode('utf-8'))
425 except Trouble
as trouble
:
426 self
._downloader
.trouble(trouble
[0])
428 if 'length_seconds' not in video_info
:
429 self
._downloader
.trouble(u
'WARNING: unable to extract video duration')
432 video_duration
= compat_urllib_parse
.unquote_plus(video_info
['length_seconds'][0])
435 video_token
= compat_urllib_parse
.unquote_plus(video_info
['token'][0])
437 # Decide which formats to download
438 req_format
= self
._downloader
.params
.get('format', None)
440 if 'conn' in video_info
and video_info
['conn'][0].startswith('rtmp'):
441 self
.report_rtmp_download()
442 video_url_list
= [(None, video_info
['conn'][0])]
443 elif 'url_encoded_fmt_stream_map' in video_info
and len(video_info
['url_encoded_fmt_stream_map']) >= 1:
444 url_data_strs
= video_info
['url_encoded_fmt_stream_map'][0].split(',')
445 url_data
= [compat_parse_qs(uds
) for uds
in url_data_strs
]
446 url_data
= filter(lambda ud
: 'itag' in ud
and 'url' in ud
, url_data
)
447 url_map
= dict((ud
['itag'][0], ud
['url'][0] + '&signature=' + ud
['sig'][0]) for ud
in url_data
)
449 format_limit
= self
._downloader
.params
.get('format_limit', None)
450 available_formats
= self
._available
_formats
_prefer
_free
if self
._downloader
.params
.get('prefer_free_formats', False) else self
._available
_formats
451 if format_limit
is not None and format_limit
in available_formats
:
452 format_list
= available_formats
[available_formats
.index(format_limit
):]
454 format_list
= available_formats
455 existing_formats
= [x
for x
in format_list
if x
in url_map
]
456 if len(existing_formats
) == 0:
457 self
._downloader
.trouble(u
'ERROR: no known formats available for video')
459 if self
._downloader
.params
.get('listformats', None):
460 self
._print
_formats
(existing_formats
)
462 if req_format
is None or req_format
== 'best':
463 video_url_list
= [(existing_formats
[0], url_map
[existing_formats
[0]])] # Best quality
464 elif req_format
== 'worst':
465 video_url_list
= [(existing_formats
[len(existing_formats
)-1], url_map
[existing_formats
[len(existing_formats
)-1]])] # worst quality
466 elif req_format
in ('-1', 'all'):
467 video_url_list
= [(f
, url_map
[f
]) for f
in existing_formats
] # All formats
469 # Specific formats. We pick the first in a slash-delimeted sequence.
470 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
471 req_formats
= req_format
.split('/')
472 video_url_list
= None
473 for rf
in req_formats
:
475 video_url_list
= [(rf
, url_map
[rf
])]
477 if video_url_list
is None:
478 self
._downloader
.trouble(u
'ERROR: requested format not available')
481 self
._downloader
.trouble(u
'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
485 for format_param
, video_real_url
in video_url_list
:
487 video_extension
= self
._video
_extensions
.get(format_param
, 'flv')
489 video_format
= '{0} - {1}'.format(format_param
if format_param
else video_extension
,
490 self
._video
_dimensions
.get(format_param
, '???'))
494 'url': video_real_url
,
495 'uploader': video_uploader
,
496 'upload_date': upload_date
,
497 'title': video_title
,
498 'ext': video_extension
,
499 'format': video_format
,
500 'thumbnail': video_thumbnail
,
501 'description': video_description
,
502 'player_url': player_url
,
503 'subtitles': video_subtitles
,
504 'duration': video_duration
509 class MetacafeIE(InfoExtractor
):
510 """Information Extractor for metacafe.com."""
512 _VALID_URL
= r
'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
513 _DISCLAIMER
= 'http://www.metacafe.com/family_filter/'
514 _FILTER_POST
= 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
515 IE_NAME
= u
'metacafe'
517 def __init__(self
, downloader
=None):
518 InfoExtractor
.__init
__(self
, downloader
)
520 def report_disclaimer(self
):
521 """Report disclaimer retrieval."""
522 self
._downloader
.to_screen(u
'[metacafe] Retrieving disclaimer')
524 def report_age_confirmation(self
):
525 """Report attempt to confirm age."""
526 self
._downloader
.to_screen(u
'[metacafe] Confirming age')
528 def report_download_webpage(self
, video_id
):
529 """Report webpage download."""
530 self
._downloader
.to_screen(u
'[metacafe] %s: Downloading webpage' % video_id
)
532 def report_extraction(self
, video_id
):
533 """Report information extraction."""
534 self
._downloader
.to_screen(u
'[metacafe] %s: Extracting information' % video_id
)
536 def _real_initialize(self
):
537 # Retrieve disclaimer
538 request
= compat_urllib_request
.Request(self
._DISCLAIMER
)
540 self
.report_disclaimer()
541 disclaimer
= compat_urllib_request
.urlopen(request
).read()
542 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
543 self
._downloader
.trouble(u
'ERROR: unable to retrieve disclaimer: %s' % compat_str(err
))
549 'submit': "Continue - I'm over 18",
551 request
= compat_urllib_request
.Request(self
._FILTER
_POST
, compat_urllib_parse
.urlencode(disclaimer_form
))
553 self
.report_age_confirmation()
554 disclaimer
= compat_urllib_request
.urlopen(request
).read()
555 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
556 self
._downloader
.trouble(u
'ERROR: unable to confirm age: %s' % compat_str(err
))
559 def _real_extract(self
, url
):
560 # Extract id and simplified title from URL
561 mobj
= re
.match(self
._VALID
_URL
, url
)
563 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
566 video_id
= mobj
.group(1)
568 # Check if video comes from YouTube
569 mobj2
= re
.match(r
'^yt-(.*)$', video_id
)
570 if mobj2
is not None:
571 self
._downloader
.download(['http://www.youtube.com/watch?v=%s' % mobj2
.group(1)])
574 # Retrieve video webpage to extract further information
575 request
= compat_urllib_request
.Request('http://www.metacafe.com/watch/%s/' % video_id
)
577 self
.report_download_webpage(video_id
)
578 webpage
= compat_urllib_request
.urlopen(request
).read()
579 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
580 self
._downloader
.trouble(u
'ERROR: unable retrieve video webpage: %s' % compat_str(err
))
583 # Extract URL, uploader and title from webpage
584 self
.report_extraction(video_id
)
585 mobj
= re
.search(r
'(?m)&mediaURL=([^&]+)', webpage
)
587 mediaURL
= compat_urllib_parse
.unquote(mobj
.group(1))
588 video_extension
= mediaURL
[-3:]
590 # Extract gdaKey if available
591 mobj
= re
.search(r
'(?m)&gdaKey=(.*?)&', webpage
)
595 gdaKey
= mobj
.group(1)
596 video_url
= '%s?__gda__=%s' % (mediaURL
, gdaKey
)
598 mobj
= re
.search(r
' name="flashvars" value="(.*?)"', webpage
)
600 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
602 vardict
= compat_parse_qs(mobj
.group(1))
603 if 'mediaData' not in vardict
:
604 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
606 mobj
= re
.search(r
'"mediaURL":"(http.*?)","key":"(.*?)"', vardict
['mediaData'][0])
608 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
610 mediaURL
= mobj
.group(1).replace('\\/', '/')
611 video_extension
= mediaURL
[-3:]
612 video_url
= '%s?__gda__=%s' % (mediaURL
, mobj
.group(2))
614 mobj
= re
.search(r
'(?im)<title>(.*) - Video</title>', webpage
)
616 self
._downloader
.trouble(u
'ERROR: unable to extract title')
618 video_title
= mobj
.group(1).decode('utf-8')
620 mobj
= re
.search(r
'submitter=(.*?);', webpage
)
622 self
._downloader
.trouble(u
'ERROR: unable to extract uploader nickname')
624 video_uploader
= mobj
.group(1)
627 'id': video_id
.decode('utf-8'),
628 'url': video_url
.decode('utf-8'),
629 'uploader': video_uploader
.decode('utf-8'),
631 'title': video_title
,
632 'ext': video_extension
.decode('utf-8'),
636 class DailymotionIE(InfoExtractor
):
637 """Information Extractor for Dailymotion"""
639 _VALID_URL
= r
'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
640 IE_NAME
= u
'dailymotion'
642 def __init__(self
, downloader
=None):
643 InfoExtractor
.__init
__(self
, downloader
)
645 def report_download_webpage(self
, video_id
):
646 """Report webpage download."""
647 self
._downloader
.to_screen(u
'[dailymotion] %s: Downloading webpage' % video_id
)
649 def report_extraction(self
, video_id
):
650 """Report information extraction."""
651 self
._downloader
.to_screen(u
'[dailymotion] %s: Extracting information' % video_id
)
653 def _real_extract(self
, url
):
654 # Extract id and simplified title from URL
655 mobj
= re
.match(self
._VALID
_URL
, url
)
657 self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
)
660 video_id
= mobj
.group(1).split('_')[0].split('?')[0]
662 video_extension
= 'mp4'
664 # Retrieve video webpage to extract further information
665 request
= compat_urllib_request
.Request(url
)
666 request
.add_header('Cookie', 'family_filter=off')
668 self
.report_download_webpage(video_id
)
669 webpage
= compat_urllib_request
.urlopen(request
).read()
670 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
671 self
._downloader
.trouble(u
'ERROR: unable retrieve video webpage: %s' % compat_str(err
))
674 # Extract URL, uploader and title from webpage
675 self
.report_extraction(video_id
)
676 mobj
= re
.search(r
'\s*var flashvars = (.*)', webpage
)
678 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
680 flashvars
= compat_urllib_parse
.unquote(mobj
.group(1))
682 for key
in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
685 self
._downloader
.to_screen(u
'[dailymotion] Using %s' % key
)
688 self
._downloader
.trouble(u
'ERROR: unable to extract video URL')
691 mobj
= re
.search(r
'"' + max_quality
+ r
'":"(.+?)"', flashvars
)
693 self
._downloader
.trouble(u
'ERROR: unable to extract video URL')
696 video_url
= compat_urllib_parse
.unquote(mobj
.group(1)).replace('\\/', '/')
698 # TODO: support choosing qualities
700 mobj
= re
.search(r
'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage
)
702 self
._downloader
.trouble(u
'ERROR: unable to extract title')
704 video_title
= unescapeHTML(mobj
.group('title').decode('utf-8'))
706 video_uploader
= None
707 mobj
= re
.search(r
'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage
)
709 # lookin for official user
710 mobj_official
= re
.search(r
'<span rel="author"[^>]+?>([^<]+?)</span>', webpage
)
711 if mobj_official
is None:
712 self
._downloader
.trouble(u
'WARNING: unable to extract uploader nickname')
714 video_uploader
= mobj_official
.group(1)
716 video_uploader
= mobj
.group(1)
718 video_upload_date
= None
719 mobj
= re
.search(r
'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage
)
721 video_upload_date
= mobj
.group(3) + mobj
.group(2) + mobj
.group(1)
724 'id': video_id
.decode('utf-8'),
725 'url': video_url
.decode('utf-8'),
726 'uploader': video_uploader
.decode('utf-8'),
727 'upload_date': video_upload_date
,
728 'title': video_title
,
729 'ext': video_extension
.decode('utf-8'),
733 class GoogleIE(InfoExtractor
):
734 """Information extractor for video.google.com."""
736 _VALID_URL
= r
'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
737 IE_NAME
= u
'video.google'
739 def __init__(self
, downloader
=None):
740 InfoExtractor
.__init
__(self
, downloader
)
742 def report_download_webpage(self
, video_id
):
743 """Report webpage download."""
744 self
._downloader
.to_screen(u
'[video.google] %s: Downloading webpage' % video_id
)
746 def report_extraction(self
, video_id
):
747 """Report information extraction."""
748 self
._downloader
.to_screen(u
'[video.google] %s: Extracting information' % video_id
)
750 def _real_extract(self
, url
):
751 # Extract id from URL
752 mobj
= re
.match(self
._VALID
_URL
, url
)
754 self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
)
757 video_id
= mobj
.group(1)
759 video_extension
= 'mp4'
761 # Retrieve video webpage to extract further information
762 request
= compat_urllib_request
.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id
)
764 self
.report_download_webpage(video_id
)
765 webpage
= compat_urllib_request
.urlopen(request
).read()
766 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
767 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % compat_str(err
))
770 # Extract URL, uploader, and title from webpage
771 self
.report_extraction(video_id
)
772 mobj
= re
.search(r
"download_url:'([^']+)'", webpage
)
774 video_extension
= 'flv'
775 mobj
= re
.search(r
"(?i)videoUrl\\x3d(.+?)\\x26", webpage
)
777 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
779 mediaURL
= compat_urllib_parse
.unquote(mobj
.group(1))
780 mediaURL
= mediaURL
.replace('\\x3d', '\x3d')
781 mediaURL
= mediaURL
.replace('\\x26', '\x26')
785 mobj
= re
.search(r
'<title>(.*)</title>', webpage
)
787 self
._downloader
.trouble(u
'ERROR: unable to extract title')
789 video_title
= mobj
.group(1).decode('utf-8')
791 # Extract video description
792 mobj
= re
.search(r
'<span id=short-desc-content>([^<]*)</span>', webpage
)
794 self
._downloader
.trouble(u
'ERROR: unable to extract video description')
796 video_description
= mobj
.group(1).decode('utf-8')
797 if not video_description
:
798 video_description
= 'No description available.'
800 # Extract video thumbnail
801 if self
._downloader
.params
.get('forcethumbnail', False):
802 request
= compat_urllib_request
.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id
)))
804 webpage
= compat_urllib_request
.urlopen(request
).read()
805 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
806 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % compat_str(err
))
808 mobj
= re
.search(r
'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage
)
810 self
._downloader
.trouble(u
'ERROR: unable to extract video thumbnail')
812 video_thumbnail
= mobj
.group(1)
813 else: # we need something to pass to process_info
817 'id': video_id
.decode('utf-8'),
818 'url': video_url
.decode('utf-8'),
821 'title': video_title
,
822 'ext': video_extension
.decode('utf-8'),
826 class PhotobucketIE(InfoExtractor
):
827 """Information extractor for photobucket.com."""
829 _VALID_URL
= r
'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
830 IE_NAME
= u
'photobucket'
832 def __init__(self
, downloader
=None):
833 InfoExtractor
.__init
__(self
, downloader
)
835 def report_download_webpage(self
, video_id
):
836 """Report webpage download."""
837 self
._downloader
.to_screen(u
'[photobucket] %s: Downloading webpage' % video_id
)
839 def report_extraction(self
, video_id
):
840 """Report information extraction."""
841 self
._downloader
.to_screen(u
'[photobucket] %s: Extracting information' % video_id
)
843 def _real_extract(self
, url
):
844 # Extract id from URL
845 mobj
= re
.match(self
._VALID
_URL
, url
)
847 self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
)
850 video_id
= mobj
.group(1)
852 video_extension
= 'flv'
854 # Retrieve video webpage to extract further information
855 request
= compat_urllib_request
.Request(url
)
857 self
.report_download_webpage(video_id
)
858 webpage
= compat_urllib_request
.urlopen(request
).read()
859 except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
:
860 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % compat_str(err
))
863 # Extract URL, uploader, and title from webpage
864 self
.report_extraction(video_id
)
865 mobj
= re
.search(r
'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage
)
867 self
._downloader
.trouble(u
'ERROR: unable to extract media URL')
869 mediaURL
= compat_urllib_parse
.unquote(mobj
.group(1))
873 mobj
= re
.search(r
'<title>(.*) video by (.*) - Photobucket</title>', webpage
)
875 self
._downloader
.trouble(u
'ERROR: unable to extract title')
877 video_title
= mobj
.group(1).decode('utf-8')
879 video_uploader
= mobj
.group(2).decode('utf-8')
882 'id': video_id
.decode('utf-8'),
883 'url': video_url
.decode('utf-8'),
884 'uploader': video_uploader
,
886 'title': video_title
,
887 'ext': video_extension
.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com.

    Non-/watch/ Yahoo! Video URLs are first rewritten to the canonical
    English-language /watch/ form, then extracted via the playlist XML
    service at cosmos.bcst.yahoo.com.

    NOTE(review): this block was reconstructed from an extraction-garbled
    source; missing guard/try/return lines were restored to match the
    error-handling pattern visible in the sibling extractors — verify
    against upstream.
    """

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            # Re-enter with the canonical /watch/ URL; new_video=False marks
            # the recursion so it happens at most once.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
            return
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':           video_id.decode('utf-8'),
            'url':          video_url,
            'uploader':     video_uploader,
            'upload_date':  None,
            'title':        video_title,
            'ext':          video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Parses the inline player config JSON out of the watch page and picks
    the best available codec/quality combination.

    NOTE(review): reconstructed from an extraction-garbled source; missing
    guard/try/else/break lines were restored — verify against upstream.
    """

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
    IE_NAME = u'vimeo'  # NOTE(review): not visible in the garbled source — confirm

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except:
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader
        video_uploader = config["video"]["owner"]["name"]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_id("description", webpage.decode('utf8'))
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':           video_id,
            'url':          video_url,
            'uploader':     video_uploader,
            'upload_date':  video_upload_date,
            'title':        video_title,
            'ext':          video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Handles both live-stream index pages (matching _LIVE_URL) and regular
    "Plus 7" catch-up pages, chaining several regex scrapes via
    grep_webpage().

    NOTE(review): reconstructed from an extraction-garbled source; several
    regex fragments and return statements were restored — verify against
    upstream, especially the third capture group in extractLiveStream.
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return its content, reporting errors via trouble()."""
        self._downloader.increment_downloads()
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex, and collect the groups named in matchTuples.

        matchTuples is a list of (group_index, key, error_message); each
        matched group is stored under key in the returned dict.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Scrape the live-stream player config chain for the stream URL."""
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            # NOTE(review): third capture group reconstructed — confirm upstream
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Scrape the "Plus 7" page chain and return an info dictionary."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title'),
            'ext':          u'mp4',  # NOTE(review): reconstructed field — confirm
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    First checks whether the URL is a redirect (URL shorteners) and if so
    restarts the download chain on the target; otherwise scrapes the page
    for a direct media URL (JW Player / SWFObject style embeds).

    NOTE(review): reconstructed from an extraction-garbled source; missing
    guard/return lines were restored — verify against upstream.
    """

    _VALID_URL = r'.*'  # NOTE(review): not visible in the garbled source — confirm
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k, v) for k, v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k, v) for k, v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener with HEAD support and the 405 fallback
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        if self._test_redirect(url):
            return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_uploader = mobj.group(1).decode('utf-8')

        return [{
            'id':           video_id.decode('utf-8'),
            'url':          video_url.decode('utf-8'),
            'uploader':     video_uploader,
            'upload_date':  None,
            'title':        video_title,
            'ext':          video_extension.decode('utf-8'),
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Accepts "ytsearch:Q", "ytsearchN:Q" and "ytsearchall:Q" pseudo-URLs and
    queues the first N matching watch URLs on the downloader via the GData
    jsonc API.

    NOTE(review): reconstructed from an extraction-garbled source; missing
    branch/loop lines were restored — verify against upstream.
    """

    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading "ytsearch"
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # GData serves 50 results per page; keep paging until we have
        # enough ids or the service reports fewer total items.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * pagenum) + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Accepts "gvsearch:Q", "gvsearchN:Q" and "gvsearchall:Q" pseudo-URLs and
    queues the first N matching videoplay URLs on the downloader, scraping
    the HTML result pages.

    NOTE(review): reconstructed from an extraction-garbled source; missing
    branch/loop lines were restored — verify against upstream.
    """

    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading "gvsearch"
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum * 10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            # No "next" link means we exhausted the result set
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Accepts "yvsearch:Q", "yvsearchN:Q" and "yvsearchall:Q" pseudo-URLs and
    queues the first N matching watch URLs on the downloader, scraping the
    HTML result pages.

    NOTE(review): reconstructed from an extraction-garbled source; missing
    branch/loop lines were restored — verify against upstream.
    """

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading "yvsearch"
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            # No "next" link means we exhausted the result set
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Walks the paginated playlist pages, collects the watch-URL video ids,
    applies the downloader's playliststart/playlistend window, and queues
    each video on the downloader.

    NOTE(review): reconstructed from an extraction-garbled source; missing
    branch/loop lines were restored — verify against upstream.
    """

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case: group(3) carries an individual video id
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        total = len(video_ids)

        # Apply the user-requested playlist window (1-based start)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    Walks the paginated channel video-list pages, collects all watch-URL
    video ids, and queues each video on the downloader.

    NOTE(review): reconstructed from an extraction-garbled source; missing
    branch/loop lines were restored — verify against upstream.
    """

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download channel pages
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Pages through the user's uploads via the GData API (_GDATA_PAGE_SIZE
    ids per request), applies the downloader's playliststart/playlistend
    window, and queues each video on the downloader.

    NOTE(review): reconstructed from an extraction-garbled source; missing
    branch/loop lines were restored — verify against upstream.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Apply the user-requested playlist window (1-based start)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users (downloads all videos of a user)."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # blip.tv's Ajax endpoint returns at most this many videos per page
    # (see the "limited (currently to 12 videos)" note below).
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract the username from the URL.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        # Fetch the user page once just to scrape the numeric users_id
        # needed by the Ajax episode-list endpoint.
        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # FIX: was str(err); use compat_str like every other error
                # path in this file so py2 unicode errors don't blow up here.
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, de-duplicated within the page.
            ids_in_page = []
            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        # Honour --playlist-start / --playlist-end (1-based, -1 means "to the end").
        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        # Delegate each video to the regular blip.tv extractor.
        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        # (POSTing gateway_result=1 simulates the button click).
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # FIX: pattern was the non-raw string '\s+', an invalid escape
                # sequence under Python 3 — use a raw string for the regex.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension.decode('utf-8'),
        }]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    _available_formats = ['video', 'highqual', 'lowqual']
    # NOTE(review): the contents of this table were lost in this chunk of the
    # file; all qualities map to mp4 upstream — confirm against the original.
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # General data: each key maps to the regex that locates its value.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
            }
        video_info = {}
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls, one per available quality.
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in to Facebook using --username/--password or .netrc, if given."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # Anonymous access; public videos may still work.
            return

        # Log in
        # NOTE(review): the login form fields were lost in this chunk —
        # reconstructed from the upstream source; verify field names.
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means the credentials were rejected.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = compat_urllib_request.urlopen(request)
            video_webpage = page.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image (optional — warn instead of failing)
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date
        upload_date = None
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except:
                    # Best-effort: leave upload_date as None on malformed dates.
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if len(url_map.keys()) > 0:
            # Decide which formats to download
            req_format = self._downloader.params.get('format', None)
            format_limit = self._downloader.params.get('format_limit', None)

            if format_limit is not None and format_limit in self._available_formats:
                format_list = self._available_formats[self._available_formats.index(format_limit):]
            else:
                format_list = self._available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
                return
            if req_format is None:
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format == '-1':
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                if req_format not in url_map:
                    self._downloader.trouble(u'ERROR: requested format not available')
                    return
                video_url_list = [(req_format, url_map[req_format])] # Specific format

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            results.append({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
            })
        return results
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Ask blip.tv for JSON metadata by appending skin=json to the URL.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url.encode('utf-8'))
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Hand the already-open handle to the downloader so the
                    # file is not requested twice.
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
            return
        if info is None: # Regular URL
            try:
                json_code = urlh.read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Responses are sometimes wrapped in a 'Post' envelope.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # blip.tv's CDN serves some files only to known players.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # FIX: was self._download.trouble — a nonexistent attribute that
            # raised AttributeError instead of reporting the invalid URL.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        # The thumbnail link carries the media base URL; the flv lives next to it.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    _VALID_URL = r'^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?P<showname>thedailyshow|colbertnation)\.com/full-episodes/(?P<episode>.*)$'
    IE_NAME = u'comedycentral'

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # NOTE(review): the contents of these two tables were lost in this chunk
    # of the file — reconstructed from the upstream source; verify the values.
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _print_formats(self, formats):
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Shortcut forms like ":tds" map onto the shows' front pages.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url)
            assert mobj is not None

        # No explicit episode means "download the newest one" after redirect.
        dlNewest = not mobj.group('episode')
        if dlNewest:
            epTitle = mobj.group('showname')
        else:
            epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The front page redirects to the newest episode; re-parse the
            # final URL to learn which episode that is.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*episode.*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.
            altMovieParams = re.findall('data-mgid="([^"]*episode.*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
            return

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                    compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, video_url = f, v
                    break

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': None #playerUrl
            }

            results.append(info)

        return results
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Decode with the charset the server declares, falling back to utf-8.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # Scrape metadata from the OpenGraph tags.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # Build the first-fragment URL of the f4m stream from the manifest.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        # NOTE(review): extension lost in this chunk — upstream used 'f4f'
        # for these Adobe HTTP fragments; confirm against the original.
        info['ext'] = 'f4f'
        return [info]
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1).decode('utf-8')

        self.report_webpage(video_id)

        request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # Extract video URL (percent-encoded in the flv_url parameter)
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1).decode('utf-8'))

        # Extract title
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(0).decode('utf-8')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the page URL to the track's numeric API id.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Ask the streams endpoint for the actual mp3 URL.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': info['created_at'],
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com.

    Extracts the rtmpe stream URL (base64-encoded in the page's
    jsclassref attribute), the title and the meta description.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
    IE_NAME = u'infoq'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        self.report_webpage(url)

        request = compat_urllib_request.Request(url)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(url)

        # Extract video URL: the stream name is base64-encoded in jsclassref.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(mobj.group(1).decode('base64'))

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')

        # Derive id and extension from the last path component of the stream.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        Returns the url list for the requested format; when the format
        entry carries per-bitrate sub-dicts, picks the requested (or
        highest) bitrate.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list (None when all fail)."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                # Got one that responds: use it.
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        """Print available formats/bitrates for --list-formats."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe each format until one of its urls actually responds.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats.keys():
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three URL shapes: a specific video page (course+video), a
    course page (expands to all its videos), and the root page
    (expands to all courses).
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage = compat_urllib_request.urlopen(url).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Each linked VideoPage is expanded recursively via self.extract.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results

        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Each linked CoursePage is expanded recursively via self.extract.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com.

    Scrapes mtv_vt/mtv_an/mtvn_uri meta tags plus the playlist id,
    then requests the mediaGen XML to choose a rendition.
    """

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = compat_urllib_request.Request(url)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku serves a video as multiple flv/mp4 segments; the segment
    file ids are derived from an obfuscated fileid plus a per-video
    seed, so one info dict is returned per segment.
    """

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'Youku'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    def _gen_sid(self):
        # Session id: millisecond timestamp plus two random suffixes,
        # mimicking the official player.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministic shuffle of the alphabet driven by the seed (a
        # linear-congruential walk); used to decode the obfuscated fileid.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # Map each '*'-separated index through the seeded mix string.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = config['data'][0]['streamfileids'].keys()

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            seg_number = len(config['data'][0]['segs'][format])

            keys = []
            for i in xrange(seg_number):
                keys.append(config['data'][0]['segs'][format][i]['k'])

            #TODO check error
            #youku only could be viewed from mainland china
        except:
            # NOTE(review): bare except kept — any malformed/blocked response
            # (e.g. outside mainland China) is reported as an info-section error.
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com."""

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    # Page-scraping patterns: flash video url, page title, thumbnail url.
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1).decode('utf-8')

        self.report_webpage(video_id)

        # Get webpage content
        try:
            webpage = compat_urllib_request.urlopen(url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        result = re.search(self.VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group(1).decode('utf-8'))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = result.group(1).decode('utf-8')

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = result.group(1).decode('utf-8')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3449 class GooglePlusIE(InfoExtractor):
3450 """Information extractor
for plus
.google
.com
."""
3452 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
3453 IE_NAME = u'plus.google'
3455 def __init__(self, downloader=None):
3456 InfoExtractor.__init__(self, downloader)
3458 def report_extract_entry(self, url):
3459 """Report downloading extry
"""
3460 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))
3462 def report_date(self, upload_date):
3463 """Report downloading extry
"""
3464 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3466 def report_uploader(self, uploader):
3467 """Report downloading extry
"""
3468 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))
3470 def report_title(self, video_title):
3471 """Report downloading extry
"""
3472 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))
3474 def report_extract_vid_page(self, video_page):
3475 """Report information extraction
."""
3476 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
3478 def _real_extract(self, url):
3479 # Extract id from URL
3480 mobj = re.match(self._VALID_URL, url)
3482 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3485 post_url = mobj.group(0)
3486 video_id = mobj.group(2)
3488 video_extension = 'flv'
3490 # Step 1, Retrieve post webpage to extract further information
3491 self.report_extract_entry(post_url)
3492 request = compat_urllib_request.Request(post_url)
3494 webpage = compat_urllib_request.urlopen(request).read()
3495 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3496 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3499 # Extract update date
3501 pattern = 'title="Timestamp">(.*?)</a>'
3502 mobj = re.search(pattern, webpage)
3504 upload_date = mobj.group(1)
3505 # Convert timestring to a format suitable for filename
3506 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3507 upload_date = upload_date.strftime('%Y%m%d')
3508 self.report_date(upload_date)
3512 pattern = r'rel\="author".*?>(.*?)</a>'
3513 mobj = re.search(pattern, webpage)
3515 uploader = mobj.group(1)
3516 self.report_uploader(uploader)
3519 # Get the first line for title
3521 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3522 mobj = re.search(pattern, webpage)
3524 video_title = mobj.group(1)
3525 self.report_title(video_title)
3527 # Step 2, Stimulate clicking the image box to launch video
3528 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3529 mobj = re.search(pattern, webpage)
3531 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3533 video_page = mobj.group(1)
3534 request = compat_urllib_request.Request(video_page)
3536 webpage = compat_urllib_request.urlopen(request).read()
3537 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3538 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3540 self.report_extract_vid_page(video_page)
3543 # Extract video links on video page
3544 """Extract video links of all sizes
"""
3545 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3546 mobj = re.findall(pattern, webpage)
3548 self._downloader.trouble(u'ERROR: unable to extract video links')
3550 # Sort in resolution
3551 links = sorted(mobj)
3553 # Choose the lowest of the sort, i.e. highest resolution
3554 video_url = links[-1]
3555 # Only get the url. The resolution part in the tuple has no use anymore
3556 video_url = video_url[-1]
3557 # Treat escaped \u0026 style hex
3558 video_url = unicode(video_url, "unicode_escape")
3562 'id': video_id.decode('utf-8'),
3564 'uploader': uploader.decode('utf-8'),
3565 'upload_date': upload_date.decode('utf-8'),
3566 'title': video_title.decode('utf-8'),
3567 'ext': video_extension.decode('utf-8'),