2 # -*- coding: utf-8 -*- 
  15 import xml
.etree
.ElementTree
 
  16 from urlparse 
import parse_qs
 
  19         import cStringIO 
as StringIO
 
  26 class InfoExtractor(object): 
  27         """Information Extractor class. 
  29         Information extractors are the classes that, given a URL, extract 
  30         information from the video (or videos) the URL refers to. This 
  31         information includes the real video URL, the video title and simplified 
  32         title, author and others. The information is stored in a dictionary 
  33         which is then passed to the FileDownloader. The FileDownloader 
  34         processes this information possibly downloading the video to the file 
  35         system, among other possible outcomes. The dictionaries must include 
  40         uploader:       Nickname of the video uploader. 
  42         ext:            Video filename extension. 
  44         player_url:     SWF Player URL (may be None). 
  46         The following fields are optional. Their primary purpose is to allow 
  47         youtube-dl to serve as the backend for a video search function, such 
  48         as the one in youtube2mp3.  They are only used when their respective 
  49         forced printing functions are called: 
  51         thumbnail:      Full URL to a video thumbnail image. 
  52         description:    One-line video description. 
  54         Subclasses of this one should re-define the _real_initialize() and 
  55         _real_extract() methods and define a _VALID_URL regexp. 
  56         Probably, they should also be added to the list of extractors. 
  62         def __init__(self
, downloader
=None): 
  63                 """Constructor. Receives an optional downloader.""" 
  65                 self
.set_downloader(downloader
) 
  67         def suitable(self
, url
): 
  68                 """Receives a URL and returns True if suitable for this IE.""" 
  69                 return re
.match(self
._VALID
_URL
, url
) is not None 
  72                 """Initializes an instance (authentication, etc).""" 
  74                         self
._real
_initialize
() 
  77         def extract(self
, url
): 
  78                 """Extracts URL information and returns it in list of dicts.""" 
  80                 return self
._real
_extract
(url
) 
  82         def set_downloader(self
, downloader
): 
  83                 """Sets the downloader for this IE.""" 
  84                 self
._downloader 
= downloader
 
  86         def _real_initialize(self
): 
  87                 """Real initialization process. Redefine in subclasses.""" 
  90         def _real_extract(self
, url
): 
  91                 """Real extraction process. Redefine in subclasses.""" 
  95 class YoutubeIE(InfoExtractor
): 
  96         """Information extractor for youtube.com.""" 
  98         _VALID_URL 
= r
'^((?:https?://)?(?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|tube.majestyc.net/)(?!view_play_list|my_playlists|artist|playlist)(?:(?:(?:v|embed|e)/)|(?:(?:watch(?:_popup)?(?:\.php)?)?(?:\?|#!?)(?:.+&)?v=))?)?([0-9A-Za-z_-]+)(?(1).+)?$' 
  99         _LANG_URL 
= r
'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' 
 100         _LOGIN_URL 
= 'https://www.youtube.com/signup?next=/&gl=US&hl=en' 
 101         _AGE_URL 
= 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' 
 102         _NEXT_URL_RE 
= r
'[\?&]next_url=([^&]+)' 
 103         _NETRC_MACHINE 
= 'youtube' 
 104         # Listed in order of quality 
 105         _available_formats 
= ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13'] 
 106         _available_formats_prefer_free 
= ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13'] 
 107         _video_extensions 
= { 
 113                 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever 
 119         _video_dimensions 
= { 
	def report_lang(self):
		"""Announce that the interface language is being set."""
		message = u'[youtube] Setting language'
		self._downloader.to_screen(message)
	def report_login(self):
		"""Announce that a login is being attempted."""
		message = u'[youtube] Logging in'
		self._downloader.to_screen(message)
	def report_age_confirmation(self):
		"""Announce that the age gate is being confirmed."""
		message = u'[youtube] Confirming age'
		self._downloader.to_screen(message)
	def report_video_webpage_download(self, video_id):
		"""Announce the download of the watch page for *video_id*."""
		message = u'[youtube] %s: Downloading video webpage' % video_id
		self._downloader.to_screen(message)
	def report_video_info_webpage_download(self, video_id):
		"""Announce the download of the get_video_info page for *video_id*."""
		message = u'[youtube] %s: Downloading video info webpage' % video_id
		self._downloader.to_screen(message)
	def report_video_subtitles_download(self, video_id):
		"""Report attempt to download video subtitles."""
		# Fixed docstring: the original said "Downloading video info webpage",
		# copy-pasted from the method above; this method reports subtitles.
		self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
	def report_information_extraction(self, video_id):
		"""Announce that metadata extraction has started for *video_id*."""
		message = u'[youtube] %s: Extracting video information' % video_id
		self._downloader.to_screen(message)
	def report_unavailable_format(self, video_id, format):
		"""Report that the requested format is not available for the video."""
		# Fixed docstring: the original said "Report extracted video URL.",
		# which describes a different event than what this method prints.
		# NOTE: parameter name 'format' shadows the builtin but is kept for
		# interface compatibility with existing callers.
		self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
	def report_rtmp_download(self):
		"""Tell the user that the download will go over RTMP."""
		message = u'[youtube] RTMP download detected'
		self._downloader.to_screen(message)
	def _closed_captions_xml_to_srt(self, xml_string):
		"""Convert YouTube closed-caption XML into SRT subtitle text.

		Scans ``<text start="..." dur="...">caption</text>`` entries with a
		regex and renders each as a numbered SRT cue.  Captions without an
		explicit duration default to 4 seconds.  Returns the full SRT string.
		"""
		srt = ''  # fixed: accumulator was used (srt +=) without initialization
		texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
		# TODO parse xml instead of regex
		for n, (start, dur_tag, dur, caption) in enumerate(texts):
			if not dur: dur = '4'
			# fixed: regex captures are strings; convert before arithmetic
			start = float(start)
			end = start + float(dur)
			start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
			end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
			caption = unescapeHTML(caption)
			caption = unescapeHTML(caption) # double cycle, intentional
			srt += str(n+1) + '\n'
			srt += start + ' --> ' + end + '\n'
			srt += caption + '\n\n'
		# fixed: the built SRT text was never returned in the mangled copy
		return srt
 190         def _print_formats(self
, formats
): 
 191                 print 'Available formats:' 
 193                         print '%s\t:\t%s\t[%s]' %(x
, self
._video
_extensions
.get(x
, 'flv'), self
._video
_dimensions
.get(x
, '???')) 
 195         def _real_initialize(self
): 
 196                 if self
._downloader 
is None: 
 201                 downloader_params 
= self
._downloader
.params
 
 203                 # Attempt to use provided username and password or .netrc data 
 204                 if downloader_params
.get('username', None) is not None: 
 205                         username 
= downloader_params
['username'] 
 206                         password 
= downloader_params
['password'] 
 207                 elif downloader_params
.get('usenetrc', False): 
 209                                 info 
= netrc
.netrc().authenticators(self
._NETRC
_MACHINE
) 
 214                                         raise netrc
.NetrcParseError('No authenticators for %s' % self
._NETRC
_MACHINE
) 
 215                         except (IOError, netrc
.NetrcParseError
), err
: 
 216                                 self
._downloader
.to_stderr(u
'WARNING: parsing .netrc: %s' % str(err
)) 
 220                 request 
= urllib2
.Request(self
._LANG
_URL
) 
 223                         urllib2
.urlopen(request
).read() 
 224                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
 225                         self
._downloader
.to_stderr(u
'WARNING: unable to set language: %s' % str(err
)) 
 228                 # No authentication to be performed 
 234                                 'current_form': 'loginForm', 
 236                                 'action_login': 'Log In', 
 237                                 'username':     username
, 
 238                                 'password':     password
, 
 240                 request 
= urllib2
.Request(self
._LOGIN
_URL
, urllib
.urlencode(login_form
)) 
 243                         login_results 
= urllib2
.urlopen(request
).read() 
 244                         if re
.search(r
'(?i)<form[^>]* name="loginForm"', login_results
) is not None: 
 245                                 self
._downloader
.to_stderr(u
'WARNING: unable to log in: bad username or password') 
 247                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
 248                         self
._downloader
.to_stderr(u
'WARNING: unable to log in: %s' % str(err
)) 
 254                                 'action_confirm':       'Confirm', 
 256                 request 
= urllib2
.Request(self
._AGE
_URL
, urllib
.urlencode(age_form
)) 
 258                         self
.report_age_confirmation() 
 259                         age_results 
= urllib2
.urlopen(request
).read() 
 260                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
 261                         self
._downloader
.trouble(u
'ERROR: unable to confirm age: %s' % str(err
)) 
 264         def _real_extract(self
, url
): 
 265                 # Extract original video URL from URL with redirection, like age verification, using next_url parameter 
 266                 mobj 
= re
.search(self
._NEXT
_URL
_RE
, url
) 
 268                         url 
= 'http://www.youtube.com/' + urllib
.unquote(mobj
.group(1)).lstrip('/') 
 270                 # Extract video id from URL 
 271                 mobj 
= re
.match(self
._VALID
_URL
, url
) 
 273                         self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
) 
 275                 video_id 
= mobj
.group(2) 
 278                 self
.report_video_webpage_download(video_id
) 
 279                 request 
= urllib2
.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
) 
 281                         video_webpage 
= urllib2
.urlopen(request
).read() 
 282                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
 283                         self
._downloader
.trouble(u
'ERROR: unable to download video webpage: %s' % str(err
)) 
 286                 # Attempt to extract SWF player URL 
 287                 mobj 
= re
.search(r
'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage
) 
 289                         player_url 
= re
.sub(r
'\\(.)', r
'\1', mobj
.group(1)) 
 294                 self
.report_video_info_webpage_download(video_id
) 
 295                 for el_type 
in ['&el=embedded', '&el=detailpage', '&el=vevo', '']: 
 296                         video_info_url 
= ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' 
 297                                         % (video_id
, el_type
)) 
 298                         request 
= urllib2
.Request(video_info_url
) 
 300                                 video_info_webpage 
= urllib2
.urlopen(request
).read() 
 301                                 video_info 
= parse_qs(video_info_webpage
) 
 302                                 if 'token' in video_info
: 
 304                         except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
 305                                 self
._downloader
.trouble(u
'ERROR: unable to download video info webpage: %s' % str(err
)) 
 307                 if 'token' not in video_info
: 
 308                         if 'reason' in video_info
: 
 309                                 self
._downloader
.trouble(u
'ERROR: YouTube said: %s' % video_info
['reason'][0].decode('utf-8')) 
 311                                 self
._downloader
.trouble(u
'ERROR: "token" parameter not in video info for unknown reason') 
 314                 # Check for "rental" videos 
 315                 if 'ypc_video_rental_bar_text' in video_info 
and 'author' not in video_info
: 
 316                         self
._downloader
.trouble(u
'ERROR: "rental" videos not supported') 
 319                 # Start extracting information 
 320                 self
.report_information_extraction(video_id
) 
 323                 if 'author' not in video_info
: 
 324                         self
._downloader
.trouble(u
'ERROR: unable to extract uploader nickname') 
 326                 video_uploader 
= urllib
.unquote_plus(video_info
['author'][0]) 
 329                 if 'title' not in video_info
: 
 330                         self
._downloader
.trouble(u
'ERROR: unable to extract video title') 
 332                 video_title 
= urllib
.unquote_plus(video_info
['title'][0]) 
 333                 video_title 
= video_title
.decode('utf-8') 
 336                 if 'thumbnail_url' not in video_info
: 
 337                         self
._downloader
.trouble(u
'WARNING: unable to extract video thumbnail') 
 339                 else:   # don't panic if we can't find it 
 340                         video_thumbnail 
= urllib
.unquote_plus(video_info
['thumbnail_url'][0]) 
 344                 mobj 
= re
.search(r
'id="eow-date.*?>(.*?)</span>', video_webpage
, re
.DOTALL
) 
 346                         upload_date 
= ' '.join(re
.sub(r
'[/,-]', r
' ', mobj
.group(1)).split()) 
 347                         format_expressions 
= ['%d %B %Y', '%B %d %Y', '%b %d %Y'] 
 348                         for expression 
in format_expressions
: 
 350                                         upload_date 
= datetime
.datetime
.strptime(upload_date
, expression
).strftime('%Y%m%d') 
 355                 video_description 
= get_element_by_id("eow-description", video_webpage
.decode('utf8')) 
 356                 if video_description
: video_description 
= clean_html(video_description
) 
 357                 else: video_description 
= '' 
 360                 video_subtitles 
= None 
 361                 if self
._downloader
.params
.get('writesubtitles', False): 
 363                                 self
.report_video_subtitles_download(video_id
) 
 364                                 request 
= urllib2
.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
) 
 366                                         srt_list 
= urllib2
.urlopen(request
).read() 
 367                                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
 368                                         raise Trouble(u
'WARNING: unable to download video subtitles: %s' % str(err
)) 
 369                                 srt_lang_list 
= re
.findall(r
'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list
) 
 370                                 srt_lang_list 
= dict((l
[1], l
[0]) for l 
in srt_lang_list
) 
 371                                 if not srt_lang_list
: 
 372                                         raise Trouble(u
'WARNING: video has no closed captions') 
 373                                 if self
._downloader
.params
.get('subtitleslang', False): 
 374                                         srt_lang 
= self
._downloader
.params
.get('subtitleslang') 
 375                                 elif 'en' in srt_lang_list
: 
 378                                         srt_lang 
= srt_lang_list
.keys()[0] 
 379                                 if not srt_lang 
in srt_lang_list
: 
 380                                         raise Trouble(u
'WARNING: no closed captions found in the specified language') 
 381                                 request 
= urllib2
.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang
, srt_lang_list
[srt_lang
], video_id
)) 
 383                                         srt_xml 
= urllib2
.urlopen(request
).read() 
 384                                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
 385                                         raise Trouble(u
'WARNING: unable to download video subtitles: %s' % str(err
)) 
 387                                         raise Trouble(u
'WARNING: unable to download video subtitles') 
 388                                 video_subtitles 
= self
._closed
_captions
_xml
_to
_srt
(srt_xml
.decode('utf-8')) 
 389                         except Trouble 
as trouble
: 
 390                                 self
._downloader
.trouble(trouble
[0]) 
 393                 video_token 
= urllib
.unquote_plus(video_info
['token'][0]) 
 395                 # Decide which formats to download 
 396                 req_format 
= self
._downloader
.params
.get('format', None) 
 398                 if 'conn' in video_info 
and video_info
['conn'][0].startswith('rtmp'): 
 399                         self
.report_rtmp_download() 
 400                         video_url_list 
= [(None, video_info
['conn'][0])] 
 401                 elif 'url_encoded_fmt_stream_map' in video_info 
and len(video_info
['url_encoded_fmt_stream_map']) >= 1: 
 402                         url_data_strs 
= video_info
['url_encoded_fmt_stream_map'][0].split(',') 
 403                         url_data 
= [parse_qs(uds
) for uds 
in url_data_strs
] 
 404                         url_data 
= filter(lambda ud
: 'itag' in ud 
and 'url' in ud
, url_data
) 
 405                         url_map 
= dict((ud
['itag'][0], ud
['url'][0] + '&signature=' + ud
['sig'][0]) for ud 
in url_data
) 
 407                         format_limit 
= self
._downloader
.params
.get('format_limit', None) 
 408                         available_formats 
= self
._available
_formats
_prefer
_free 
if self
._downloader
.params
.get('prefer_free_formats', False) else self
._available
_formats
 
 409                         if format_limit 
is not None and format_limit 
in available_formats
: 
 410                                 format_list 
= available_formats
[available_formats
.index(format_limit
):] 
 412                                 format_list 
= available_formats
 
 413                         existing_formats 
= [x 
for x 
in format_list 
if x 
in url_map
] 
 414                         if len(existing_formats
) == 0: 
 415                                 self
._downloader
.trouble(u
'ERROR: no known formats available for video') 
 417                         if self
._downloader
.params
.get('listformats', None): 
 418                                 self
._print
_formats
(existing_formats
) 
 420                         if req_format 
is None or req_format 
== 'best': 
 421                                 video_url_list 
= [(existing_formats
[0], url_map
[existing_formats
[0]])] # Best quality 
 422                         elif req_format 
== 'worst': 
 423                                 video_url_list 
= [(existing_formats
[len(existing_formats
)-1], url_map
[existing_formats
[len(existing_formats
)-1]])] # worst quality 
 424                         elif req_format 
in ('-1', 'all'): 
 425                                 video_url_list 
= [(f
, url_map
[f
]) for f 
in existing_formats
] # All formats 
 427                                 # Specific formats. We pick the first in a slash-delimeted sequence. 
 428                                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'. 
 429                                 req_formats 
= req_format
.split('/') 
 430                                 video_url_list 
= None 
 431                                 for rf 
in req_formats
: 
 433                                                 video_url_list 
= [(rf
, url_map
[rf
])] 
 435                                 if video_url_list 
is None: 
 436                                         self
._downloader
.trouble(u
'ERROR: requested format not available') 
 439                         self
._downloader
.trouble(u
'ERROR: no conn or url_encoded_fmt_stream_map information found in video info') 
 443                 for format_param
, video_real_url 
in video_url_list
: 
 445                         video_extension 
= self
._video
_extensions
.get(format_param
, 'flv') 
 448                                 'id':           video_id
.decode('utf-8'), 
 449                                 'url':          video_real_url
.decode('utf-8'), 
 450                                 'uploader':     video_uploader
.decode('utf-8'), 
 451                                 'upload_date':  upload_date
, 
 452                                 'title':        video_title
, 
 453                                 'ext':          video_extension
.decode('utf-8'), 
 454                                 'format':       (format_param 
is None and u
'NA' or format_param
.decode('utf-8')), 
 455                                 'thumbnail':    video_thumbnail
.decode('utf-8'), 
 456                                 'description':  video_description
, 
 457                                 'player_url':   player_url
, 
 458                                 'subtitles':    video_subtitles
 
 463 class MetacafeIE(InfoExtractor
): 
 464         """Information Extractor for metacafe.com.""" 
 466         _VALID_URL 
= r
'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*' 
 467         _DISCLAIMER 
= 'http://www.metacafe.com/family_filter/' 
 468         _FILTER_POST 
= 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user' 
 469         IE_NAME 
= u
'metacafe' 
	def __init__(self, downloader=None):
		"""Create the extractor, delegating setup to the base class."""
		InfoExtractor.__init__(self, downloader)
	def report_disclaimer(self):
		"""Tell the user the family-filter disclaimer page is being fetched."""
		message = u'[metacafe] Retrieving disclaimer'
		self._downloader.to_screen(message)
	def report_age_confirmation(self):
		"""Announce that the age gate is being confirmed."""
		message = u'[metacafe] Confirming age'
		self._downloader.to_screen(message)
	def report_download_webpage(self, video_id):
		"""Announce the webpage download for *video_id*."""
		message = u'[metacafe] %s: Downloading webpage' % video_id
		self._downloader.to_screen(message)
	def report_extraction(self, video_id):
		"""Announce that metadata extraction has started for *video_id*."""
		message = u'[metacafe] %s: Extracting information' % video_id
		self._downloader.to_screen(message)
 490         def _real_initialize(self
): 
 491                 # Retrieve disclaimer 
 492                 request 
= urllib2
.Request(self
._DISCLAIMER
) 
 494                         self
.report_disclaimer() 
 495                         disclaimer 
= urllib2
.urlopen(request
).read() 
 496                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
 497                         self
._downloader
.trouble(u
'ERROR: unable to retrieve disclaimer: %s' % str(err
)) 
 503                         'submit': "Continue - I'm over 18", 
 505                 request 
= urllib2
.Request(self
._FILTER
_POST
, urllib
.urlencode(disclaimer_form
)) 
 507                         self
.report_age_confirmation() 
 508                         disclaimer 
= urllib2
.urlopen(request
).read() 
 509                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
 510                         self
._downloader
.trouble(u
'ERROR: unable to confirm age: %s' % str(err
)) 
 513         def _real_extract(self
, url
): 
 514                 # Extract id and simplified title from URL 
 515                 mobj 
= re
.match(self
._VALID
_URL
, url
) 
 517                         self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
) 
 520                 video_id 
= mobj
.group(1) 
 522                 # Check if video comes from YouTube 
 523                 mobj2 
= re
.match(r
'^yt-(.*)$', video_id
) 
 524                 if mobj2 
is not None: 
 525                         self
._downloader
.download(['http://www.youtube.com/watch?v=%s' % mobj2
.group(1)]) 
 528                 # Retrieve video webpage to extract further information 
 529                 request 
= urllib2
.Request('http://www.metacafe.com/watch/%s/' % video_id
) 
 531                         self
.report_download_webpage(video_id
) 
 532                         webpage 
= urllib2
.urlopen(request
).read() 
 533                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
 534                         self
._downloader
.trouble(u
'ERROR: unable retrieve video webpage: %s' % str(err
)) 
 537                 # Extract URL, uploader and title from webpage 
 538                 self
.report_extraction(video_id
) 
 539                 mobj 
= re
.search(r
'(?m)&mediaURL=([^&]+)', webpage
) 
 541                         mediaURL 
= urllib
.unquote(mobj
.group(1)) 
 542                         video_extension 
= mediaURL
[-3:] 
 544                         # Extract gdaKey if available 
 545                         mobj 
= re
.search(r
'(?m)&gdaKey=(.*?)&', webpage
) 
 549                                 gdaKey 
= mobj
.group(1) 
 550                                 video_url 
= '%s?__gda__=%s' % (mediaURL
, gdaKey
) 
 552                         mobj 
= re
.search(r
' name="flashvars" value="(.*?)"', webpage
) 
 554                                 self
._downloader
.trouble(u
'ERROR: unable to extract media URL') 
 556                         vardict 
= parse_qs(mobj
.group(1)) 
 557                         if 'mediaData' not in vardict
: 
 558                                 self
._downloader
.trouble(u
'ERROR: unable to extract media URL') 
 560                         mobj 
= re
.search(r
'"mediaURL":"(http.*?)","key":"(.*?)"', vardict
['mediaData'][0]) 
 562                                 self
._downloader
.trouble(u
'ERROR: unable to extract media URL') 
 564                         mediaURL 
= mobj
.group(1).replace('\\/', '/') 
 565                         video_extension 
= mediaURL
[-3:] 
 566                         video_url 
= '%s?__gda__=%s' % (mediaURL
, mobj
.group(2)) 
 568                 mobj 
= re
.search(r
'(?im)<title>(.*) - Video</title>', webpage
) 
 570                         self
._downloader
.trouble(u
'ERROR: unable to extract title') 
 572                 video_title 
= mobj
.group(1).decode('utf-8') 
 574                 mobj 
= re
.search(r
'(?ms)By:\s*<a .*?>(.+?)<', webpage
) 
 576                         self
._downloader
.trouble(u
'ERROR: unable to extract uploader nickname') 
 578                 video_uploader 
= mobj
.group(1) 
 581                         'id':           video_id
.decode('utf-8'), 
 582                         'url':          video_url
.decode('utf-8'), 
 583                         'uploader':     video_uploader
.decode('utf-8'), 
 584                         'upload_date':  u
'NA', 
 585                         'title':        video_title
, 
 586                         'ext':          video_extension
.decode('utf-8'), 
 592 class DailymotionIE(InfoExtractor
): 
 593         """Information Extractor for Dailymotion""" 
 595         _VALID_URL 
= r
'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)' 
 596         IE_NAME 
= u
'dailymotion' 
	def __init__(self, downloader=None):
		"""Create the extractor, delegating setup to the base class."""
		InfoExtractor.__init__(self, downloader)
	def report_download_webpage(self, video_id):
		"""Announce the webpage download for *video_id*."""
		message = u'[dailymotion] %s: Downloading webpage' % video_id
		self._downloader.to_screen(message)
	def report_extraction(self, video_id):
		"""Announce that metadata extraction has started for *video_id*."""
		message = u'[dailymotion] %s: Extracting information' % video_id
		self._downloader.to_screen(message)
 609         def _real_extract(self
, url
): 
 610                 # Extract id and simplified title from URL 
 611                 mobj 
= re
.match(self
._VALID
_URL
, url
) 
 613                         self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
) 
 616                 video_id 
= mobj
.group(1) 
 618                 video_extension 
= 'flv' 
 620                 # Retrieve video webpage to extract further information 
 621                 request 
= urllib2
.Request(url
) 
 622                 request
.add_header('Cookie', 'family_filter=off') 
 624                         self
.report_download_webpage(video_id
) 
 625                         webpage 
= urllib2
.urlopen(request
).read() 
 626                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
 627                         self
._downloader
.trouble(u
'ERROR: unable retrieve video webpage: %s' % str(err
)) 
 630                 # Extract URL, uploader and title from webpage 
 631                 self
.report_extraction(video_id
) 
 632                 mobj 
= re
.search(r
'(?i)addVariable\(\"sequence\"\s*,\s*\"([^\"]+?)\"\)', webpage
) 
 634                         self
._downloader
.trouble(u
'ERROR: unable to extract media URL') 
 636                 sequence 
= urllib
.unquote(mobj
.group(1)) 
 637                 mobj 
= re
.search(r
',\"sdURL\"\:\"([^\"]+?)\",', sequence
) 
 639                         self
._downloader
.trouble(u
'ERROR: unable to extract media URL') 
 641                 mediaURL 
= urllib
.unquote(mobj
.group(1)).replace('\\', '') 
 643                 # if needed add http://www.dailymotion.com/ if relative URL 
 647                 mobj 
= re
.search(r
'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage
) 
 649                         self
._downloader
.trouble(u
'ERROR: unable to extract title') 
 651                 video_title 
= unescapeHTML(mobj
.group('title').decode('utf-8')) 
 653                 mobj 
= re
.search(r
'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a></span>', webpage
) 
 655                         self
._downloader
.trouble(u
'ERROR: unable to extract uploader nickname') 
 657                 video_uploader 
= mobj
.group(1) 
 660                         'id':           video_id
.decode('utf-8'), 
 661                         'url':          video_url
.decode('utf-8'), 
 662                         'uploader':     video_uploader
.decode('utf-8'), 
 663                         'upload_date':  u
'NA', 
 664                         'title':        video_title
, 
 665                         'ext':          video_extension
.decode('utf-8'), 
 671 class GoogleIE(InfoExtractor
): 
 672         """Information extractor for video.google.com.""" 
 674         _VALID_URL 
= r
'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*' 
 675         IE_NAME 
= u
'video.google' 
	def __init__(self, downloader=None):
		"""Create the extractor, delegating setup to the base class."""
		InfoExtractor.__init__(self, downloader)
	def report_download_webpage(self, video_id):
		"""Announce the webpage download for *video_id*."""
		message = u'[video.google] %s: Downloading webpage' % video_id
		self._downloader.to_screen(message)
	def report_extraction(self, video_id):
		"""Announce that metadata extraction has started for *video_id*."""
		message = u'[video.google] %s: Extracting information' % video_id
		self._downloader.to_screen(message)
 688         def _real_extract(self
, url
): 
 689                 # Extract id from URL 
 690                 mobj 
= re
.match(self
._VALID
_URL
, url
) 
 692                         self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
) 
 695                 video_id 
= mobj
.group(1) 
 697                 video_extension 
= 'mp4' 
 699                 # Retrieve video webpage to extract further information 
 700                 request 
= urllib2
.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id
) 
 702                         self
.report_download_webpage(video_id
) 
 703                         webpage 
= urllib2
.urlopen(request
).read() 
 704                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
 705                         self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
)) 
 708                 # Extract URL, uploader, and title from webpage 
 709                 self
.report_extraction(video_id
) 
 710                 mobj 
= re
.search(r
"download_url:'([^']+)'", webpage
) 
 712                         video_extension 
= 'flv' 
 713                         mobj 
= re
.search(r
"(?i)videoUrl\\x3d(.+?)\\x26", webpage
) 
 715                         self
._downloader
.trouble(u
'ERROR: unable to extract media URL') 
 717                 mediaURL 
= urllib
.unquote(mobj
.group(1)) 
 718                 mediaURL 
= mediaURL
.replace('\\x3d', '\x3d') 
 719                 mediaURL 
= mediaURL
.replace('\\x26', '\x26') 
 723                 mobj 
= re
.search(r
'<title>(.*)</title>', webpage
) 
 725                         self
._downloader
.trouble(u
'ERROR: unable to extract title') 
 727                 video_title 
= mobj
.group(1).decode('utf-8') 
 729                 # Extract video description 
 730                 mobj 
= re
.search(r
'<span id=short-desc-content>([^<]*)</span>', webpage
) 
 732                         self
._downloader
.trouble(u
'ERROR: unable to extract video description') 
 734                 video_description 
= mobj
.group(1).decode('utf-8') 
 735                 if not video_description
: 
 736                         video_description 
= 'No description available.' 
 738                 # Extract video thumbnail 
 739                 if self
._downloader
.params
.get('forcethumbnail', False): 
 740                         request 
= urllib2
.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id
))) 
 742                                 webpage 
= urllib2
.urlopen(request
).read() 
 743                         except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
 744                                 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
)) 
 746                         mobj 
= re
.search(r
'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage
) 
 748                                 self
._downloader
.trouble(u
'ERROR: unable to extract video thumbnail') 
 750                         video_thumbnail 
= mobj
.group(1) 
 751                 else:   # we need something to pass to process_info 
 755                         'id':           video_id
.decode('utf-8'), 
 756                         'url':          video_url
.decode('utf-8'), 
 758                         'upload_date':  u
'NA', 
 759                         'title':        video_title
, 
 760                         'ext':          video_extension
.decode('utf-8'), 
 766 class PhotobucketIE(InfoExtractor
): 
 767         """Information extractor for photobucket.com.""" 
 769         _VALID_URL 
= r
'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)' 
 770         IE_NAME 
= u
'photobucket' 
 772         def __init__(self
, downloader
=None): 
 773                 InfoExtractor
.__init
__(self
, downloader
) 
 775         def report_download_webpage(self
, video_id
): 
 776                 """Report webpage download.""" 
 777                 self
._downloader
.to_screen(u
'[photobucket] %s: Downloading webpage' % video_id
) 
 779         def report_extraction(self
, video_id
): 
 780                 """Report information extraction.""" 
 781                 self
._downloader
.to_screen(u
'[photobucket] %s: Extracting information' % video_id
) 
 783         def _real_extract(self
, url
): 
 784                 # Extract id from URL 
 785                 mobj 
= re
.match(self
._VALID
_URL
, url
) 
 787                         self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
) 
 790                 video_id 
= mobj
.group(1) 
 792                 video_extension 
= 'flv' 
 794                 # Retrieve video webpage to extract further information 
 795                 request 
= urllib2
.Request(url
) 
 797                         self
.report_download_webpage(video_id
) 
 798                         webpage 
= urllib2
.urlopen(request
).read() 
 799                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
 800                         self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
)) 
 803                 # Extract URL, uploader, and title from webpage 
 804                 self
.report_extraction(video_id
) 
 805                 mobj 
= re
.search(r
'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage
) 
 807                         self
._downloader
.trouble(u
'ERROR: unable to extract media URL') 
 809                 mediaURL 
= urllib
.unquote(mobj
.group(1)) 
 813                 mobj 
= re
.search(r
'<title>(.*) video by (.*) - Photobucket</title>', webpage
) 
 815                         self
._downloader
.trouble(u
'ERROR: unable to extract title') 
 817                 video_title 
= mobj
.group(1).decode('utf-8') 
 819                 video_uploader 
= mobj
.group(2).decode('utf-8') 
 822                         'id':           video_id
.decode('utf-8'), 
 823                         'url':          video_url
.decode('utf-8'), 
 824                         'uploader':     video_uploader
, 
 825                         'upload_date':  u
'NA', 
 826                         'title':        video_title
, 
 827                         'ext':          video_extension
.decode('utf-8'), 
 833 class YahooIE(InfoExtractor
): 
 834         """Information extractor for video.yahoo.com.""" 
 836         # _VALID_URL matches all Yahoo! Video URLs 
 837         # _VPAGE_URL matches only the extractable '/watch/' URLs 
 838         _VALID_URL 
= r
'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?' 
 839         _VPAGE_URL 
= r
'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?' 
 840         IE_NAME 
= u
'video.yahoo' 
 842         def __init__(self
, downloader
=None): 
 843                 InfoExtractor
.__init
__(self
, downloader
) 
 845         def report_download_webpage(self
, video_id
): 
 846                 """Report webpage download.""" 
 847                 self
._downloader
.to_screen(u
'[video.yahoo] %s: Downloading webpage' % video_id
) 
 849         def report_extraction(self
, video_id
): 
 850                 """Report information extraction.""" 
 851                 self
._downloader
.to_screen(u
'[video.yahoo] %s: Extracting information' % video_id
) 
 853         def _real_extract(self
, url
, new_video
=True): 
 854                 # Extract ID from URL 
 855                 mobj 
= re
.match(self
._VALID
_URL
, url
) 
 857                         self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
) 
 860                 video_id 
= mobj
.group(2) 
 861                 video_extension 
= 'flv' 
 863                 # Rewrite valid but non-extractable URLs as 
 864                 # extractable English language /watch/ URLs 
 865                 if re
.match(self
._VPAGE
_URL
, url
) is None: 
 866                         request 
= urllib2
.Request(url
) 
 868                                 webpage 
= urllib2
.urlopen(request
).read() 
 869                         except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
 870                                 self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
)) 
 873                         mobj 
= re
.search(r
'\("id", "([0-9]+)"\);', webpage
) 
 875                                 self
._downloader
.trouble(u
'ERROR: Unable to extract id field') 
 877                         yahoo_id 
= mobj
.group(1) 
 879                         mobj 
= re
.search(r
'\("vid", "([0-9]+)"\);', webpage
) 
 881                                 self
._downloader
.trouble(u
'ERROR: Unable to extract vid field') 
 883                         yahoo_vid 
= mobj
.group(1) 
 885                         url 
= 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid
, yahoo_id
) 
 886                         return self
._real
_extract
(url
, new_video
=False) 
 888                 # Retrieve video webpage to extract further information 
 889                 request 
= urllib2
.Request(url
) 
 891                         self
.report_download_webpage(video_id
) 
 892                         webpage 
= urllib2
.urlopen(request
).read() 
 893                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
 894                         self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
)) 
 897                 # Extract uploader and title from webpage 
 898                 self
.report_extraction(video_id
) 
 899                 mobj 
= re
.search(r
'<meta name="title" content="(.*)" />', webpage
) 
 901                         self
._downloader
.trouble(u
'ERROR: unable to extract video title') 
 903                 video_title 
= mobj
.group(1).decode('utf-8') 
 905                 mobj 
= re
.search(r
'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage
) 
 907                         self
._downloader
.trouble(u
'ERROR: unable to extract video uploader') 
 909                 video_uploader 
= mobj
.group(1).decode('utf-8') 
 911                 # Extract video thumbnail 
 912                 mobj 
= re
.search(r
'<link rel="image_src" href="(.*)" />', webpage
) 
 914                         self
._downloader
.trouble(u
'ERROR: unable to extract video thumbnail') 
 916                 video_thumbnail 
= mobj
.group(1).decode('utf-8') 
 918                 # Extract video description 
 919                 mobj 
= re
.search(r
'<meta name="description" content="(.*)" />', webpage
) 
 921                         self
._downloader
.trouble(u
'ERROR: unable to extract video description') 
 923                 video_description 
= mobj
.group(1).decode('utf-8') 
 924                 if not video_description
: 
 925                         video_description 
= 'No description available.' 
 927                 # Extract video height and width 
 928                 mobj 
= re
.search(r
'<meta name="video_height" content="([0-9]+)" />', webpage
) 
 930                         self
._downloader
.trouble(u
'ERROR: unable to extract video height') 
 932                 yv_video_height 
= mobj
.group(1) 
 934                 mobj 
= re
.search(r
'<meta name="video_width" content="([0-9]+)" />', webpage
) 
 936                         self
._downloader
.trouble(u
'ERROR: unable to extract video width') 
 938                 yv_video_width 
= mobj
.group(1) 
 940                 # Retrieve video playlist to extract media URL 
 941                 # I'm not completely sure what all these options are, but we 
 942                 # seem to need most of them, otherwise the server sends a 401. 
 943                 yv_lg 
= 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents 
 944                 yv_bitrate 
= '700'  # according to Wikipedia this is hard-coded 
 945                 request 
= urllib2
.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id 
+ 
 946                                 '&tech=flash&mode=playlist&lg=' + yv_lg 
+ '&bitrate=' + yv_bitrate 
+ '&vidH=' + yv_video_height 
+ 
 947                                 '&vidW=' + yv_video_width 
+ '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797') 
 949                         self
.report_download_webpage(video_id
) 
 950                         webpage 
= urllib2
.urlopen(request
).read() 
 951                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
 952                         self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
)) 
 955                 # Extract media URL from playlist XML 
 956                 mobj 
= re
.search(r
'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage
) 
 958                         self
._downloader
.trouble(u
'ERROR: Unable to extract media URL') 
 960                 video_url 
= urllib
.unquote(mobj
.group(1) + mobj
.group(2)).decode('utf-8') 
 961                 video_url 
= unescapeHTML(video_url
) 
 964                         'id':           video_id
.decode('utf-8'), 
 966                         'uploader':     video_uploader
, 
 967                         'upload_date':  u
'NA', 
 968                         'title':        video_title
, 
 969                         'ext':          video_extension
.decode('utf-8'), 
 970                         'thumbnail':    video_thumbnail
.decode('utf-8'), 
 971                         'description':  video_description
, 
 972                         'thumbnail':    video_thumbnail
, 
 977 class VimeoIE(InfoExtractor
): 
 978         """Information extractor for vimeo.com.""" 
 980         # _VALID_URL matches Vimeo URLs 
 981         _VALID_URL 
= r
'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)' 
 984         def __init__(self
, downloader
=None): 
 985                 InfoExtractor
.__init
__(self
, downloader
) 
 987         def report_download_webpage(self
, video_id
): 
 988                 """Report webpage download.""" 
 989                 self
._downloader
.to_screen(u
'[vimeo] %s: Downloading webpage' % video_id
) 
 991         def report_extraction(self
, video_id
): 
 992                 """Report information extraction.""" 
 993                 self
._downloader
.to_screen(u
'[vimeo] %s: Extracting information' % video_id
) 
 995         def _real_extract(self
, url
, new_video
=True): 
 996                 # Extract ID from URL 
 997                 mobj 
= re
.match(self
._VALID
_URL
, url
) 
 999                         self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
) 
1002                 video_id 
= mobj
.group(1) 
1004                 # Retrieve video webpage to extract further information 
1005                 request 
= urllib2
.Request(url
, None, std_headers
) 
1007                         self
.report_download_webpage(video_id
) 
1008                         webpage 
= urllib2
.urlopen(request
).read() 
1009                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
1010                         self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
)) 
1013                 # Now we begin extracting as much information as we can from what we 
1014                 # retrieved. First we extract the information common to all extractors, 
1015                 # and latter we extract those that are Vimeo specific. 
1016                 self
.report_extraction(video_id
) 
1018                 # Extract the config JSON 
1019                 config 
= webpage
.split(' = {config:')[1].split(',assets:')[0] 
1021                         config 
= json
.loads(config
) 
1023                         self
._downloader
.trouble(u
'ERROR: unable to extract info section') 
1027                 video_title 
= config
["video"]["title"] 
1030                 video_uploader 
= config
["video"]["owner"]["name"] 
1032                 # Extract video thumbnail 
1033                 video_thumbnail 
= config
["video"]["thumbnail"] 
1035                 # Extract video description 
1036                 video_description 
= get_element_by_id("description", webpage
.decode('utf8')) 
1037                 if video_description
: video_description 
= clean_html(video_description
) 
1038                 else: video_description 
= '' 
1040                 # Extract upload date 
1041                 video_upload_date 
= u
'NA' 
1042                 mobj 
= re
.search(r
'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage
) 
1043                 if mobj 
is not None: 
1044                         video_upload_date 
= mobj
.group(1) 
1046                 # Vimeo specific: extract request signature and timestamp 
1047                 sig 
= config
['request']['signature'] 
1048                 timestamp 
= config
['request']['timestamp'] 
1050                 # Vimeo specific: extract video codec and quality information 
1051                 # TODO bind to format param 
1052                 codecs 
= [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')] 
1053                 for codec 
in codecs
: 
1054                         if codec
[0] in config
["video"]["files"]: 
1055                                 video_codec 
= codec
[0] 
1056                                 video_extension 
= codec
[1] 
1057                                 if 'hd' in config
["video"]["files"][codec
[0]]: quality 
= 'hd' 
1058                                 else: quality 
= 'sd' 
1061                         self
._downloader
.trouble(u
'ERROR: no known codec found') 
1064                 video_url 
= "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
 
1065                                         %(video_id
, sig
, timestamp
, quality
, video_codec
.upper()) 
1070                         'uploader':     video_uploader
, 
1071                         'upload_date':  video_upload_date
, 
1072                         'title':        video_title
, 
1073                         'ext':          video_extension
, 
1074                         'thumbnail':    video_thumbnail
, 
1075                         'description':  video_description
, 
1080 class GenericIE(InfoExtractor
): 
1081         """Generic last-resort information extractor.""" 
1084         IE_NAME 
= u
'generic' 
1086         def __init__(self
, downloader
=None): 
1087                 InfoExtractor
.__init
__(self
, downloader
) 
1089         def report_download_webpage(self
, video_id
): 
1090                 """Report webpage download.""" 
1091                 self
._downloader
.to_screen(u
'WARNING: Falling back on generic information extractor.') 
1092                 self
._downloader
.to_screen(u
'[generic] %s: Downloading webpage' % video_id
) 
1094         def report_extraction(self
, video_id
): 
1095                 """Report information extraction.""" 
1096                 self
._downloader
.to_screen(u
'[generic] %s: Extracting information' % video_id
) 
1098         def report_following_redirect(self
, new_url
): 
1099                 """Report information extraction.""" 
1100                 self
._downloader
.to_screen(u
'[redirect] Following redirect to %s' % new_url
) 
1102         def _test_redirect(self
, url
): 
1103                 """Check if it is a redirect, like url shorteners, in case restart chain.""" 
1104                 class HeadRequest(urllib2
.Request
): 
1105                         def get_method(self
): 
1108                 class HEADRedirectHandler(urllib2
.HTTPRedirectHandler
): 
1110                         Subclass the HTTPRedirectHandler to make it use our  
1111                         HeadRequest also on the redirected URL 
1113                         def redirect_request(self
, req
, fp
, code
, msg
, headers
, newurl
):  
1114                                 if code 
in (301, 302, 303, 307): 
1115                                         newurl 
= newurl
.replace(' ', '%20')  
1116                                         newheaders 
= dict((k
,v
) for k
,v 
in req
.headers
.items() 
1117                                                                           if k
.lower() not in ("content-length", "content-type")) 
1118                                         return HeadRequest(newurl
,  
1120                                                                            origin_req_host
=req
.get_origin_req_host(),  
1123                                         raise urllib2
.HTTPError(req
.get_full_url(), code
, msg
, headers
, fp
)  
1125                 class HTTPMethodFallback(urllib2
.BaseHandler
): 
1127                         Fallback to GET if HEAD is not allowed (405 HTTP error) 
1129                         def http_error_405(self
, req
, fp
, code
, msg
, headers
):  
1133                                 newheaders 
= dict((k
,v
) for k
,v 
in req
.headers
.items() 
1134                                                                   if k
.lower() not in ("content-length", "content-type")) 
1135                                 return self
.parent
.open(urllib2
.Request(req
.get_full_url(),  
1137                                                                                                  origin_req_host
=req
.get_origin_req_host(),  
1141                 opener 
= urllib2
.OpenerDirector()  
1142                 for handler 
in [urllib2
.HTTPHandler
, urllib2
.HTTPDefaultErrorHandler
, 
1143                                                 HTTPMethodFallback
, HEADRedirectHandler
, 
1144                                                 urllib2
.HTTPErrorProcessor
, urllib2
.HTTPSHandler
]: 
1145                         opener
.add_handler(handler()) 
1147                 response 
= opener
.open(HeadRequest(url
)) 
1148                 new_url 
= response
.geturl() 
1150                 if url 
== new_url
: return False 
1152                 self
.report_following_redirect(new_url
) 
1153                 self
._downloader
.download([new_url
]) 
1156         def _real_extract(self
, url
): 
1157                 if self
._test
_redirect
(url
): return 
1159                 video_id 
= url
.split('/')[-1] 
1160                 request 
= urllib2
.Request(url
) 
1162                         self
.report_download_webpage(video_id
) 
1163                         webpage 
= urllib2
.urlopen(request
).read() 
1164                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
1165                         self
._downloader
.trouble(u
'ERROR: Unable to retrieve video webpage: %s' % str(err
)) 
1167                 except ValueError, err
: 
1168                         # since this is the last-resort InfoExtractor, if 
1169                         # this error is thrown, it'll be thrown here 
1170                         self
._downloader
.trouble(u
'ERROR: Invalid URL: %s' % url
) 
1173                 self
.report_extraction(video_id
) 
1174                 # Start with something easy: JW Player in SWFObject 
1175                 mobj 
= re
.search(r
'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) 
1177                         # Broaden the search a little bit 
1178                         mobj = re.search(r'[^A
-Za
-z0
-9]?
(?
:file|source
)=(http
[^
\'"&]*)', webpage) 
1180                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) 
1183                 # It's possible that one of the regexes 
1184                 # matched, but returned an empty group: 
1185                 if mobj.group(1) is None: 
1186                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url) 
1189                 video_url = urllib.unquote(mobj.group(1)) 
1190                 video_id = os.path.basename(video_url) 
1192                 # here's a fun little line of code for you: 
1193                 video_extension = os.path.splitext(video_id)[1][1:] 
1194                 video_id = os.path.splitext(video_id)[0] 
1196                 # it's tempting to parse this further, but you would 
1197                 # have to take into account all the variations like 
1198                 #   Video Title - Site Name 
1199                 #   Site Name | Video Title 
1200                 #   Video Title - Tagline | Site Name 
1201                 # and so on and so forth; it's just not practical 
1202                 mobj = re.search(r'<title>(.*)</title>', webpage) 
1204                         self._downloader.trouble(u'ERROR: unable to extract title') 
1206                 video_title = mobj.group(1).decode('utf-8') 
1208                 # video uploader is domain name 
1209                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url) 
1211                         self._downloader.trouble(u'ERROR: unable to extract title') 
1213                 video_uploader = mobj.group(1).decode('utf-8') 
1216                         'id':           video_id.decode('utf-8'), 
1217                         'url':          video_url.decode('utf-8'), 
1218                         'uploader':     video_uploader, 
1219                         'upload_date':  u'NA', 
1220                         'title':        video_title, 
1221                         'ext':          video_extension.decode('utf-8'), 
1227 class YoutubeSearchIE(InfoExtractor): 
1228         """Information Extractor for YouTube search queries.""" 
1229         _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+' 
1230         _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc' 
1231         _max_youtube_results = 1000 
1232         IE_NAME = u'youtube:search' 
1234         def __init__(self, downloader=None): 
1235                 InfoExtractor.__init__(self, downloader) 
1237         def report_download_page(self, query, pagenum): 
1238                 """Report attempt to download search page with given number.""" 
1239                 query = query.decode(preferredencoding()) 
1240                 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum)) 
1242         def _real_extract(self, query): 
1243                 mobj = re.match(self._VALID_URL, query) 
1245                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query) 
1248                 prefix, query = query.split(':') 
1250                 query = query.encode('utf-8') 
1252                         self._download_n_results(query, 1) 
1254                 elif prefix == 'all': 
1255                         self._download_n_results(query, self._max_youtube_results) 
1261                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query)) 
1263                                 elif n > self._max_youtube_results: 
1264                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n)) 
1265                                         n = self._max_youtube_results 
1266                                 self._download_n_results(query, n) 
1268                         except ValueError: # parsing prefix as integer fails 
1269                                 self._download_n_results(query, 1) 
1272         def _download_n_results(self, query, n): 
1273                 """Downloads a specified number of results for a query""" 
1279                 while (50 * pagenum) < limit: 
1280                         self.report_download_page(query, pagenum+1) 
1281                         result_url = self._API_URL % (urllib.quote_plus(query), (50*pagenum)+1) 
1282                         request = urllib2.Request(result_url) 
1284                                 data = urllib2.urlopen(request).read() 
1285                         except (urllib2.URLError, httplib.HTTPException, socket.error), err: 
1286                                 self._downloader.trouble(u'ERROR: unable to download API page: %s' % str(err)) 
1288                         api_response = json.loads(data)['data'] 
1290                         new_ids = list(video['id'] for video in api_response['items']) 
1291                         video_ids += new_ids 
1293                         limit = min(n, api_response['totalItems']) 
1296                 if len(video_ids) > n: 
1297                         video_ids = video_ids[:n] 
1298                 for id in video_ids: 
1299                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id]) 
1303 class GoogleSearchIE(InfoExtractor): 
1304         """Information Extractor for Google Video search queries.""" 
1305         _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+' 
1306         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en' 
1307         _VIDEO_INDICATOR = r'<a href="http
://video\
.google\
.com
/videoplay
\?docid
=([^
"\&]+)' 
1308         _MORE_PAGES_INDICATOR = r'class="pn
" id="pnnext
"' 
1309         _max_google_results = 1000 
1310         IE_NAME = u'video.google:search' 
1312         def __init__(self, downloader=None): 
1313                 InfoExtractor.__init__(self, downloader) 
1315         def report_download_page(self, query, pagenum): 
1316                 """Report attempt to download playlist page with given number.""" 
1317                 query = query.decode(preferredencoding()) 
1318                 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum)) 
1320         def _real_extract(self, query): 
1321                 mobj = re.match(self._VALID_URL, query) 
1323                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query) 
1326                 prefix, query = query.split(':') 
1328                 query = query.encode('utf-8') 
1330                         self._download_n_results(query, 1) 
1332                 elif prefix == 'all': 
1333                         self._download_n_results(query, self._max_google_results) 
1339                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query)) 
1341                                 elif n > self._max_google_results: 
1342                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n)) 
1343                                         n = self._max_google_results 
1344                                 self._download_n_results(query, n) 
1346                         except ValueError: # parsing prefix as integer fails 
1347                                 self._download_n_results(query, 1) 
1350         def _download_n_results(self, query, n): 
1351                 """Downloads a specified number of results for a query""" 
1357                         self.report_download_page(query, pagenum) 
1358                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum*10) 
1359                         request = urllib2.Request(result_url) 
1361                                 page = urllib2.urlopen(request).read() 
1362                         except (urllib2.URLError, httplib.HTTPException, socket.error), err: 
1363                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err)) 
1366                         # Extract video identifiers 
1367                         for mobj in re.finditer(self._VIDEO_INDICATOR, page): 
1368                                 video_id = mobj.group(1) 
1369                                 if video_id not in video_ids: 
1370                                         video_ids.append(video_id) 
1371                                         if len(video_ids) == n: 
1372                                                 # Specified n videos reached 
1373                                                 for id in video_ids: 
1374                                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id]) 
1377                         if re.search(self._MORE_PAGES_INDICATOR, page) is None: 
1378                                 for id in video_ids: 
1379                                         self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id]) 
1382                         pagenum = pagenum + 1 
1385 class YahooSearchIE(InfoExtractor): 
1386         """Information Extractor for Yahoo! Video search queries.""" 
1387         _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+' 
1388         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s' 
1389         _VIDEO_INDICATOR = r'href="http
://video\
.yahoo\
.com
/watch
/([0-9]+/[0-9]+)"' 
1390         _MORE_PAGES_INDICATOR = r'\s*Next' 
1391         _max_yahoo_results = 1000 
1392         IE_NAME = u'video.yahoo:search' 
1394         def __init__(self, downloader=None): 
1395                 InfoExtractor.__init__(self, downloader) 
1397         def report_download_page(self, query, pagenum): 
1398                 """Report attempt to download playlist page with given number.""" 
1399                 query = query.decode(preferredencoding()) 
1400                 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum)) 
1402         def _real_extract(self, query): 
1403                 mobj = re.match(self._VALID_URL, query) 
1405                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query) 
1408                 prefix, query = query.split(':') 
1410                 query = query.encode('utf-8') 
1412                         self._download_n_results(query, 1) 
1414                 elif prefix == 'all': 
1415                         self._download_n_results(query, self._max_yahoo_results) 
1421                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query)) 
1423                                 elif n > self._max_yahoo_results: 
1424                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n)) 
1425                                         n = self._max_yahoo_results 
1426                                 self._download_n_results(query, n) 
1428                         except ValueError: # parsing prefix as integer fails 
1429                                 self._download_n_results(query, 1) 
1432         def _download_n_results(self, query, n): 
1433                 """Downloads a specified number of results for a query""" 
1436                 already_seen = set() 
1440                         self.report_download_page(query, pagenum) 
1441                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum) 
1442                         request = urllib2.Request(result_url) 
1444                                 page = urllib2.urlopen(request).read() 
1445                         except (urllib2.URLError, httplib.HTTPException, socket.error), err: 
1446                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err)) 
1449                         # Extract video identifiers 
1450                         for mobj in re.finditer(self._VIDEO_INDICATOR, page): 
1451                                 video_id = mobj.group(1) 
1452                                 if video_id not in already_seen: 
1453                                         video_ids.append(video_id) 
1454                                         already_seen.add(video_id) 
1455                                         if len(video_ids) == n: 
1456                                                 # Specified n videos reached 
1457                                                 for id in video_ids: 
1458                                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id]) 
1461                         if re.search(self._MORE_PAGES_INDICATOR, page) is None: 
1462                                 for id in video_ids: 
1463                                         self._downloader.download(['http://video.yahoo.com/watch/%s' % id]) 
1466                         pagenum = pagenum + 1 
1469 class YoutubePlaylistIE(InfoExtractor): 
1470         """Information Extractor for YouTube playlists.""" 
1472         _VALID_URL = r'(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL)?([0-9A-Za-z-_]+)(?:/.*?/([0-9A-Za-z_-]+))?.*' 
1473         _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en' 
1474         _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&list=(PL)?%s&' 
1475         _MORE_PAGES_INDICATOR = r'yt-uix-pager-next' 
1476         IE_NAME = u'youtube:playlist' 
1478         def __init__(self, downloader=None): 
1479                 InfoExtractor.__init__(self, downloader) 
1481         def report_download_page(self, playlist_id, pagenum): 
1482                 """Report attempt to download playlist page with given number.""" 
1483                 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum)) 
1485         def _real_extract(self, url): 
1486                 # Extract playlist id 
1487                 mobj = re.match(self._VALID_URL, url) 
1489                         self._downloader.trouble(u'ERROR: invalid url: %s' % url) 
1493                 if mobj.group(3) is not None: 
1494                         self._downloader.download([mobj.group(3)]) 
1497                 # Download playlist pages 
1498                 # prefix is 'p' as default for playlists but there are other types that need extra care 
1499                 playlist_prefix = mobj.group(1) 
1500                 if playlist_prefix == 'a': 
1501                         playlist_access = 'artist' 
1503                         playlist_prefix = 'p' 
1504                         playlist_access = 'view_play_list' 
1505                 playlist_id = mobj.group(2) 
1510                         self.report_download_page(playlist_id, pagenum) 
1511                         url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum) 
1512                         request = urllib2.Request(url) 
1514                                 page = urllib2.urlopen(request).read() 
1515                         except (urllib2.URLError, httplib.HTTPException, socket.error), err: 
1516                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err)) 
1519                         # Extract video identifiers 
1521                         for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page): 
1522                                 if mobj.group(1) not in ids_in_page: 
1523                                         ids_in_page.append(mobj.group(1)) 
1524                         video_ids.extend(ids_in_page) 
1526                         if re.search(self._MORE_PAGES_INDICATOR, page) is None: 
1528                         pagenum = pagenum + 1 
1530                 playliststart = self._downloader.params.get('playliststart', 1) - 1 
1531                 playlistend = self._downloader.params.get('playlistend', -1) 
1532                 if playlistend == -1: 
1533                         video_ids = video_ids[playliststart:] 
1535                         video_ids = video_ids[playliststart:playlistend] 
1537                 for id in video_ids: 
1538                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % id]) 
1542 class YoutubeUserIE(InfoExtractor): 
1543         """Information Extractor for YouTube users.""" 
1545         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)' 
1546         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s' 
1547         _GDATA_PAGE_SIZE = 50 
1548         _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d' 
1549         _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]' 
1550         IE_NAME = u'youtube:user' 
1552         def __init__(self, downloader=None): 
1553                 InfoExtractor.__init__(self, downloader) 
1555         def report_download_page(self, username, start_index): 
1556                 """Report attempt to download user page.""" 
1557                 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' % 
1558                                 (username, start_index, start_index + self._GDATA_PAGE_SIZE)) 
1560         def _real_extract(self, url): 
1562                 mobj = re.match(self._VALID_URL, url) 
1564                         self._downloader.trouble(u'ERROR: invalid url: %s' % url) 
1567                 username = mobj.group(1) 
1569                 # Download video ids using YouTube Data API. Result size per 
1570                 # query is limited (currently to 50 videos) so we need to query 
1571                 # page by page until there are no video ids - it means we got 
1578                         start_index = pagenum * self._GDATA_PAGE_SIZE + 1 
1579                         self.report_download_page(username, start_index) 
1581                         request = urllib2.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)) 
1584                                 page = urllib2.urlopen(request).read() 
1585                         except (urllib2.URLError, httplib.HTTPException, socket.error), err: 
1586                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err)) 
1589                         # Extract video identifiers 
1592                         for mobj in re.finditer(self._VIDEO_INDICATOR, page): 
1593                                 if mobj.group(1) not in ids_in_page: 
1594                                         ids_in_page.append(mobj.group(1)) 
1596                         video_ids.extend(ids_in_page) 
1598                         # A little optimization - if current page is not 
1599                         # "full
", ie. does not contain PAGE_SIZE video ids then 
1600                         # we can assume that this page is the last one - there 
1601                         # are no more ids on further pages - no need to query 
1604                         if len(ids_in_page) < self._GDATA_PAGE_SIZE: 
1609                 all_ids_count = len(video_ids) 
1610                 playliststart = self._downloader.params.get('playliststart', 1) - 1 
1611                 playlistend = self._downloader.params.get('playlistend', -1) 
1613                 if playlistend == -1: 
1614                         video_ids = video_ids[playliststart:] 
1616                         video_ids = video_ids[playliststart:playlistend] 
1618                 self._downloader.to_screen(u"[youtube
] user 
%s: Collected 
%d video 
ids (downloading 
%d of them
)" % 
1619                                 (username, all_ids_count, len(video_ids))) 
1621                 for video_id in video_ids: 
1622                         self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id]) 
1625 class BlipTVUserIE(InfoExtractor): 
1626         """Information Extractor for blip.tv users.""" 
1628         _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$' 
1630         IE_NAME = u'blip.tv:user' 
1632         def __init__(self, downloader=None): 
1633                 InfoExtractor.__init__(self, downloader) 
1635         def report_download_page(self, username, pagenum): 
1636                 """Report attempt to download user page.""" 
1637                 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' % 
1638                                 (self.IE_NAME, username, pagenum)) 
1640         def _real_extract(self, url): 
1642                 mobj = re.match(self._VALID_URL, url) 
1644                         self._downloader.trouble(u'ERROR: invalid url: %s' % url) 
1647                 username = mobj.group(1) 
1649                 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1' 
1651                 request = urllib2.Request(url) 
1654                         page = urllib2.urlopen(request).read().decode('utf-8') 
1655                         mobj = re.search(r'data-users-id="([^
"]+)"', page) 
1656                         page_base = page_base % mobj.group(1) 
1657                 except (urllib2.URLError, httplib.HTTPException, socket.error), err: 
1658                         self._downloader.trouble(u'ERROR
: unable to download webpage
: %s' % str(err)) 
1662                 # Download video ids using BlipTV Ajax calls. Result size per 
1663                 # query is limited (currently to 12 videos) so we need to query 
1664                 # page by page until there are no video ids - it means we got 
1671                         self.report_download_page(username, pagenum) 
1673                         request = urllib2.Request( page_base + "&page=" + str(pagenum) ) 
1676                                 page = urllib2.urlopen(request).read().decode('utf
-8') 
1677                         except (urllib2.URLError, httplib.HTTPException, socket.error), err: 
1678                                 self._downloader.trouble(u'ERROR
: unable to download webpage
: %s' % str(err)) 
1681                         # Extract video identifiers 
1684                         for mobj in re.finditer(r'href
="/([^"]+)"', page): 
1685                                 if mobj.group(1) not in ids_in_page: 
1686                                         ids_in_page.append(unescapeHTML(mobj.group(1))) 
1688                         video_ids.extend(ids_in_page) 
1690                         # A little optimization - if current page is not 
1691                         # "full
", ie. does not contain PAGE_SIZE video ids then 
1692                         # we can assume that this page is the last one - there 
1693                         # are no more ids on further pages - no need to query 
1696                         if len(ids_in_page) < self._PAGE_SIZE: 
1701                 all_ids_count = len(video_ids) 
1702                 playliststart = self._downloader.params.get('playliststart', 1) - 1 
1703                 playlistend = self._downloader.params.get('playlistend', -1) 
1705                 if playlistend == -1: 
1706                         video_ids = video_ids[playliststart:] 
1708                         video_ids = video_ids[playliststart:playlistend] 
1710                 self._downloader.to_screen(u"[%s] user 
%s: Collected 
%d video 
ids (downloading 
%d of them
)" % 
1711                                 (self.IE_NAME, username, all_ids_count, len(video_ids))) 
1713                 for video_id in video_ids: 
1714                         self._downloader.download([u'http://blip.tv/'+video_id]) 
1717 class DepositFilesIE(InfoExtractor): 
1718         """Information extractor for depositfiles.com""" 
1720         _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)' 
1721         IE_NAME = u'DepositFiles' 
1723         def __init__(self, downloader=None): 
1724                 InfoExtractor.__init__(self, downloader) 
1726         def report_download_webpage(self, file_id): 
1727                 """Report webpage download.""" 
1728                 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id) 
1730         def report_extraction(self, file_id): 
1731                 """Report information extraction.""" 
1732                 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id) 
1734         def _real_extract(self, url): 
1735                 file_id = url.split('/')[-1] 
1736                 # Rebuild url in english locale 
1737                 url = 'http://depositfiles.com/en/files/' + file_id 
1739                 # Retrieve file webpage with 'Free download' button pressed 
1740                 free_download_indication = { 'gateway_result' : '1' } 
1741                 request = urllib2.Request(url, urllib.urlencode(free_download_indication)) 
1743                         self.report_download_webpage(file_id) 
1744                         webpage = urllib2.urlopen(request).read() 
1745                 except (urllib2.URLError, httplib.HTTPException, socket.error), err: 
1746                         self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % str(err)) 
1749                 # Search for the real file URL 
1750                 mobj = re.search(r'<form action="(http
://fileshare
.+?
)"', webpage) 
1751                 if (mobj is None) or (mobj.group(1) is None): 
1752                         # Try to figure out reason of the error. 
1753                         mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL) 
1754                         if (mobj is not None) and (mobj.group(1) is not None): 
1755                                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip() 
1756                                 self._downloader.trouble(u'ERROR: %s' % restriction_message) 
1758                                 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url) 
1761                 file_url = mobj.group(1) 
1762                 file_extension = os.path.splitext(file_url)[1][1:] 
1764                 # Search for file title 
1765                 mobj = re.search(r'<b title="(.*?
)">', webpage) 
1767                         self._downloader.trouble(u'ERROR: unable to extract title') 
1769                 file_title = mobj.group(1).decode('utf-8') 
1772                         'id':           file_id.decode('utf-8'), 
1773                         'url':          file_url.decode('utf-8'), 
1775                         'upload_date':  u'NA', 
1776                         'title':        file_title, 
1777                         'ext':          file_extension.decode('utf-8'), 
1783 class FacebookIE(InfoExtractor): 
1784         """Information Extractor for Facebook""" 
1786         _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)' 
1787         _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&' 
1788         _NETRC_MACHINE = 'facebook' 
1789         _available_formats = ['video', 'highqual', 'lowqual'] 
1790         _video_extensions = { 
1795         IE_NAME = u'facebook' 
1797         def __init__(self, downloader=None): 
1798                 InfoExtractor.__init__(self, downloader) 
1800         def _reporter(self, message): 
1801                 """Add header and report message.""" 
1802                 self._downloader.to_screen(u'[facebook] %s' % message) 
1804         def report_login(self): 
1805                 """Report attempt to log in.""" 
1806                 self._reporter(u'Logging in') 
1808         def report_video_webpage_download(self, video_id): 
1809                 """Report attempt to download video webpage.""" 
1810                 self._reporter(u'%s: Downloading video webpage' % video_id) 
1812         def report_information_extraction(self, video_id): 
1813                 """Report attempt to extract video information.""" 
1814                 self._reporter(u'%s: Extracting video information' % video_id) 
1816         def _parse_page(self, video_webpage): 
1817                 """Extract video information from page""" 
1819                 data = {'title': r'\("video_title
", "(.*?
)"\)', 
1820                         'description': r'<div class="datawrap
">(.*?)</div>', 
1821                         'owner': r'\("video_owner_name
", "(.*?
)"\)', 
1822                         'thumbnail':  r'\("thumb_url
", "(?P
<THUMB
>.*?
)"\)', 
1825                 for piece in data.keys(): 
1826                         mobj = re.search(data[piece], video_webpage) 
1827                         if mobj is not None: 
1828                                 video_info[piece] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape
")) 
1832                 for fmt in self._available_formats: 
1833                         mobj = re.search(r'\("%s_src
\", "(.+?)"\
)' % fmt, video_webpage) 
1834                         if mobj is not None: 
1835                                 # URL is in a Javascript segment inside an escaped Unicode format within 
1836                                 # the generally utf-8 page 
1837                                 video_urls[fmt] = urllib.unquote_plus(mobj.group(1).decode("unicode_escape")) 
1838                 video_info['video_urls
'] = video_urls 
1842         def _real_initialize(self): 
1843                 if self._downloader is None: 
1848                 downloader_params = self._downloader.params 
1850                 # Attempt to use provided username and password or .netrc data 
1851                 if downloader_params.get('username
', None) is not None: 
1852                         useremail = downloader_params['username
'] 
1853                         password = downloader_params['password
'] 
1854                 elif downloader_params.get('usenetrc
', False): 
1856                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE) 
1857                                 if info is not None: 
1861                                         raise netrc.NetrcParseError('No authenticators 
for %s' % self._NETRC_MACHINE) 
1862                         except (IOError, netrc.NetrcParseError), err: 
1863                                 self._downloader.to_stderr(u'WARNING
: parsing 
.netrc
: %s' % str(err)) 
1866                 if useremail is None: 
1875                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form)) 
1878                         login_results = urllib2.urlopen(request).read() 
1879                         if re.search(r'<form(.*)name
="login"(.*)</form
>', login_results) is not None: 
1880                                 self._downloader.to_stderr(u'WARNING
: unable to log 
in: bad username
/password
, or exceded login rate 
limit (~
3/min). Check credentials 
or wait
.') 
1882                 except (urllib2.URLError, httplib.HTTPException, socket.error), err: 
1883                         self._downloader.to_stderr(u'WARNING
: unable to log 
in: %s' % str(err)) 
1886         def _real_extract(self, url): 
1887                 mobj = re.match(self._VALID_URL, url) 
1889                         self._downloader.trouble(u'ERROR
: invalid URL
: %s' % url) 
1891                 video_id = mobj.group('ID
') 
1894                 self.report_video_webpage_download(video_id) 
1895                 request = urllib2.Request('https
://www
.facebook
.com
/video
/video
.php?v
=%s' % video_id) 
1897                         page = urllib2.urlopen(request) 
1898                         video_webpage = page.read() 
1899                 except (urllib2.URLError, httplib.HTTPException, socket.error), err: 
1900                         self._downloader.trouble(u'ERROR
: unable to download video webpage
: %s' % str(err)) 
1903                 # Start extracting information 
1904                 self.report_information_extraction(video_id) 
1906                 # Extract information 
1907                 video_info = self._parse_page(video_webpage) 
1910                 if 'owner
' not in video_info: 
1911                         self._downloader.trouble(u'ERROR
: unable to extract uploader nickname
') 
1913                 video_uploader = video_info['owner
'] 
1916                 if 'title
' not in video_info: 
1917                         self._downloader.trouble(u'ERROR
: unable to extract video title
') 
1919                 video_title = video_info['title
'] 
1920                 video_title = video_title.decode('utf
-8') 
1923                 if 'thumbnail
' not in video_info: 
1924                         self._downloader.trouble(u'WARNING
: unable to extract video thumbnail
') 
1925                         video_thumbnail = '' 
1927                         video_thumbnail = video_info['thumbnail
'] 
1931                 if 'upload_date
' in video_info: 
1932                         upload_time = video_info['upload_date
'] 
1933                         timetuple = email.utils.parsedate_tz(upload_time) 
1934                         if timetuple is not None: 
1936                                         upload_date = time.strftime('%Y
%m
%d', timetuple[0:9]) 
1941                 video_description = video_info.get('description
', 'No description available
.') 
1943                 url_map = video_info['video_urls
'] 
1944                 if len(url_map.keys()) > 0: 
1945                         # Decide which formats to download 
1946                         req_format = self._downloader.params.get('format
', None) 
1947                         format_limit = self._downloader.params.get('format_limit
', None) 
1949                         if format_limit is not None and format_limit in self._available_formats: 
1950                                 format_list = self._available_formats[self._available_formats.index(format_limit):] 
1952                                 format_list = self._available_formats 
1953                         existing_formats = [x for x in format_list if x in url_map] 
1954                         if len(existing_formats) == 0: 
1955                                 self._downloader.trouble(u'ERROR
: no known formats available 
for video
') 
1957                         if req_format is None: 
1958                                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality 
1959                         elif req_format == 'worst
': 
1960                                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality 
1961                         elif req_format == '-1': 
1962                                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats 
1965                                 if req_format not in url_map: 
1966                                         self._downloader.trouble(u'ERROR
: requested format 
not available
') 
1968                                 video_url_list = [(req_format, url_map[req_format])] # Specific format 
1971                 for format_param, video_real_url in video_url_list: 
1973                         video_extension = self._video_extensions.get(format_param, 'mp4
') 
1976                                 'id':           video_id.decode('utf
-8'), 
1977                                 'url
':          video_real_url.decode('utf
-8'), 
1978                                 'uploader
':     video_uploader.decode('utf
-8'), 
1979                                 'upload_date
':  upload_date, 
1980                                 'title
':        video_title, 
1981                                 'ext
':          video_extension.decode('utf
-8'), 
1982                                 'format
':       (format_param is None and u'NA
' or format_param.decode('utf
-8')), 
1983                                 'thumbnail
':    video_thumbnail.decode('utf
-8'), 
1984                                 'description
':  video_description.decode('utf
-8'), 
1989 class BlipTVIE(InfoExtractor): 
1990         """Information extractor for blip.tv""" 
1992         _VALID_URL = r'^
(?
:https?
://)?
(?
:\w
+\
.)?blip\
.tv(/.+)$
' 
1993         _URL_EXT = r'^
.*\
.([a
-z0
-9]+)$
' 
1994         IE_NAME = u'blip
.tv
' 
1996         def report_extraction(self, file_id): 
1997                 """Report information extraction.""" 
1998                 self._downloader.to_screen(u'[%s] %s: Extracting information
' % (self.IE_NAME, file_id)) 
2000         def report_direct_download(self, title): 
2001                 """Report information extraction.""" 
2002                 self._downloader.to_screen(u'[%s] %s: Direct download detected
' % (self.IE_NAME, title)) 
2004         def _real_extract(self, url): 
2005                 mobj = re.match(self._VALID_URL, url) 
2007                         self._downloader.trouble(u'ERROR
: invalid URL
: %s' % url) 
2014                 json_url = url + cchar + 'skin
=json
&version
=2&no_wrap
=1' 
2015                 request = urllib2.Request(json_url.encode('utf
-8')) 
2016                 self.report_extraction(mobj.group(1)) 
2019                         urlh = urllib2.urlopen(request) 
2020                         if urlh.headers.get('Content
-Type
', '').startswith('video
/'): # Direct download 
2021                                 basename = url.split('/')[-1] 
2022                                 title,ext = os.path.splitext(basename) 
2023                                 title = title.decode('UTF
-8') 
2024                                 ext = ext.replace('.', '') 
2025                                 self.report_direct_download(title) 
2033                 except (urllib2.URLError, httplib.HTTPException, socket.error), err: 
2034                         self._downloader.trouble(u'ERROR
: unable to download video info webpage
: %s' % str(err)) 
2036                 if info is None: # Regular URL 
2038                                 json_code = urlh.read() 
2039                         except (urllib2.URLError, httplib.HTTPException, socket.error), err: 
2040                                 self._downloader.trouble(u'ERROR
: unable to read video info webpage
: %s' % str(err)) 
2044                                 json_data = json.loads(json_code) 
2045                                 if 'Post
' in json_data: 
2046                                         data = json_data['Post
'] 
2050                                 upload_date = datetime.datetime.strptime(data['datestamp
'], '%m
-%d-%y 
%H
:%M
%p
').strftime('%Y
%m
%d') 
2051                                 video_url = data['media
']['url
'] 
2052                                 umobj = re.match(self._URL_EXT, video_url) 
2054                                         raise ValueError('Can 
not determine filename extension
') 
2055                                 ext = umobj.group(1) 
2058                                         'id': data['item_id
'], 
2060                                         'uploader
': data['display_name
'], 
2061                                         'upload_date
': upload_date, 
2062                                         'title
': data['title
'], 
2064                                         'format
': data['media
']['mimeType
'], 
2065                                         'thumbnail
': data['thumbnailUrl
'], 
2066                                         'description
': data['description
'], 
2067                                         'player_url
': data['embedUrl
'] 
2069                         except (ValueError,KeyError), err: 
2070                                 self._downloader.trouble(u'ERROR
: unable to parse video information
: %s' % repr(err)) 
2073                 std_headers['User
-Agent
'] = 'iTunes
/10.6.1' 
2077 class MyVideoIE(InfoExtractor): 
2078         """Information Extractor for myvideo.de.""" 
2080         _VALID_URL = r'(?
:http
://)?
(?
:www\
.)?myvideo\
.de
/watch
/([0-9]+)/([^?
/]+).*' 
2081         IE_NAME = u'myvideo
' 
2083         def __init__(self, downloader=None): 
2084                 InfoExtractor.__init__(self, downloader) 
2086         def report_download_webpage(self, video_id): 
2087                 """Report webpage download.""" 
2088                 self._downloader.to_screen(u'[myvideo
] %s: Downloading webpage
' % video_id) 
2090         def report_extraction(self, video_id): 
2091                 """Report information extraction.""" 
2092                 self._downloader.to_screen(u'[myvideo
] %s: Extracting information
' % video_id) 
2094         def _real_extract(self,url): 
2095                 mobj = re.match(self._VALID_URL, url) 
2097                         self._download.trouble(u'ERROR
: invalid URL
: %s' % url) 
2100                 video_id = mobj.group(1) 
2103                 request = urllib2.Request('http
://www
.myvideo
.de
/watch
/%s' % video_id) 
2105                         self.report_download_webpage(video_id) 
2106                         webpage = urllib2.urlopen(request).read() 
2107                 except (urllib2.URLError, httplib.HTTPException, socket.error), err: 
2108                         self._downloader.trouble(u'ERROR
: Unable to retrieve video webpage
: %s' % str(err)) 
2111                 self.report_extraction(video_id) 
2112                 mobj = re.search(r'<link rel
=\'image_src
\' href
=\'(http
://is[0-9].myvideo\
.de
/de
/movie
[0-9]+/[a
-f0
-9]+)/thumbs
/[^
.]+\
.jpg
\' />', 
2115                         self._downloader.trouble(u'ERROR
: unable to extract media URL
') 
2117                 video_url = mobj.group(1) + ('/%s.flv
' % video_id) 
2119                 mobj = re.search('<title
>([^
<]+)</title
>', webpage) 
2121                         self._downloader.trouble(u'ERROR
: unable to extract title
') 
2124                 video_title = mobj.group(1) 
2130                         'upload_date
':  u'NA
', 
2131                         'title
':        video_title, 
2137 class ComedyCentralIE(InfoExtractor): 
2138         """Information extractor for The Daily Show and Colbert Report """ 
2140         _VALID_URL = r'^
(:(?P
<shortname
>tds|thedailyshow|cr|colbert|colbertnation|colbertreport
))|
(https?
://)?
(www\
.)?
(?P
<showname
>thedailyshow|colbertnation
)\
.com
/full
-episodes
/(?P
<episode
>.*)$
' 
2141         IE_NAME = u'comedycentral
' 
2143         def report_extraction(self, episode_id): 
2144                 self._downloader.to_screen(u'[comedycentral
] %s: Extracting information
' % episode_id) 
2146         def report_config_download(self, episode_id): 
2147                 self._downloader.to_screen(u'[comedycentral
] %s: Downloading configuration
' % episode_id) 
2149         def report_index_download(self, episode_id): 
2150                 self._downloader.to_screen(u'[comedycentral
] %s: Downloading show index
' % episode_id) 
2152         def report_player_url(self, episode_id): 
2153                 self._downloader.to_screen(u'[comedycentral
] %s: Determining player URL
' % episode_id) 
2155         def _real_extract(self, url): 
2156                 mobj = re.match(self._VALID_URL, url) 
2158                         self._downloader.trouble(u'ERROR
: invalid URL
: %s' % url) 
2161                 if mobj.group('shortname
'): 
2162                         if mobj.group('shortname
') in ('tds
', 'thedailyshow
'): 
2163                                 url = u'http
://www
.thedailyshow
.com
/full
-episodes
/' 
2165                                 url = u'http
://www
.colbertnation
.com
/full
-episodes
/' 
2166                         mobj = re.match(self._VALID_URL, url) 
2167                         assert mobj is not None 
2169                 dlNewest = not mobj.group('episode
') 
2171                         epTitle = mobj.group('showname
') 
2173                         epTitle = mobj.group('episode
') 
2175                 req = urllib2.Request(url) 
2176                 self.report_extraction(epTitle) 
2178                         htmlHandle = urllib2.urlopen(req) 
2179                         html = htmlHandle.read() 
2180                 except (urllib2.URLError, httplib.HTTPException, socket.error), err: 
2181                         self._downloader.trouble(u'ERROR
: unable to download webpage
: %s' % unicode(err)) 
2184                         url = htmlHandle.geturl() 
2185                         mobj = re.match(self._VALID_URL, url) 
2187                                 self._downloader.trouble(u'ERROR
: Invalid redirected URL
: ' + url) 
2189                         if mobj.group('episode
') == '': 
2190                                 self._downloader.trouble(u'ERROR
: Redirected URL 
is still 
not specific
: ' + url) 
2192                         epTitle = mobj.group('episode
') 
2194                 mMovieParams = re.findall('(?
:<param name
="movie" value
="|var url = ")(http
://media
.mtvnservices
.com
/([^
"]*episode.*?:.*?))"', html) 
2195                 if len(mMovieParams) == 0: 
2196                         self._downloader.trouble(u'ERROR
: unable to find Flash URL 
in webpage 
' + url) 
2199                 playerUrl_raw = mMovieParams[0][0] 
2200                 self.report_player_url(epTitle) 
2202                         urlHandle = urllib2.urlopen(playerUrl_raw) 
2203                         playerUrl = urlHandle.geturl() 
2204                 except (urllib2.URLError, httplib.HTTPException, socket.error), err: 
2205                         self._downloader.trouble(u'ERROR
: unable to find out player URL
: ' + unicode(err)) 
2208                 uri = mMovieParams[0][1] 
2209                 indexUrl = 'http
://shadow
.comedycentral
.com
/feeds
/video_player
/mrss
/?
' + urllib.urlencode({'uri
': uri}) 
2210                 self.report_index_download(epTitle) 
2212                         indexXml = urllib2.urlopen(indexUrl).read() 
2213                 except (urllib2.URLError, httplib.HTTPException, socket.error), err: 
2214                         self._downloader.trouble(u'ERROR
: unable to download episode index
: ' + unicode(err)) 
2219                 idoc = xml.etree.ElementTree.fromstring(indexXml) 
2220                 itemEls = idoc.findall('.//item
') 
2221                 for itemEl in itemEls: 
2222                         mediaId = itemEl.findall('./guid
')[0].text 
2223                         shortMediaId = mediaId.split(':')[-1] 
2224                         showId = mediaId.split(':')[-2].replace('.com
', '') 
2225                         officialTitle = itemEl.findall('./title
')[0].text 
2226                         officialDate = itemEl.findall('./pubDate
')[0].text 
2228                         configUrl = ('http
://www
.comedycentral
.com
/global/feeds
/entertainment
/media
/mediaGenEntertainment
.jhtml?
' + 
2229                                                 urllib.urlencode({'uri
': mediaId})) 
2230                         configReq = urllib2.Request(configUrl) 
2231                         self.report_config_download(epTitle) 
2233                                 configXml = urllib2.urlopen(configReq).read() 
2234                         except (urllib2.URLError, httplib.HTTPException, socket.error), err: 
2235                                 self._downloader.trouble(u'ERROR
: unable to download webpage
: %s' % unicode(err)) 
2238                         cdoc = xml.etree.ElementTree.fromstring(configXml) 
2240                         for rendition in cdoc.findall('.//rendition
'): 
2241                                 finfo = (rendition.attrib['bitrate
'], rendition.findall('./src
')[0].text) 
2245                                 self._downloader.trouble(u'\nERROR
: unable to download 
' + mediaId + ': No videos found
') 
2248                         # For now, just pick the highest bitrate 
2249                         format,video_url = turls[-1] 
2251                         effTitle = showId + u'-' + epTitle 
2256                                 'upload_date
': officialDate, 
2261                                 'description
': officialTitle, 
2262                                 'player_url
': playerUrl 
2265                         results.append(info) 
2270 class EscapistIE(InfoExtractor): 
2271         """Information extractor for The Escapist """ 
2273         _VALID_URL = r'^
(https?
://)?
(www\
.)?escapistmagazine\
.com
/videos
/view
/(?P
<showname
>[^
/]+)/(?P
<episode
>[^
/?
]+)[/?
]?
.*$
' 
2274         IE_NAME = u'escapist
' 
2276         def report_extraction(self, showName): 
2277                 self._downloader.to_screen(u'[escapist
] %s: Extracting information
' % showName) 
2279         def report_config_download(self, showName): 
2280                 self._downloader.to_screen(u'[escapist
] %s: Downloading configuration
' % showName) 
2282         def _real_extract(self, url): 
2283                 mobj = re.match(self._VALID_URL, url) 
2285                         self._downloader.trouble(u'ERROR
: invalid URL
: %s' % url) 
2287                 showName = mobj.group('showname
') 
2288                 videoId = mobj.group('episode
') 
2290                 self.report_extraction(showName) 
2292                         webPage = urllib2.urlopen(url) 
2293                         webPageBytes = webPage.read() 
2294                         m = re.match(r'text
/html
; charset
="?([^"]+)"?', webPage.headers['Content-Type']) 
2295                         webPage = webPageBytes.decode(m.group(1) if m else 'utf-8') 
2296                 except (urllib2.URLError, httplib.HTTPException, socket.error), err: 
2297                         self._downloader.trouble(u'ERROR: unable to download webpage: ' + unicode(err)) 
2300                 descMatch = re.search('<meta name="description
" content="([^
"]*)"', webPage) 
2301                 description = unescapeHTML(descMatch.group(1)) 
2302                 imgMatch = re.search('<meta 
property="og:image" content
="([^"]*)"', webPage) 
2303                 imgUrl = unescapeHTML(imgMatch.group(1)) 
2304                 playerUrlMatch = re.search('<meta property="og
:video
" content="([^
"]*)"', webPage) 
2305                 playerUrl = unescapeHTML(playerUrlMatch.group(1)) 
2306                 configUrlMatch = re.search('config
=(.*)$
', playerUrl) 
2307                 configUrl = urllib2.unquote(configUrlMatch.group(1)) 
2309                 self.report_config_download(showName) 
2311                         configJSON = urllib2.urlopen(configUrl).read() 
2312                 except (urllib2.URLError, httplib.HTTPException, socket.error), err: 
2313                         self._downloader.trouble(u'ERROR
: unable to download configuration
: ' + unicode(err)) 
2316                 # Technically, it's JavaScript
, not JSON
 
2317                 configJSON 
= configJSON
.replace("'", '"') 
2320                         config 
= json
.loads(configJSON
) 
2321                 except (ValueError,), err
: 
2322                         self
._downloader
.trouble(u
'ERROR: Invalid JSON in configuration file: ' + unicode(err
)) 
2325                 playlist 
= config
['playlist'] 
2326                 videoUrl 
= playlist
[1]['url'] 
2331                         'uploader': showName
, 
2332                         'upload_date': None, 
2336                         'thumbnail': imgUrl
, 
2337                         'description': description
, 
2338                         'player_url': playerUrl
, 
2344 class CollegeHumorIE(InfoExtractor
): 
2345         """Information extractor for collegehumor.com""" 
2347         _VALID_URL 
= r
'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$' 
2348         IE_NAME 
= u
'collegehumor' 
2350         def report_webpage(self
, video_id
): 
2351                 """Report information extraction.""" 
2352                 self
._downloader
.to_screen(u
'[%s] %s: Downloading webpage' % (self
.IE_NAME
, video_id
)) 
2354         def report_extraction(self
, video_id
): 
2355                 """Report information extraction.""" 
2356                 self
._downloader
.to_screen(u
'[%s] %s: Extracting information' % (self
.IE_NAME
, video_id
)) 
2358         def _real_extract(self
, url
): 
2359                 mobj 
= re
.match(self
._VALID
_URL
, url
) 
2361                         self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
) 
2363                 video_id 
= mobj
.group('videoid') 
2365                 self
.report_webpage(video_id
) 
2366                 request 
= urllib2
.Request(url
) 
2368                         webpage 
= urllib2
.urlopen(request
).read() 
2369                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
2370                         self
._downloader
.trouble(u
'ERROR: unable to download video webpage: %s' % str(err
)) 
2373                 m 
= re
.search(r
'id="video:(?P<internalvideoid>[0-9]+)"', webpage
) 
2375                         self
._downloader
.trouble(u
'ERROR: Cannot extract internal video ID') 
2377                 internal_video_id 
= m
.group('internalvideoid') 
2381                         'internal_id': internal_video_id
, 
2384                 self
.report_extraction(video_id
) 
2385                 xmlUrl 
= 'http://www.collegehumor.com/moogaloop/video:' + internal_video_id
 
2387                         metaXml 
= urllib2
.urlopen(xmlUrl
).read() 
2388                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
2389                         self
._downloader
.trouble(u
'ERROR: unable to download video info XML: %s' % str(err
)) 
2392                 mdoc 
= xml
.etree
.ElementTree
.fromstring(metaXml
) 
2394                         videoNode 
= mdoc
.findall('./video')[0] 
2395                         info
['description'] = videoNode
.findall('./description')[0].text
 
2396                         info
['title'] = videoNode
.findall('./caption')[0].text
 
2397                         info
['url'] = videoNode
.findall('./file')[0].text
 
2398                         info
['thumbnail'] = videoNode
.findall('./thumbnail')[0].text
 
2399                         info
['ext'] = info
['url'].rpartition('.')[2] 
2400                         info
['format'] = info
['ext'] 
2402                         self
._downloader
.trouble(u
'\nERROR: Invalid metadata XML file') 
2408 class XVideosIE(InfoExtractor
): 
2409         """Information extractor for xvideos.com""" 
2411         _VALID_URL 
= r
'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)' 
2412         IE_NAME 
= u
'xvideos' 
2414         def report_webpage(self
, video_id
): 
2415                 """Report information extraction.""" 
2416                 self
._downloader
.to_screen(u
'[%s] %s: Downloading webpage' % (self
.IE_NAME
, video_id
)) 
2418         def report_extraction(self
, video_id
): 
2419                 """Report information extraction.""" 
2420                 self
._downloader
.to_screen(u
'[%s] %s: Extracting information' % (self
.IE_NAME
, video_id
)) 
2422         def _real_extract(self
, url
): 
2423                 mobj 
= re
.match(self
._VALID
_URL
, url
) 
2425                         self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
) 
2427                 video_id 
= mobj
.group(1).decode('utf-8') 
2429                 self
.report_webpage(video_id
) 
2431                 request 
= urllib2
.Request(r
'http://www.xvideos.com/video' + video_id
) 
2433                         webpage 
= urllib2
.urlopen(request
).read() 
2434                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
2435                         self
._downloader
.trouble(u
'ERROR: unable to download video webpage: %s' % str(err
)) 
2438                 self
.report_extraction(video_id
) 
2442                 mobj 
= re
.search(r
'flv_url=(.+?)&', webpage
) 
2444                         self
._downloader
.trouble(u
'ERROR: unable to extract video url') 
2446                 video_url 
= urllib2
.unquote(mobj
.group(1).decode('utf-8')) 
2450                 mobj 
= re
.search(r
'<title>(.*?)\s+-\s+XVID', webpage
) 
2452                         self
._downloader
.trouble(u
'ERROR: unable to extract video title') 
2454                 video_title 
= mobj
.group(1).decode('utf-8') 
2457                 # Extract video thumbnail 
2458                 mobj 
= re
.search(r
'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage
) 
2460                         self
._downloader
.trouble(u
'ERROR: unable to extract video thumbnail') 
2462                 video_thumbnail 
= mobj
.group(0).decode('utf-8') 
2468                         'upload_date': None, 
2469                         'title': video_title
, 
2472                         'thumbnail': video_thumbnail
, 
2473                         'description': None, 
2480 class SoundcloudIE(InfoExtractor
): 
2481         """Information extractor for soundcloud.com 
2482            To access the media, the uid of the song and a stream token 
2483            must be extracted from the page source and the script must make 
2484            a request to media.soundcloud.com/crossdomain.xml. Then 
2485            the media can be grabbed by requesting from an url composed 
2486            of the stream token and uid 
2489         _VALID_URL 
= r
'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)' 
2490         IE_NAME 
= u
'soundcloud' 
2492         def __init__(self
, downloader
=None): 
2493                 InfoExtractor
.__init
__(self
, downloader
) 
2495         def report_webpage(self
, video_id
): 
2496                 """Report information extraction.""" 
2497                 self
._downloader
.to_screen(u
'[%s] %s: Downloading webpage' % (self
.IE_NAME
, video_id
)) 
2499         def report_extraction(self
, video_id
): 
2500                 """Report information extraction.""" 
2501                 self
._downloader
.to_screen(u
'[%s] %s: Extracting information' % (self
.IE_NAME
, video_id
)) 
2503         def _real_extract(self
, url
): 
2504                 mobj 
= re
.match(self
._VALID
_URL
, url
) 
2506                         self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
) 
2509                 # extract uploader (which is in the url) 
2510                 uploader 
= mobj
.group(1).decode('utf-8') 
2511                 # extract simple title (uploader + slug of song title) 
2512                 slug_title 
=  mobj
.group(2).decode('utf-8') 
2513                 simple_title 
= uploader 
+ u
'-' + slug_title
 
2515                 self
.report_webpage('%s/%s' % (uploader
, slug_title
)) 
2517                 request 
= urllib2
.Request('http://soundcloud.com/%s/%s' % (uploader
, slug_title
)) 
2519                         webpage 
= urllib2
.urlopen(request
).read() 
2520                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
2521                         self
._downloader
.trouble(u
'ERROR: unable to download video webpage: %s' % str(err
)) 
2524                 self
.report_extraction('%s/%s' % (uploader
, slug_title
)) 
2526                 # extract uid and stream token that soundcloud hands out for access 
2527                 mobj 
= re
.search('"uid":"([\w\d]+?)".*?stream_token=([\w\d]+)', webpage
) 
2529                         video_id 
= mobj
.group(1) 
2530                         stream_token 
= mobj
.group(2) 
2532                 # extract unsimplified title 
2533                 mobj 
= re
.search('"title":"(.*?)",', webpage
) 
2535                         title 
= mobj
.group(1).decode('utf-8') 
2537                         title 
= simple_title
 
2539                 # construct media url (with uid/token) 
2540                 mediaURL 
= "http://media.soundcloud.com/stream/%s?stream_token=%s" 
2541                 mediaURL 
= mediaURL 
% (video_id
, stream_token
) 
2544                 description 
= u
'No description available' 
2545                 mobj 
= re
.search('track-description-value"><p>(.*?)</p>', webpage
) 
2547                         description 
= mobj
.group(1) 
2551                 mobj 
= re
.search("pretty-date'>on ([\w]+ [\d]+, [\d]+ \d+:\d+)</abbr></h2>", webpage
) 
2554                                 upload_date 
= datetime
.datetime
.strptime(mobj
.group(1), '%B %d, %Y %H:%M').strftime('%Y%m%d') 
2555                         except Exception, e
: 
2556                                 self
._downloader
.to_stderr(str(e
)) 
2558                 # for soundcloud, a request to a cross domain is required for cookies 
2559                 request 
= urllib2
.Request('http://media.soundcloud.com/crossdomain.xml', std_headers
) 
2562                         'id':           video_id
.decode('utf-8'), 
2564                         'uploader':     uploader
.decode('utf-8'), 
2565                         'upload_date':  upload_date
, 
2570                         'description': description
.decode('utf-8') 
2574 class InfoQIE(InfoExtractor
): 
2575         """Information extractor for infoq.com""" 
2577         _VALID_URL 
= r
'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$' 
2580         def report_webpage(self
, video_id
): 
2581                 """Report information extraction.""" 
2582                 self
._downloader
.to_screen(u
'[%s] %s: Downloading webpage' % (self
.IE_NAME
, video_id
)) 
2584         def report_extraction(self
, video_id
): 
2585                 """Report information extraction.""" 
2586                 self
._downloader
.to_screen(u
'[%s] %s: Extracting information' % (self
.IE_NAME
, video_id
)) 
2588         def _real_extract(self
, url
): 
2589                 mobj 
= re
.match(self
._VALID
_URL
, url
) 
2591                         self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
) 
2594                 self
.report_webpage(url
) 
2596                 request 
= urllib2
.Request(url
) 
2598                         webpage 
= urllib2
.urlopen(request
).read() 
2599                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
2600                         self
._downloader
.trouble(u
'ERROR: unable to download video webpage: %s' % str(err
)) 
2603                 self
.report_extraction(url
) 
2607                 mobj 
= re
.search(r
"jsclassref='([^']*)'", webpage
) 
2609                         self
._downloader
.trouble(u
'ERROR: unable to extract video url') 
2611                 video_url 
= 'rtmpe://video.infoq.com/cfx/st/' + urllib2
.unquote(mobj
.group(1).decode('base64')) 
2615                 mobj 
= re
.search(r
'contentTitle = "(.*?)";', webpage
) 
2617                         self
._downloader
.trouble(u
'ERROR: unable to extract video title') 
2619                 video_title 
= mobj
.group(1).decode('utf-8') 
2621                 # Extract description 
2622                 video_description 
= u
'No description available.' 
2623                 mobj 
= re
.search(r
'<meta name="description" content="(.*)"(?:\s*/)?>', webpage
) 
2624                 if mobj 
is not None: 
2625                         video_description 
= mobj
.group(1).decode('utf-8') 
2627                 video_filename 
= video_url
.split('/')[-1] 
2628                 video_id
, extension 
= video_filename
.split('.') 
2634                         'upload_date': None, 
2635                         'title': video_title
, 
2637                         'format': extension
, # Extension is always(?) mp4, but seems to be flv 
2639                         'description': video_description
, 
2645 class MixcloudIE(InfoExtractor
): 
2646         """Information extractor for www.mixcloud.com""" 
2647         _VALID_URL 
= r
'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)' 
2648         IE_NAME 
= u
'mixcloud' 
2650         def __init__(self
, downloader
=None): 
2651                 InfoExtractor
.__init
__(self
, downloader
) 
2653         def report_download_json(self
, file_id
): 
2654                 """Report JSON download.""" 
2655                 self
._downloader
.to_screen(u
'[%s] Downloading json' % self
.IE_NAME
) 
2657         def report_extraction(self
, file_id
): 
2658                 """Report information extraction.""" 
2659                 self
._downloader
.to_screen(u
'[%s] %s: Extracting information' % (self
.IE_NAME
, file_id
)) 
2661         def get_urls(self
, jsonData
, fmt
, bitrate
='best'): 
2662                 """Get urls from 'audio_formats' section in json""" 
2665                         bitrate_list 
= jsonData
[fmt
] 
2666                         if bitrate 
is None or bitrate 
== 'best' or bitrate 
not in bitrate_list
: 
2667                                 bitrate 
= max(bitrate_list
) # select highest 
2669                         url_list 
= jsonData
[fmt
][bitrate
] 
2670                 except TypeError: # we have no bitrate info. 
2671                         url_list 
= jsonData
[fmt
] 
2674         def check_urls(self
, url_list
): 
2675                 """Returns 1st active url from list""" 
2676                 for url 
in url_list
: 
2678                                 urllib2
.urlopen(url
) 
2680                         except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
2685         def _print_formats(self
, formats
): 
2686                 print 'Available formats:' 
2687                 for fmt 
in formats
.keys(): 
2688                         for b 
in formats
[fmt
]: 
2690                                         ext 
= formats
[fmt
][b
][0] 
2691                                         print '%s\t%s\t[%s]' % (fmt
, b
, ext
.split('.')[-1]) 
2692                                 except TypeError: # we have no bitrate info 
2693                                         ext 
= formats
[fmt
][0] 
2694                                         print '%s\t%s\t[%s]' % (fmt
, '??', ext
.split('.')[-1]) 
2697         def _real_extract(self
, url
): 
2698                 mobj 
= re
.match(self
._VALID
_URL
, url
) 
2700                         self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
) 
2702                 # extract uploader & filename from url 
2703                 uploader 
= mobj
.group(1).decode('utf-8') 
2704                 file_id 
= uploader 
+ "-" + mobj
.group(2).decode('utf-8') 
2706                 # construct API request 
2707                 file_url 
= 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url
.split('/')[-3:-1]) + '.json' 
2708                 # retrieve .json file with links to files 
2709                 request 
= urllib2
.Request(file_url
) 
2711                         self
.report_download_json(file_url
) 
2712                         jsonData 
= urllib2
.urlopen(request
).read() 
2713                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
2714                         self
._downloader
.trouble(u
'ERROR: Unable to retrieve file: %s' % str(err
)) 
2718                 json_data 
= json
.loads(jsonData
) 
2719                 player_url 
= json_data
['player_swf_url'] 
2720                 formats 
= dict(json_data
['audio_formats']) 
2722                 req_format 
= self
._downloader
.params
.get('format', None) 
2725                 if self
._downloader
.params
.get('listformats', None): 
2726                         self
._print
_formats
(formats
) 
2729                 if req_format 
is None or req_format 
== 'best': 
2730                         for format_param 
in formats
.keys(): 
2731                                 url_list 
= self
.get_urls(formats
, format_param
) 
2733                                 file_url 
= self
.check_urls(url_list
) 
2734                                 if file_url 
is not None: 
2737                         if req_format 
not in formats
.keys(): 
2738                                 self
._downloader
.trouble(u
'ERROR: format is not available') 
2741                         url_list 
= self
.get_urls(formats
, req_format
) 
2742                         file_url 
= self
.check_urls(url_list
) 
2743                         format_param 
= req_format
 
2746                         'id': file_id
.decode('utf-8'), 
2747                         'url': file_url
.decode('utf-8'), 
2748                         'uploader':     uploader
.decode('utf-8'), 
2749                         'upload_date': u
'NA', 
2750                         'title': json_data
['name'], 
2751                         'ext': file_url
.split('.')[-1].decode('utf-8'), 
2752                         'format': (format_param 
is None and u
'NA' or format_param
.decode('utf-8')), 
2753                         'thumbnail': json_data
['thumbnail_url'], 
2754                         'description': json_data
['description'], 
2755                         'player_url': player_url
.decode('utf-8'), 
2758 class StanfordOpenClassroomIE(InfoExtractor
): 
2759         """Information extractor for Stanford's Open ClassRoom""" 
2761         _VALID_URL 
= r
'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$' 
2762         IE_NAME 
= u
'stanfordoc' 
2764         def report_download_webpage(self
, objid
): 
2765                 """Report information extraction.""" 
2766                 self
._downloader
.to_screen(u
'[%s] %s: Downloading webpage' % (self
.IE_NAME
, objid
)) 
2768         def report_extraction(self
, video_id
): 
2769                 """Report information extraction.""" 
2770                 self
._downloader
.to_screen(u
'[%s] %s: Extracting information' % (self
.IE_NAME
, video_id
)) 
2772         def _real_extract(self
, url
): 
2773                 mobj 
= re
.match(self
._VALID
_URL
, url
) 
2775                         self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
) 
2778                 if mobj
.group('course') and mobj
.group('video'): # A specific video 
2779                         course 
= mobj
.group('course') 
2780                         video 
= mobj
.group('video') 
2782                                 'id': course 
+ '_' + video
, 
2785                         self
.report_extraction(info
['id']) 
2786                         baseUrl 
= 'http://openclassroom.stanford.edu/MainFolder/courses/' + course 
+ '/videos/' 
2787                         xmlUrl 
= baseUrl 
+ video 
+ '.xml' 
2789                                 metaXml 
= urllib2
.urlopen(xmlUrl
).read() 
2790                         except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
2791                                 self
._downloader
.trouble(u
'ERROR: unable to download video info XML: %s' % unicode(err
)) 
2793                         mdoc 
= xml
.etree
.ElementTree
.fromstring(metaXml
) 
2795                                 info
['title'] = mdoc
.findall('./title')[0].text
 
2796                                 info
['url'] = baseUrl 
+ mdoc
.findall('./videoFile')[0].text
 
2798                                 self
._downloader
.trouble(u
'\nERROR: Invalid metadata XML file') 
2800                         info
['ext'] = info
['url'].rpartition('.')[2] 
2801                         info
['format'] = info
['ext'] 
2803                 elif mobj
.group('course'): # A course page 
2804                         course 
= mobj
.group('course') 
2810                         self
.report_download_webpage(info
['id']) 
2812                                 coursepage 
= urllib2
.urlopen(url
).read() 
2813                         except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
2814                                 self
._downloader
.trouble(u
'ERROR: unable to download course info page: ' + unicode(err
)) 
2817                         m 
= re
.search('<h1>([^<]+)</h1>', coursepage
) 
2819                                 info
['title'] = unescapeHTML(m
.group(1)) 
2821                                 info
['title'] = info
['id'] 
2823                         m 
= re
.search('<description>([^<]+)</description>', coursepage
) 
2825                                 info
['description'] = unescapeHTML(m
.group(1)) 
2827                         links 
= orderedSet(re
.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage
)) 
2830                                         'type': 'reference', 
2831                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage
), 
2835                         for entry 
in info
['list']: 
2836                                 assert entry
['type'] == 'reference' 
2837                                 results 
+= self
.extract(entry
['url']) 
2842                                 'id': 'Stanford OpenClassroom', 
2846                         self
.report_download_webpage(info
['id']) 
2847                         rootURL 
= 'http://openclassroom.stanford.edu/MainFolder/HomePage.php' 
2849                                 rootpage 
= urllib2
.urlopen(rootURL
).read() 
2850                         except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
2851                                 self
._downloader
.trouble(u
'ERROR: unable to download course info page: ' + unicode(err
)) 
2854                         info
['title'] = info
['id'] 
2856                         links 
= orderedSet(re
.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage
)) 
2859                                         'type': 'reference', 
2860                                         'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage
), 
2865                         for entry 
in info
['list']: 
2866                                 assert entry
['type'] == 'reference' 
2867                                 results 
+= self
.extract(entry
['url']) 
2870 class MTVIE(InfoExtractor
): 
2871         """Information extractor for MTV.com""" 
2873         _VALID_URL 
= r
'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$' 
2876         def report_webpage(self
, video_id
): 
2877                 """Report information extraction.""" 
2878                 self
._downloader
.to_screen(u
'[%s] %s: Downloading webpage' % (self
.IE_NAME
, video_id
)) 
2880         def report_extraction(self
, video_id
): 
2881                 """Report information extraction.""" 
2882                 self
._downloader
.to_screen(u
'[%s] %s: Extracting information' % (self
.IE_NAME
, video_id
)) 
2884         def _real_extract(self
, url
): 
2885                 mobj 
= re
.match(self
._VALID
_URL
, url
) 
2887                         self
._downloader
.trouble(u
'ERROR: invalid URL: %s' % url
) 
2889                 if not mobj
.group('proto'): 
2890                         url 
= 'http://' + url
 
2891                 video_id 
= mobj
.group('videoid') 
2892                 self
.report_webpage(video_id
) 
2894                 request 
= urllib2
.Request(url
) 
2896                         webpage 
= urllib2
.urlopen(request
).read() 
2897                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
2898                         self
._downloader
.trouble(u
'ERROR: unable to download video webpage: %s' % str(err
)) 
2901                 mobj 
= re
.search(r
'<meta name="mtv_vt" content="([^"]+)"/>', webpage
) 
2903                         self
._downloader
.trouble(u
'ERROR: unable to extract song name') 
2905                 song_name 
= unescapeHTML(mobj
.group(1).decode('iso-8859-1')) 
2906                 mobj 
= re
.search(r
'<meta name="mtv_an" content="([^"]+)"/>', webpage
) 
2908                         self
._downloader
.trouble(u
'ERROR: unable to extract performer') 
2910                 performer 
= unescapeHTML(mobj
.group(1).decode('iso-8859-1')) 
2911                 video_title 
= performer 
+ ' - ' + song_name 
 
2913                 mobj 
= re
.search(r
'<meta name="mtvn_uri" content="([^"]+)"/>', webpage
) 
2915                         self
._downloader
.trouble(u
'ERROR: unable to mtvn_uri') 
2917                 mtvn_uri 
= mobj
.group(1) 
2919                 mobj 
= re
.search(r
'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage
) 
2921                         self
._downloader
.trouble(u
'ERROR: unable to extract content id') 
2923                 content_id 
= mobj
.group(1) 
2925                 videogen_url 
= 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri 
+ '&id=' + content_id 
+ '&vid=' + video_id 
+ '&ref=www.mtvn.com&viewUri=' + mtvn_uri
 
2926                 self
.report_extraction(video_id
) 
2927                 request 
= urllib2
.Request(videogen_url
) 
2929                         metadataXml 
= urllib2
.urlopen(request
).read() 
2930                 except (urllib2
.URLError
, httplib
.HTTPException
, socket
.error
), err
: 
2931                         self
._downloader
.trouble(u
'ERROR: unable to download video metadata: %s' % str(err
)) 
2934                 mdoc 
= xml
.etree
.ElementTree
.fromstring(metadataXml
) 
2935                 renditions 
= mdoc
.findall('.//rendition') 
2937                 # For now, always pick the highest quality. 
2938                 rendition 
= renditions
[-1] 
2941                         _
,_
,ext 
= rendition
.attrib
['type'].partition('/') 
2942                         format 
= ext 
+ '-' + rendition
.attrib
['width'] + 'x' + rendition
.attrib
['height'] + '_' + rendition
.attrib
['bitrate'] 
2943                         video_url 
= rendition
.find('./src').text
 
2945                         self
._downloader
.trouble('Invalid rendition field.') 
2951                         'uploader': performer
, 
2952                         'title': video_title
,