# 84
("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?>.<",
"<.>?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOIUYTREWe098765432rmnbvcxzasdfghjklpoiuyt1"),
- # 83
+ # 83 - vfl26ng3K 2013/07/10
("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<",
- "D.>/?;}[{=+_)(*&^%$#!MNBVCXeAS<FGHJKLPOIUYTREWZ0987654321mnbvcxzasdfghjklpoiuytrQ"),
+ "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>"),
# 82
("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.<",
"Q>/?;}[{=+-(*<^%$#@!MNBVCXZASDFGHKLPOIUY8REWT0q&7654321mnbvcxzasdfghjklpoiuytrew9"),
else:
self.assertFalse(ie.suitable(url), '%s should not match URL %r' % (type(ie).__name__, url))
def test_keywords(self):
    """Check that keyword pseudo-URLs (e.g. ':ytsubs') resolve to exactly the expected IE."""
    ies = gen_extractors()

    def matching_ies(url):
        # The generic IE matches almost anything, so leave it out of the result.
        return [ie.IE_NAME for ie in ies
                if ie.suitable(url) and ie.IE_NAME != 'generic']

    self.assertEqual(matching_ies(':ytsubs'), ['youtube:subscriptions'])
    self.assertEqual(matching_ies(':ytsubscriptions'), ['youtube:subscriptions'])
    self.assertEqual(matching_ies(':thedailyshow'), ['ComedyCentral'])
    self.assertEqual(matching_ies(':tds'), ['ComedyCentral'])
    self.assertEqual(matching_ies(':colbertreport'), ['ComedyCentral'])
    self.assertEqual(matching_ies(':cr'), ['ComedyCentral'])
+
if __name__ == '__main__':
unittest.main()
def test_83(self):
wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<"
- right = "D.>/?;}[{=+_)(*&^%$#!MNBVCXeAS<FGHJKLPOIUYTREWZ0987654321mnbvcxzasdfghjklpoiuytrQ"
+ right = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>"
self.assertEqual(sig(wrong), right)
def test_82(self):
-
+from .archiveorg import ArchiveOrgIE
from .ard import ARDIE
from .arte import ArteTvIE
from .auengine import AUEngineIE
from .bandcamp import BandcampIE
from .bliptv import BlipTVIE, BlipTVUserIE
from .breakcom import BreakIE
+from .brightcove import BrightcoveIE
from .collegehumor import CollegeHumorIE
from .comedycentral import ComedyCentralIE
from .cspan import CSpanIE
from .dailymotion import DailymotionIE
from .depositfiles import DepositFilesIE
+from .dotsub import DotsubIE
+from .dreisat import DreiSatIE
from .eighttracks import EightTracksIE
from .escapist import EscapistIE
from .facebook import FacebookIE
from .tutv import TutvIE
from .ustream import UstreamIE
from .vbox7 import Vbox7IE
+from .veoh import VeohIE
from .vevo import VevoIE
from .vimeo import VimeoIE
from .vine import VineIE
from .youjizz import YouJizzIE
from .youku import YoukuIE
from .youporn import YouPornIE
-from .youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE, YoutubeShowIE
+from .youtube import (
+ YoutubeIE,
+ YoutubePlaylistIE,
+ YoutubeSearchIE,
+ YoutubeUserIE,
+ YoutubeChannelIE,
+ YoutubeShowIE,
+ YoutubeSubscriptionsIE,
+)
from .zdf import ZDFIE
--- /dev/null
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ unified_strdate,
+)
+
+
class ArchiveOrgIE(InfoExtractor):
    """Extractor for archive.org "details" pages.

    Uses the site's machine-readable JSON endpoint (details URL plus
    'output=json') instead of scraping the HTML page.
    """
    IE_NAME = 'archive.org'
    IE_DESC = 'archive.org videos'
    # Dot in 'archive.org' escaped so the host part is matched literally.
    _VALID_URL = r'(?:https?://)?(?:www\.)?archive\.org/details/(?P<id>[^?/]+)(?:[?].*)?$'
    _TEST = {
        u"url": u"http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect",
        u'file': u'XD300-23_68HighlightsAResearchCntAugHumanIntellect.ogv',
        u'md5': u'8af1d4cf447933ed3c7f4871162602db',
        u'info_dict': {
            u"title": u"1968 Demo - FJCC Conference Presentation Reel #1",
            u"description": u"Reel 1 of 3: Also known as the \"Mother of All Demos\", Doug Engelbart's presentation at the Fall Joint Computer Conference in San Francisco, December 9, 1968 titled \"A Research Center for Augmenting Human Intellect.\" For this presentation, Doug and his team astonished the audience by not only relating their research, but demonstrating it live. This was the debut of the mouse, interactive computing, hypermedia, computer supported software engineering, video teleconferencing, etc. See also <a href=\"http://dougengelbart.org/firsts/dougs-1968-demo.html\" rel=\"nofollow\">Doug's 1968 Demo page</a> for more background, highlights, links, and the detailed paper published in this conference proceedings. Filmed on 3 reels: Reel 1 | <a href=\"http://www.archive.org/details/XD300-24_68HighlightsAResearchCntAugHumanIntellect\" rel=\"nofollow\">Reel 2</a> | <a href=\"http://www.archive.org/details/XD300-25_68HighlightsAResearchCntAugHumanIntellect\" rel=\"nofollow\">Reel 3</a>",
            u"upload_date": u"19681210",
            u"uploader": u"SRI International"
        }
    }

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        # Request the JSON metadata document. Use '&' only when the URL
        # already has a query string, '?' otherwise (the original condition
        # was inverted and produced a malformed URL in both cases).
        json_url = url + (u'&' if u'?' in url else u'?') + u'output=json'
        json_data = self._download_webpage(json_url, video_id)
        data = json.loads(json_data)

        title = data['metadata']['title'][0]
        description = data['metadata']['description'][0]
        uploader = data['metadata']['creator'][0]
        upload_date = unified_strdate(data['metadata']['date'][0])

        # Keep only entries whose format string mentions 'Video'; the file
        # URL is server + directory + file name.
        formats = [{
            'format': fdata['format'],
            'url': 'http://' + data['server'] + data['dir'] + fn,
            'file_size': int(fdata['size']),
        }
            for fn, fdata in data['files'].items()
            if 'Video' in fdata['format']]
        # Ascending by size so the last entry is the largest (best) file.
        formats.sort(key=lambda fdata: fdata['file_size'])

        info = {
            'id': video_id,
            'title': title,
            'formats': formats,
            'description': description,
            'uploader': uploader,
            'upload_date': upload_date,
        }
        thumbnail = data.get('misc', {}).get('image')
        if thumbnail:
            info['thumbnail'] = thumbnail

        # TODO: Remove when #980 has been merged
        info['url'] = formats[-1]['url']
        info['ext'] = determine_ext(formats[-1]['url'])

        return self.video_result(info)
\ No newline at end of file
# determine title and media streams from webpage
html = self._download_webpage(url, video_id)
title = re.search(self._TITLE, html).group('title')
- streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
+ streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
if not streams:
assert '"fsk"' in html
raise ExtractorError(u'This video is only available after 8:00 pm')
import re
import json
+import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
- # This is used by the not implemented extractLiveStream method
- compat_urllib_parse,
-
ExtractorError,
unified_strdate,
)
www.arte.tv/guide, the extraction process is different for each one.
The videos expire in 7 days, so we can't add tests.
"""
- _EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?:fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
- _VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?:fr|de)/.*-(?P<id>.*?).html'
+ _EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
+ _VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?P<lang>fr|de)/.*-(?P<id>.*?).html'
_LIVE_URL = r'index-[0-9]+\.html$'
IE_NAME = u'arte.tv'
return any(re.match(regex, url) for regex in (cls._EMISSION_URL, cls._VIDEOS_URL))
# TODO implement Live Stream
+ # from ..utils import compat_urllib_parse
# def extractLiveStream(self, url):
# video_lang = url.split('/')[-4]
# info = self.grep_webpage(
def _real_extract(self, url):
mobj = re.match(self._EMISSION_URL, url)
if mobj is not None:
- name = mobj.group('name')
+ lang = mobj.group('lang')
# This is not a real id, it can be for example AJT for the news
# http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
video_id = mobj.group('id')
- return self._extract_emission(url, video_id)
+ return self._extract_emission(url, video_id, lang)
mobj = re.match(self._VIDEOS_URL, url)
if mobj is not None:
id = mobj.group('id')
- return self._extract_video(url, id)
+ lang = mobj.group('lang')
+ return self._extract_video(url, id, lang)
if re.search(self._LIVE_URL, video_id) is not None:
raise ExtractorError(u'Arte live streams are not yet supported, sorry')
# self.extractLiveStream(url)
# return
- def _extract_emission(self, url, video_id):
+ def _extract_emission(self, url, video_id, lang):
"""Extract from www.arte.tv/guide"""
webpage = self._download_webpage(url, video_id)
json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')
}
formats = player_info['VSR'].values()
+ def _match_lang(f):
+ # Return true if that format is in the language of the url
+ if lang == 'fr':
+ l = 'F'
+ elif lang == 'de':
+ l = 'A'
+ regexes = [r'VO?%s' % l, r'V%s-ST.' % l]
+ return any(re.match(r, f['versionCode']) for r in regexes)
+ # Some formats may not be in the same language as the url
+ formats = filter(_match_lang, formats)
# We order the formats by quality
formats = sorted(formats, key=lambda f: int(f['height']))
# Pick the best quality
return info_dict
- def _extract_video(self, url, video_id):
+ def _extract_video(self, url, video_id, lang):
"""Extract from videos.arte.tv"""
- config_xml_url = url.replace('/videos/', '/do_delegate/videos/')
- config_xml_url = config_xml_url.replace('.html', ',view,asPlayerXml.xml')
- config_xml = self._download_webpage(config_xml_url, video_id)
- config_xml_url = self._html_search_regex(r'<video lang=".*?" ref="(.*?)"', config_xml, 'config xml url')
- config_xml = self._download_webpage(config_xml_url, video_id)
+ ref_xml_url = url.replace('/videos/', '/do_delegate/videos/')
+ ref_xml_url = ref_xml_url.replace('.html', ',view,asPlayerXml.xml')
+ ref_xml = self._download_webpage(ref_xml_url, video_id, note=u'Downloading metadata')
+ ref_xml_doc = xml.etree.ElementTree.fromstring(ref_xml)
+ config_node = ref_xml_doc.find('.//video[@lang="%s"]' % lang)
+ config_xml_url = config_node.attrib['ref']
+ config_xml = self._download_webpage(config_xml_url, video_id, note=u'Downloading configuration')
video_urls = list(re.finditer(r'<url quality="(?P<quality>.*?)">(?P<url>.*?)</url>', config_xml))
def _key(m):
)
class AUEngineIE(InfoExtractor):
+ _TEST = {
+ u'url': u'http://auengine.com/embed.php?file=lfvlytY6&w=650&h=370',
+ u'file': u'lfvlytY6.mp4',
+ u'md5': u'48972bdbcf1a3a2f5533e62425b41d4f',
+ u'info_dict': {
+ u"title": u"[Commie]The Legend of the Legendary Heroes - 03 - Replication Eye (Alpha Stigma)[F9410F5A]"
+ }
+ }
_VALID_URL = r'(?:http://)?(?:www\.)?auengine\.com/embed.php\?.*?file=([^&]+).*?'
def _real_extract(self, url):
_TEST = {
u'url': u'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352',
u'file': u'5779306.m4v',
- u'md5': u'b2d849efcf7ee18917e4b4d9ff37cafe',
+ u'md5': u'80baf1ec5c3d2019037c1c707d676b9f',
u'info_dict': {
u"upload_date": u"20111205",
u"description": u"md5:9bc31f227219cde65e47eeec8d2dc596",
data = json_data
upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
- video_url = data['media']['url']
+ if 'additionalMedia' in data:
+ formats = sorted(data['additionalMedia'], key=lambda f: int(f['media_height']))
+ best_format = formats[-1]
+ video_url = best_format['url']
+ else:
+ video_url = data['media']['url']
umobj = re.match(self._URL_EXT, video_url)
if umobj is None:
raise ValueError('Can not determine filename extension')
pagenum += 1
urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
- url_entries = [self.url_result(url, 'BlipTV') for url in urls]
+ url_entries = [self.url_result(vurl, 'BlipTV') for vurl in urls]
return [self.playlist_result(url_entries, playlist_title = username)]
--- /dev/null
+import re
+import json
+
+from .common import InfoExtractor
+
class BrightcoveIE(InfoExtractor):
    """Extract videos from Brightcove player URLs (those carrying a videoPlayer query parameter)."""
    _VALID_URL = r'http://.*brightcove\.com/.*\?(?P<query>.*videoPlayer=(?P<id>\d*).*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        query = mobj.group('query')
        video_id = mobj.group('id')

        # Replay the original query against the federated HTML viewer, whose
        # page embeds the full player configuration as JSON.
        request_url = 'http://c.brightcove.com/services/viewer/htmlFederated?%s' % query
        webpage = self._download_webpage(request_url, video_id)

        self.report_extraction(video_id)
        json_str = self._search_regex(r'var experienceJSON = ({.*?});', webpage, 'json')
        player_data = json.loads(json_str)['data']
        video_info = player_data['programmedContent']['videoPlayer']['mediaDTO']
        # Renditions sorted ascending by size; the last one is the best quality.
        best_format = sorted(video_info['renditions'], key=lambda r: r['size'])[-1]

        return {
            'id': video_id,
            'title': video_info['displayName'],
            'url': best_format['defaultURL'],
            'ext': 'mp4',
            'description': video_info.get('shortDescription'),
            'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'),
            'uploader': video_info.get('publisherName'),
        }
import re
import socket
import sys
+import netrc
from ..utils import (
compat_http_client,
The following fields are optional:
format: The video format, defaults to ext (used for --get-format)
+ thumbnails: A list of dictionaries (with the entries "resolution" and
+ "url") for the varying thumbnails
thumbnail: Full URL to a video thumbnail image.
description: One-line video description.
uploader: Full name of the video uploader.
"""Report attempt to confirm age."""
self.to_screen(u'Confirming age')
+ def report_login(self):
+ """Report attempt to log in."""
+ self.to_screen(u'Logging in')
+
#Methods for following #608
#They set the correct value of the '_type' key
def video_result(self, video_info):
else:
return res
def _get_login_info(self):
    """
    Return the credentials for this extractor as (username, password).

    Precedence: explicit username/password downloader options first, then
    the .netrc entry for self._NETRC_MACHINE (when 'usenetrc' is set).
    Returns (None, None) when no downloader is attached or nothing is
    configured.
    """
    if self._downloader is None:
        return (None, None)

    username = None
    password = None
    params = self._downloader.params

    if params.get('username', None) is not None:
        username = params['username']
        password = params['password']
    elif params.get('usenetrc', False):
        try:
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is None:
                # Treat a missing machine entry like a parse problem so a
                # single handler below reports both cases as a warning.
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            username = info[0]
            password = info[2]
        except (IOError, netrc.NetrcParseError) as err:
            self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

    return (username, password)
+
class SearchInfoExtractor(InfoExtractor):
"""
Base class for paged search queries extractors.
import re
+import json
from .common import InfoExtractor
from ..utils import (
compat_urllib_request,
- compat_urllib_parse,
ExtractorError,
- unescapeHTML,
)
class DailymotionIE(InfoExtractor):
# Extract URL, uploader and title from webpage
self.report_extraction(video_id)
- mobj = re.search(r'\s*var flashvars = (.*)', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract media URL')
- flashvars = compat_urllib_parse.unquote(mobj.group(1))
- for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
- if key in flashvars:
- max_quality = key
- self.to_screen(u'Using %s' % key)
- break
- else:
- raise ExtractorError(u'Unable to extract video URL')
-
- mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
- if mobj is None:
- raise ExtractorError(u'Unable to extract video URL')
-
- video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
-
- # TODO: support choosing qualities
+ video_title = self._html_search_regex(r'<meta property="og:title" content="(.*?)" />',
+ webpage, 'title')
- mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract title')
- video_title = unescapeHTML(mobj.group('title'))
-
- video_uploader = None
video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
# Looking for official user
r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
if mobj is not None:
video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
+ embed_url = 'http://www.dailymotion.com/embed/video/%s' % video_id
+ embed_page = self._download_webpage(embed_url, video_id,
+ u'Downloading embed page')
+ info = self._search_regex(r'var info = ({.*?}),', embed_page, 'video info')
+ info = json.loads(info)
+
+ # TODO: support choosing qualities
+
+ for key in ['stream_h264_hd1080_url','stream_h264_hd_url',
+ 'stream_h264_hq_url','stream_h264_url',
+ 'stream_h264_ld_url']:
+ if info.get(key):#key in info and info[key]:
+ max_quality = key
+ self.to_screen(u'Using %s' % key)
+ break
+ else:
+ raise ExtractorError(u'Unable to extract video URL')
+ video_url = info[max_quality]
+
return [{
'id': video_id,
'url': video_url,
'upload_date': video_upload_date,
'title': video_title,
'ext': video_extension,
+ 'thumbnail': info['thumbnail_url']
}]
--- /dev/null
+import re
+import json
+import time
+
+from .common import InfoExtractor
+
+
class DotsubIE(InfoExtractor):
    """Extract videos from dotsub.com via its public metadata API."""
    _VALID_URL = r'(?:http://)?(?:www\.)?dotsub\.com/view/([^/]+)'
    _TEST = {
        u'url': u'http://dotsub.com/view/aed3b8b2-1889-4df5-ae63-ad85f5572f27',
        u'file': u'aed3b8b2-1889-4df5-ae63-ad85f5572f27.flv',
        u'md5': u'0914d4d69605090f623b7ac329fea66e',
        u'info_dict': {
            u"title": u"Pyramids of Waste (2010), AKA The Lightbulb Conspiracy - Planned obsolescence documentary",
            u"uploader": u"4v4l0n42",
            u'description': u'Pyramids of Waste (2010) also known as "The lightbulb conspiracy" is a documentary about how our economic system based on consumerism and planned obsolescence is breaking our planet down.\r\n\r\nSolutions to this can be found at:\r\nhttp://robotswillstealyourjob.com\r\nhttp://www.federicopistono.org\r\n\r\nhttp://opensourceecology.org\r\nhttp://thezeitgeistmovement.com',
            u'thumbnail': u'http://dotsub.com/media/aed3b8b2-1889-4df5-ae63-ad85f5572f27/p',
            u'upload_date': u'20101213',
        }
    }

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group(1)
        info_url = "https://dotsub.com/api/media/%s/metadata" % (video_id)
        metadata = json.loads(self._download_webpage(info_url, video_id))
        # 'dateCreated' is a millisecond UNIX timestamp.
        created = time.gmtime(metadata['dateCreated']/1000)
        upload_date = u'%04i%02i%02i' % (created.tm_year, created.tm_mon, created.tm_mday)

        return [{
            'id': video_id,
            'url': metadata['mediaURI'],
            'ext': 'flv',
            'title': metadata['title'],
            'thumbnail': metadata['screenshotURI'],
            'description': metadata['description'],
            'uploader': metadata['user'],
            'view_count': metadata['numberOfViews'],
            'upload_date': upload_date,
        }]
--- /dev/null
+# coding: utf-8
+
+import re
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ unified_strdate,
+)
+
+
class DreiSatIE(InfoExtractor):
    """Extractor for 3sat.de Mediathek pages, driven by the station's XML details service."""
    IE_NAME = '3sat'
    # Dot in '3sat.de' escaped so the host part is matched literally.
    _VALID_URL = r'(?:http://)?(?:www\.)?3sat\.de/mediathek/index.php\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$'
    _TEST = {
        u"url": u"http://www.3sat.de/mediathek/index.php?obj=36983",
        u'file': u'36983.webm',
        u'md5': u'57c97d0469d71cf874f6815aa2b7c944',
        u'info_dict': {
            u"title": u"Kaffeeland Schweiz",
            u"description": u"Über 80 Kaffeeröstereien liefern in der Schweiz das Getränk, in das das Land so vernarrt ist: Mehr als 1000 Tassen trinkt ein Schweizer pro Jahr. SCHWEIZWEIT nimmt die Kaffeekultur unter die...",
            u"uploader": u"3sat",
            u"upload_date": u"20130622"
        }
    }

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id
        details_xml = self._download_webpage(details_url, video_id, note=u'Downloading video details')
        details_doc = xml.etree.ElementTree.fromstring(details_xml.encode('utf-8'))

        # Teaser images come in several sizes, keyed like '320x180'.
        thumbnail_els = details_doc.findall('.//teaserimage')
        thumbnails = [{
            'width': te.attrib['key'].partition('x')[0],
            'height': te.attrib['key'].partition('x')[2],
            'url': te.text,
        } for te in thumbnail_els]

        information_el = details_doc.find('.//information')
        video_title = information_el.find('./title').text
        video_description = information_el.find('./detail').text

        details_el = details_doc.find('.//details')
        video_uploader = details_el.find('./channel').text
        upload_date = unified_strdate(details_el.find('./airtime').text)

        # Skip placeholder entries served by metafilegenerator.de.
        format_els = details_doc.findall('.//formitaet')
        formats = [{
            'format_id': fe.attrib['basetype'],
            'width': int(fe.find('./width').text),
            'height': int(fe.find('./height').text),
            'url': fe.find('./url').text,
            'filesize': int(fe.find('./filesize').text),
            'video_bitrate': int(fe.find('./videoBitrate').text),
            '3sat_qualityname': fe.find('./quality').text,
        } for fe in format_els
            if not fe.find('./url').text.startswith('http://www.metafilegenerator.de/')]

        def _sortkey(format):
            # Worst-to-best ordering: named quality first, then protocol,
            # then bitrate. HTTP gets the higher rank so it is preferred
            # over RTMP (the original inverted this, contradicting the
            # 'prefer_http' name and picking RTMP streams as best).
            qidx = ['low', 'med', 'high', 'veryhigh'].index(format['3sat_qualityname'])
            prefer_http = 0 if 'rtmp' in format['url'] else 1
            return (qidx, prefer_http, format['video_bitrate'])
        formats.sort(key=_sortkey)

        info = {
            'id': video_id,
            'title': video_title,
            'formats': formats,
            'description': video_description,
            'thumbnails': thumbnails,
            'thumbnail': thumbnails[-1]['url'],
            'uploader': video_uploader,
            'upload_date': upload_date,
        }

        # TODO: Remove when #980 has been merged
        info['url'] = formats[-1]['url']
        info['ext'] = determine_ext(formats[-1]['url'])

        return self.video_result(info)
\ No newline at end of file
from .common import InfoExtractor
from ..utils import (
unified_strdate,
+ compat_urllib_parse,
)
class GameSpotIE(InfoExtractor):
- _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/([^/]+)/videos/([^/]+)-([^/d]+)/'
+ _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P<page_id>\d+)/?'
_TEST = {
u"url": u"http://www.gamespot.com/arma-iii/videos/arma-iii-community-guide-sitrep-i-6410818/",
u"file": u"6410818.mp4",
- u"md5": u"5569d64ca98db01f0177c934fe8c1e9b",
+ u"md5": u"b2a30deaa8654fcccd43713a6b6a4825",
u"info_dict": {
u"title": u"Arma III - Community Guide: SITREP I",
u"upload_date": u"20130627",
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group(3).split("-")[-1]
- info_url = "http://www.gamespot.com/pages/video_player/xml.php?id="+str(video_id)
+ page_id = mobj.group('page_id')
+ webpage = self._download_webpage(url, page_id)
+ video_id = self._html_search_regex([r'"og:video" content=".*?\?id=(\d+)"',
+ r'http://www\.gamespot\.com/videoembed/(\d+)'],
+ webpage, 'video id')
+ data = compat_urllib_parse.urlencode({'id': video_id, 'newplayer': '1'})
+ info_url = 'http://www.gamespot.com/pages/video_player/xml.php?' + data
info_xml = self._download_webpage(info_url, video_id)
doc = xml.etree.ElementTree.fromstring(info_xml)
clip_el = doc.find('./playList/clip')
- video_url = clip_el.find('./URI').text
+ http_urls = [{'url': node.find('filePath').text,
+ 'rate': int(node.find('rate').text)}
+ for node in clip_el.find('./httpURI')]
+ best_quality = sorted(http_urls, key=lambda f: f['rate'])[-1]
+ video_url = best_quality['url']
title = clip_el.find('./title').text
ext = video_url.rpartition('.')[2]
thumbnail_url = clip_el.find('./screenGrabURI').text
+# coding: utf-8
+
import re
+import json
from .common import InfoExtractor
class TudouIE(InfoExtractor):
- _VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/(?:listplay|programs)/(?:view|(.+?))/(?:([^/]+)|([^/]+)\.html)'
+ _VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/(?:listplay|programs)/(?:view|(.+?))/(?:([^/]+)|([^/]+))(?:\.html)?'
_TEST = {
u'url': u'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html',
- u'file': u'159447792.f4v',
- u'md5': u'ad7c358a01541e926a1e413612c6b10a',
+ u'file': u'159448201.f4v',
+ u'md5': u'140a49ed444bd22f93330985d8475fcb',
u'info_dict': {
- u"title": u"\u5361\u9a6c\u4e54\u56fd\u8db3\u5f00\u5927\u811a\u957f\u4f20\u51b2\u540a\u96c6\u9526"
+ u"title": u"卡马乔国足开大脚长传冲吊集锦"
}
}
def _url_for_id(self, id, quality=None):
    """Resolve the final media URL for one part id, optionally at a given quality key."""
    info_url = "http://v2.tudou.com/f?id=" + str(id)
    if quality:
        # The quality keys from the 'segs' JSON are passed via the hd field.
        info_url += '&hd' + quality
    webpage = self._download_webpage(info_url, id, "Opening the info webpage")
    return self._html_search_regex('>(.+?)</f>', webpage, 'video url')
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group(2).replace('.html','')
+ video_id = mobj.group(2)
webpage = self._download_webpage(url, video_id)
- video_id = re.search('"k":(.+?),',webpage).group(1)
title = re.search(",kw:\"(.+)\"",webpage)
if title is None:
title = re.search(",kw: \'(.+)\'",webpage)
if thumbnail_url is None:
thumbnail_url = re.search(",pic:\"(.+?)\"",webpage)
thumbnail_url = thumbnail_url.group(1)
- info_url = "http://v2.tudou.com/f?id="+str(video_id)
- webpage = self._download_webpage(info_url, video_id, "Opening the info webpage")
- final_url = re.search('\>(.+?)\<\/f\>',webpage).group(1)
- ext = (final_url.split('?')[0]).split('.')[-1]
- return [{
- 'id': video_id,
- 'url': final_url,
- 'ext': ext,
- 'title': title,
- 'thumbnail': thumbnail_url,
- }]
+
+ segs_json = self._search_regex(r'segs: \'(.*)\'', webpage, 'segments')
+ segments = json.loads(segs_json)
+ # It looks like the keys are the arguments that have to be passed as
+ # the hd field in the request url, we pick the higher
+ quality = sorted(segments.keys())[-1]
+ parts = segments[quality]
+ result = []
+ len_parts = len(parts)
+ if len_parts > 1:
+ self.to_screen(u'%s: found %s parts' % (video_id, len_parts))
+ for part in parts:
+ part_id = part['k']
+ final_url = self._url_for_id(part_id, quality)
+ ext = (final_url.split('?')[0]).split('.')[-1]
+ part_info = {'id': part_id,
+ 'url': final_url,
+ 'ext': ext,
+ 'title': title,
+ 'thumbnail': thumbnail_url,
+ }
+ result.append(part_info)
+
+ return result
--- /dev/null
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+)
+
class VeohIE(InfoExtractor):
    """Extract videos from veoh.com; pages that merely wrap a Youtube embed are delegated to the Youtube IE."""
    _VALID_URL = r'http://www\.veoh\.com/watch/v(?P<id>\d*)'

    _TEST = {
        u'url': u'http://www.veoh.com/watch/v56314296nk7Zdmz3',
        u'file': u'56314296.mp4',
        u'md5': u'620e68e6a3cff80086df3348426c9ca3',
        u'info_dict': {
            u'title': u'Straight Backs Are Stronger',
            u'uploader': u'LUMOback',
            u'description': u'At LUMOback, we believe straight backs are stronger. The LUMOback Posture & Movement Sensor: It gently vibrates when you slouch, inspiring improved posture and mobility. Use the app to track your data and improve your posture over time. ',
        }
    }

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(url, video_id)

        # Detect an embedded Youtube player and hand the URL off.
        youtube_embed = re.search(r'http://www\.youtube\.com/v/(.*?)(\&|")', webpage)
        if youtube_embed is not None:
            self.to_screen(u'%s: detected Youtube video.' % video_id)
            return self.url_result(youtube_embed.group(1), 'Youtube')

        self.report_extraction(video_id)
        details_json = self._search_regex(r'videoDetailsJSON = \'({.*?})\';', webpage, 'info')
        details = json.loads(details_json)
        # Prefer the high-quality preview stream, falling back to low quality.
        video_url = details.get('fullPreviewHashHighPath') or details.get('fullPreviewHashLowPath')

        return {
            'id': details['videoId'],
            'title': details['title'],
            'ext': determine_ext(video_url),
            'url': video_url,
            'uploader': details['username'],
            'thumbnail': details.get('highResImage') or details.get('medResImage'),
            'description': details['description'],
            'view_count': details['views'],
        }
# _VALID_URL matches Vimeo URLs
_VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)(?:[?].*)?$'
+ _NETRC_MACHINE = 'vimeo'
IE_NAME = u'vimeo'
_TEST = {
u'url': u'http://vimeo.com/56015672',
}
}
def _login(self):
    """Log in to vimeo.com with credentials from the options/.netrc, doing nothing when none are set."""
    (username, password) = self._get_login_info()
    if username is None:
        return
    self.report_login()
    login_url = 'https://vimeo.com/log_in'
    webpage = self._download_webpage(login_url, None, False)
    # The login form is protected by an XSRF token embedded in the page;
    # it must be sent both as a form field and as a cookie.
    token = re.search(r'xsrft: \'(.*?)\'', webpage).group(1)
    form_fields = {
        'email': username,
        'password': password,
        'action': 'login',
        'service': 'vimeo',
        'token': token,
    }
    data = compat_urllib_parse.urlencode(form_fields)
    request = compat_urllib_request.Request(login_url, data)
    request.add_header('Content-Type', 'application/x-www-form-urlencoded')
    request.add_header('Cookie', 'xsrft=%s' % token)
    self._download_webpage(request, None, False, u'Wrong login info')
+
def _verify_video_password(self, url, video_id, webpage):
password = self._downloader.params.get('videopassword', None)
if password is None:
u'Verifying the password',
u'Wrong password')
+ def _real_initialize(self):
+ self._login()
+
def _real_extract(self, url, new_video=True):
# Extract ID from URL
mobj = re.match(self._VALID_URL, url)
import netrc
import re
import socket
+import itertools
from .common import InfoExtractor, SearchInfoExtractor
from ..utils import (
ExtractorError,
unescapeHTML,
unified_strdate,
+ orderedSet,
)
u"uploader": u"IconaPop",
u"uploader_id": u"IconaPop"
}
- }
+ },
+ {
+ u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
+ u"file": u"07FYdnEawAQ.mp4",
+ u"note": u"Test VEVO video with age protection (#956)",
+ u"info_dict": {
+ u"upload_date": u"20130703",
+ u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
+ u"description": u"md5:64249768eec3bc4276236606ea996373",
+ u"uploader": u"justintimberlakeVEVO",
+ u"uploader_id": u"justintimberlakeVEVO"
+ }
+ },
]
@classmethod
def suitable(cls, url):
"""Receives a URL and returns True if suitable for this IE."""
- if YoutubePlaylistIE.suitable(url): return False
+ if YoutubePlaylistIE.suitable(url) or YoutubeSubscriptionsIE.suitable(url): return False
return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
def report_lang(self):
"""Report attempt to set language."""
self.to_screen(u'Setting language')
- def report_login(self):
- """Report attempt to log in."""
- self.to_screen(u'Logging in')
-
def report_video_webpage_download(self, video_id):
"""Report attempt to download video webpage."""
self.to_screen(u'%s: Downloading video webpage' % video_id)
elif len(s) == 84:
return s[83:36:-1] + s[2] + s[35:26:-1] + s[3] + s[25:3:-1] + s[26]
elif len(s) == 83:
- return s[52] + s[81:55:-1] + s[2] + s[54:52:-1] + s[82] + s[51:36:-1] + s[55] + s[35:2:-1] + s[36]
+ return s[:81]
elif len(s) == 82:
return s[36] + s[79:67:-1] + s[81] + s[66:40:-1] + s[33] + s[39:36:-1] + s[40] + s[35] + s[0] + s[67] + s[32:0:-1] + s[34]
if self._downloader is None:
return
- username = None
- password = None
- downloader_params = self._downloader.params
-
- # Attempt to use provided username and password or .netrc data
- if downloader_params.get('username', None) is not None:
- username = downloader_params['username']
- password = downloader_params['password']
- elif downloader_params.get('usenetrc', False):
- try:
- info = netrc.netrc().authenticators(self._NETRC_MACHINE)
- if info is not None:
- username = info[0]
- password = info[2]
- else:
- raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
- except (IOError, netrc.NetrcParseError) as err:
- self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
- return
-
# Set language
request = compat_urllib_request.Request(self._LANG_URL)
try:
self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
return
+ (username, password) = self._get_login_info()
+
# No authentication to be performed
if username is None:
return
# Get video info
self.report_video_info_webpage_download(video_id)
- for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
- video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
- % (video_id, el_type))
+ if re.search(r'player-age-gate-content">', video_webpage) is not None:
+ self.report_age_confirmation()
+ age_gate = True
+ # We simulate the access to the video from www.youtube.com/v/{video_id}
+ # this can be viewed without login into Youtube
+ data = compat_urllib_parse.urlencode({'video_id': video_id,
+ 'el': 'embedded',
+ 'gl': 'US',
+ 'hl': 'en',
+ 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
+ 'asv': 3,
+ 'sts':'1588',
+ })
+ video_info_url = 'https://www.youtube.com/get_video_info?' + data
video_info_webpage = self._download_webpage(video_info_url, video_id,
note=False,
errnote='unable to download video info webpage')
video_info = compat_parse_qs(video_info_webpage)
- if 'token' in video_info:
- break
+ else:
+ age_gate = False
+ for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
+ video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
+ % (video_id, el_type))
+ video_info_webpage = self._download_webpage(video_info_url, video_id,
+ note=False,
+ errnote='unable to download video info webpage')
+ video_info = compat_parse_qs(video_info_webpage)
+ if 'token' in video_info:
+ break
if 'token' not in video_info:
if 'reason' in video_info:
raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
# thumbnail image
- if 'thumbnail_url' not in video_info:
+ # We try first to get a high quality image:
+ m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
+ video_webpage, re.DOTALL)
+ if m_thumb is not None:
+ video_thumbnail = m_thumb.group(1)
+ elif 'thumbnail_url' not in video_info:
self._downloader.report_warning(u'unable to extract video thumbnail')
video_thumbnail = ''
else: # don't panic if we can't find it
elif 's' in url_data:
if self._downloader.params.get('verbose'):
s = url_data['s'][0]
- player = self._search_regex(r'html5player-(.+?)\.js', video_webpage,
- 'html5 player', fatal=False)
- self.to_screen('encrypted signature length %d (%d.%d), itag %s, html5 player %s' %
+ if age_gate:
+ player_version = self._search_regex(r'ad3-(.+?)\.swf',
+ video_info['ad3_module'][0], 'flash player',
+ fatal=False)
+ player = 'flash player %s' % player_version
+ else:
+ player = u'html5 player %s' % self._search_regex(r'html5player-(.+?)\.js', video_webpage,
+ 'html5 player', fatal=False)
+ self.to_screen('encrypted signature length %d (%d.%d), itag %s, %s' %
(len(s), len(s.split('.')[0]), len(s.split('.')[1]), url_data['itag'][0], player))
signature = self._decrypt_signature(url_data['s'][0])
url += '&signature=' + signature
videos = [v[1] for v in sorted(videos)]
- url_results = [self.url_result(url, 'Youtube') for url in videos]
+ url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
return [self.playlist_result(url_results, playlist_id, playlist_title)]
self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
- url_entries = [self.url_result(url, 'Youtube') for url in urls]
+ url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
return [self.playlist_result(url_entries, channel_id)]
pagenum += 1
urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
- url_results = [self.url_result(url, 'Youtube') for url in urls]
+ url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
return [self.playlist_result(url_results, playlist_title = username)]
class YoutubeSearchIE(SearchInfoExtractor):
m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist') for season in m_seasons]
+
+
class YoutubeSubscriptionsIE(YoutubeIE):
    """Extractor for the logged-in user's YouTube subscriptions feed.

    Subclasses YoutubeIE (rather than a plain InfoExtractor) so that
    YoutubeIE's login machinery can be reused; valid credentials are
    required (see _real_initialize below).
    """
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_NAME = u'youtube:subscriptions'
    # AJAX endpoint returning a JSON document with an HTML fragment
    # ('feed_html') plus a 'paging' field; %s is filled with the offset.
    _FEED_TEMPLATE = 'http://www.youtube.com/feed_ajax?action_load_system_feed=1&feed_name=subscriptions&paging=%s'
    # Items served per feed page; used to compute the paging offset.
    _PAGING_STEP = 30

    # Overwrite YoutubeIE properties we don't want
    _TESTS = []
    @classmethod
    def suitable(cls, url):
        # Bypass YoutubeIE.suitable (which explicitly excludes this IE's
        # URLs) and match only the feed URL / ':ytsubs' pseudo-URL.
        return re.match(cls._VALID_URL, url) is not None

    def _real_initialize(self):
        # The feed is per-account: fail early and visibly when there are
        # no credentials instead of downloading an anonymous/empty feed.
        (username, password) = self._get_login_info()
        if username is None:
            raise ExtractorError(u'No login info available, needed for downloading the Youtube subscriptions.', expected=True)
        super(YoutubeSubscriptionsIE, self)._real_initialize()

    def _real_extract(self, url):
        feed_entries = []
        # The step argument is available only in 2.7 or higher
        for i in itertools.count(0):
            paging = i*self._PAGING_STEP
            info = self._download_webpage(self._FEED_TEMPLATE % paging, 'feed',
                                          u'Downloading page %s' % i)
            info = json.loads(info)
            feed_html = info['feed_html']
            # Video ids appear as "/watch?v=<id>" links inside the HTML
            # fragment; orderedSet dedupes while preserving feed order.
            m_ids = re.finditer(r'"/watch\?v=(.*?)"', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
            # A null 'paging' value marks the final page of the feed.
            if info['paging'] is None:
                break
        return self.playlist_result(feed_entries, playlist_title='Youtube Subscriptions')
date_str = date_str.replace(',',' ')
# %z (UTC offset) is only supported in python>=3.2
date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
- format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S']
+ format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S', '%d.%m.%Y %H:%M']
for expression in format_expressions:
try:
upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
pass
return upload_date
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess a media file extension from a URL.

    Strips any query string, then takes the text after the last dot.
    If that guess is purely alphanumeric it is returned; otherwise
    (no dot at all, or the last dot belongs to the host/path, so the
    "extension" contains slashes etc.) *default_ext* is returned.

    The new *default_ext* parameter generalizes the previously
    hard-coded u'unknown_video' fallback; its default value keeps the
    original behaviour for existing callers.
    """
    guess = url.partition(u'?')[0].rpartition(u'.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    else:
        return default_ext
+
def date_from_str(date_str):
"""
Return a datetime object from a string in the format YYYYMMDD or
-__version__ = '2013.07.02'
+__version__ = '2013.07.10'