Merge tag 'upstream/2013.07.10'

author Rogério Brito <rbrito@ime.usp.br>

Wed, 10 Jul 2013 21:15:07 +0000 (18:15 -0300)

committer Rogério Brito <rbrito@ime.usp.br>

Wed, 10 Jul 2013 21:15:07 +0000 (18:15 -0300)
author Rogério Brito <rbrito@ime.usp.br>
Wed, 10 Jul 2013 21:15:07 +0000 (18:15 -0300)
committer Rogério Brito <rbrito@ime.usp.br>
Wed, 10 Jul 2013 21:15:07 +0000 (18:15 -0300)
diff --git a/devscripts/youtube_genalgo.py b/devscripts/youtube_genalgo.py

index c3d69e6f445af3846081af3bdea0f8c9a2a7063c..150c88d1754c4cfcb6f79b20ef559406f0dc2937 100644 (file)
--- a/devscripts/youtube_genalgo.py
+++ b/devscripts/youtube_genalgo.py
@@ -20,9 +20,9 @@ tests = [
      # 84
      ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?>.<",
       "<.>?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOIUYTREWe098765432rmnbvcxzasdfghjklpoiuyt1"),
-    # 83
+    # 83 - vfl26ng3K 2013/07/10
      ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<",
-     "D.>/?;}[{=+_)(*&^%$#!MNBVCXeAS<FGHJKLPOIUYTREWZ0987654321mnbvcxzasdfghjklpoiuytrQ"),
+     "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>"),
      # 82
      ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.<",
       "Q>/?;}[{=+-(*<^%$#@!MNBVCXZASDFGHKLPOIUY8REWT0q&7654321mnbvcxzasdfghjklpoiuytrew9"),
diff --git a/test/test_all_urls.py b/test/test_all_urls.py

index 39a5ee33a70d181b7e6e33786f2d6fedb044c3a3..c73d0e4679853b3e80bd9832a57b0d804265895a 100644 (file)
--- a/test/test_all_urls.py
+++ b/test/test_all_urls.py
@@ -61,6 +61,17 @@ class TestAllURLsMatching(unittest.TestCase):
                  else:
                      self.assertFalse(ie.suitable(url), '%s should not match URL %r' % (type(ie).__name__, url))
  
+    def test_keywords(self):
+        ies = gen_extractors()
+        matching_ies = lambda url: [ie.IE_NAME for ie in ies
+                                    if ie.suitable(url) and ie.IE_NAME != 'generic']
+        self.assertEqual(matching_ies(':ytsubs'), ['youtube:subscriptions'])
+        self.assertEqual(matching_ies(':ytsubscriptions'), ['youtube:subscriptions'])
+        self.assertEqual(matching_ies(':thedailyshow'), ['ComedyCentral'])
+        self.assertEqual(matching_ies(':tds'), ['ComedyCentral'])
+        self.assertEqual(matching_ies(':colbertreport'), ['ComedyCentral'])
+        self.assertEqual(matching_ies(':cr'), ['ComedyCentral'])
+
  
  if __name__ == '__main__':
      unittest.main()
diff --git a/test/test_youtube_sig.py b/test/test_youtube_sig.py

index e87b6259bef3841e900ae0c9d98c03d6c30e0f20..e7660424484ef2d4cb43410ab1478d79734870f5 100755 (executable)
--- a/test/test_youtube_sig.py
+++ b/test/test_youtube_sig.py
@@ -45,7 +45,7 @@ class TestYoutubeSig(unittest.TestCase):
  
      def test_83(self):
          wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<"
-        right = "D.>/?;}[{=+_)(*&^%$#!MNBVCXeAS<FGHJKLPOIUYTREWZ0987654321mnbvcxzasdfghjklpoiuytrQ"
+        right = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>"
          self.assertEqual(sig(wrong), right)
  
      def test_82(self):
diff --git a/youtube-dl b/youtube-dl

index 4ddbeccf5cd09981c234ab44431a355006bd930d..982454368362308e844a4969f985c4ba52357b12 100755 (executable)

Binary files a/youtube-dl and b/youtube-dl differ
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py

index 41efc57d4d7470ccb24797050964612a4351008d..934419c4343a9d078ceffa7c91be9a21b4600888 100644 (file)
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -1,15 +1,18 @@
-
+from .archiveorg import ArchiveOrgIE
  from .ard import ARDIE
  from .arte import ArteTvIE
  from .auengine import AUEngineIE
  from .bandcamp import BandcampIE
  from .bliptv import BlipTVIE, BlipTVUserIE
  from .breakcom import BreakIE
+from .brightcove import BrightcoveIE
  from .collegehumor import CollegeHumorIE
  from .comedycentral import ComedyCentralIE
  from .cspan import CSpanIE
  from .dailymotion import DailymotionIE
  from .depositfiles import DepositFilesIE
+from .dotsub import DotsubIE
+from .dreisat import DreiSatIE
  from .eighttracks import EightTracksIE
  from .escapist import EscapistIE
  from .facebook import FacebookIE
@@ -55,6 +58,7 @@ from .tumblr import TumblrIE
  from .tutv import TutvIE
  from .ustream import UstreamIE
  from .vbox7 import Vbox7IE
+from .veoh import VeohIE
  from .vevo import VevoIE
  from .vimeo import VimeoIE
  from .vine import VineIE
@@ -68,7 +72,15 @@ from .yahoo import YahooIE, YahooSearchIE
  from .youjizz import YouJizzIE
  from .youku import YoukuIE
  from .youporn import YouPornIE
-from .youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE, YoutubeShowIE
+from .youtube import (
+    YoutubeIE,
+    YoutubePlaylistIE,
+    YoutubeSearchIE,
+    YoutubeUserIE,
+    YoutubeChannelIE,
+    YoutubeShowIE,
+    YoutubeSubscriptionsIE,
+)
  from .zdf import ZDFIE
  
  
diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py

new file mode 100644 (file)

index 0000000..29cb9bd
--- /dev/null
+++ b/youtube_dl/extractor/archiveorg.py
@@ -0,0 +1,66 @@
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    unified_strdate,
+)
+
+
+class ArchiveOrgIE(InfoExtractor):
+    IE_NAME = 'archive.org'
+    IE_DESC = 'archive.org videos'
+    _VALID_URL = r'(?:https?://)?(?:www\.)?archive.org/details/(?P<id>[^?/]+)(?:[?].*)?$'
+    _TEST = {
+        u"url": u"http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect",
+        u'file': u'XD300-23_68HighlightsAResearchCntAugHumanIntellect.ogv',
+        u'md5': u'8af1d4cf447933ed3c7f4871162602db',
+        u'info_dict': {
+            u"title": u"1968 Demo - FJCC Conference Presentation Reel #1",
+            u"description": u"Reel 1 of 3: Also known as the \"Mother of All Demos\", Doug Engelbart's presentation at the Fall Joint Computer Conference in San Francisco, December 9, 1968 titled \"A Research Center for Augmenting Human Intellect.\" For this presentation, Doug and his team astonished the audience by not only relating their research, but demonstrating it live. This was the debut of the mouse, interactive computing, hypermedia, computer supported software engineering, video teleconferencing, etc. See also <a href=\"http://dougengelbart.org/firsts/dougs-1968-demo.html\" rel=\"nofollow\">Doug's 1968 Demo page</a> for more background, highlights, links, and the detailed paper published in this conference proceedings. Filmed on 3 reels: Reel 1 | <a href=\"http://www.archive.org/details/XD300-24_68HighlightsAResearchCntAugHumanIntellect\" rel=\"nofollow\">Reel 2</a> | <a href=\"http://www.archive.org/details/XD300-25_68HighlightsAResearchCntAugHumanIntellect\" rel=\"nofollow\">Reel 3</a>",
+            u"upload_date": u"19681210",
+            u"uploader": u"SRI International"
+        }
+    }
+
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        json_url = url + (u'?' if u'?' in url else '&') + u'output=json'
+        json_data = self._download_webpage(json_url, video_id)
+        data = json.loads(json_data)
+
+        title = data['metadata']['title'][0]
+        description = data['metadata']['description'][0]
+        uploader = data['metadata']['creator'][0]
+        upload_date = unified_strdate(data['metadata']['date'][0])
+
+        formats = [{
+                'format': fdata['format'],
+                'url': 'http://' + data['server'] + data['dir'] + fn,
+                'file_size': int(fdata['size']),
+            }
+            for fn,fdata in data['files'].items()
+            if 'Video' in fdata['format']]
+        formats.sort(key=lambda fdata: fdata['file_size'])
+
+        info = {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'description': description,
+            'uploader': uploader,
+            'upload_date': upload_date,
+        }
+        thumbnail = data.get('misc', {}).get('image')
+        if thumbnail:
+            info['thumbnail'] = thumbnail
+
+        # TODO: Remove when #980 has been merged
+        info['url'] = formats[-1]['url']
+        info['ext'] = determine_ext(formats[-1]['url'])
+
+        return self.video_result(info)
+\ No newline at end of file
diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py

index 5793a4129d21c8c36e20d1b57607629e3adb94fd..dbf8eed9901527edfa4001719937c070940cec85 100644 (file)
--- a/youtube_dl/extractor/ard.py
+++ b/youtube_dl/extractor/ard.py
@@ -32,7 +32,7 @@ class ARDIE(InfoExtractor):
          # determine title and media streams from webpage
          html = self._download_webpage(url, video_id)
          title = re.search(self._TITLE, html).group('title')
-        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
+        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
          if not streams:
              assert '"fsk"' in html
              raise ExtractorError(u'This video is only available after 8:00 pm')
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py

index 183274eb75f0947d11434867f30ba3ef9f6fcc0c..e7a91a1eb5e835c9b6e8bd9f16302a9bc7a8bf90 100644 (file)
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -1,11 +1,9 @@
  import re
  import json
+import xml.etree.ElementTree
  
  from .common import InfoExtractor
  from ..utils import (
-    # This is used by the not implemented extractLiveStream method
-    compat_urllib_parse,
-
      ExtractorError,
      unified_strdate,
  )
@@ -16,8 +14,8 @@ class ArteTvIE(InfoExtractor):
      www.arte.tv/guide, the extraction process is different for each one.
      The videos expire in 7 days, so we can't add tests.
      """
-    _EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?:fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
-    _VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?:fr|de)/.*-(?P<id>.*?).html'
+    _EMISSION_URL = r'(?:http://)?www\.arte.tv/guide/(?P<lang>fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
+    _VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?P<lang>fr|de)/.*-(?P<id>.*?).html'
      _LIVE_URL = r'index-[0-9]+\.html$'
  
      IE_NAME = u'arte.tv'
@@ -27,6 +25,7 @@ class ArteTvIE(InfoExtractor):
          return any(re.match(regex, url) for regex in (cls._EMISSION_URL, cls._VIDEOS_URL))
  
      # TODO implement Live Stream
+    # from ..utils import compat_urllib_parse
      # def extractLiveStream(self, url):
      #     video_lang = url.split('/')[-4]
      #     info = self.grep_webpage(
@@ -56,23 +55,24 @@ class ArteTvIE(InfoExtractor):
      def _real_extract(self, url):
          mobj = re.match(self._EMISSION_URL, url)
          if mobj is not None:
-            name = mobj.group('name')
+            lang = mobj.group('lang')
              # This is not a real id, it can be for example AJT for the news
              # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
              video_id = mobj.group('id')
-            return self._extract_emission(url, video_id)
+            return self._extract_emission(url, video_id, lang)
  
          mobj = re.match(self._VIDEOS_URL, url)
          if mobj is not None:
              id = mobj.group('id')
-            return self._extract_video(url, id)
+            lang = mobj.group('lang')
+            return self._extract_video(url, id, lang)
  
          if re.search(self._LIVE_URL, video_id) is not None:
              raise ExtractorError(u'Arte live streams are not yet supported, sorry')
              # self.extractLiveStream(url)
              # return
  
-    def _extract_emission(self, url, video_id):
+    def _extract_emission(self, url, video_id, lang):
          """Extract from www.arte.tv/guide"""
          webpage = self._download_webpage(url, video_id)
          json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')
@@ -91,6 +91,16 @@ class ArteTvIE(InfoExtractor):
                       }
  
          formats = player_info['VSR'].values()
+        def _match_lang(f):
+            # Return true if that format is in the language of the url
+            if lang == 'fr':
+                l = 'F'
+            elif lang == 'de':
+                l = 'A'
+            regexes = [r'VO?%s' % l, r'V%s-ST.' % l]
+            return any(re.match(r, f['versionCode']) for r in regexes)
+        # Some formats may not be in the same language as the url
+        formats = filter(_match_lang, formats)
          # We order the formats by quality
          formats = sorted(formats, key=lambda f: int(f['height']))
          # Pick the best quality
@@ -103,13 +113,15 @@ class ArteTvIE(InfoExtractor):
  
          return info_dict
  
-    def _extract_video(self, url, video_id):
+    def _extract_video(self, url, video_id, lang):
          """Extract from videos.arte.tv"""
-        config_xml_url = url.replace('/videos/', '/do_delegate/videos/')
-        config_xml_url = config_xml_url.replace('.html', ',view,asPlayerXml.xml')
-        config_xml = self._download_webpage(config_xml_url, video_id)
-        config_xml_url = self._html_search_regex(r'<video lang=".*?" ref="(.*?)"', config_xml, 'config xml url')
-        config_xml = self._download_webpage(config_xml_url, video_id)
+        ref_xml_url = url.replace('/videos/', '/do_delegate/videos/')
+        ref_xml_url = ref_xml_url.replace('.html', ',view,asPlayerXml.xml')
+        ref_xml = self._download_webpage(ref_xml_url, video_id, note=u'Downloading metadata')
+        ref_xml_doc = xml.etree.ElementTree.fromstring(ref_xml)
+        config_node = ref_xml_doc.find('.//video[@lang="%s"]' % lang)
+        config_xml_url = config_node.attrib['ref']
+        config_xml = self._download_webpage(config_xml_url, video_id, note=u'Downloading configuration')
  
          video_urls = list(re.finditer(r'<url quality="(?P<quality>.*?)">(?P<url>.*?)</url>', config_xml))
          def _key(m):
diff --git a/youtube_dl/extractor/auengine.py b/youtube_dl/extractor/auengine.py

index 3b4ade3bf7174fbb33b9e0ce2d193912c0f93a6c..0febbff4f6c42afd10f8dbc13ea9df883edae4c6 100644 (file)
--- a/youtube_dl/extractor/auengine.py
+++ b/youtube_dl/extractor/auengine.py
@@ -8,6 +8,14 @@ from ..utils import (
  )
  
  class AUEngineIE(InfoExtractor):
+    _TEST = {
+        u'url': u'http://auengine.com/embed.php?file=lfvlytY6&w=650&h=370',
+        u'file': u'lfvlytY6.mp4',
+        u'md5': u'48972bdbcf1a3a2f5533e62425b41d4f',
+        u'info_dict': {
+            u"title": u"[Commie]The Legend of the Legendary Heroes - 03 - Replication Eye (Alpha Stigma)[F9410F5A]"
+        }
+    }
      _VALID_URL = r'(?:http://)?(?:www\.)?auengine\.com/embed.php\?.*?file=([^&]+).*?'
  
      def _real_extract(self, url):
diff --git a/youtube_dl/extractor/bliptv.py b/youtube_dl/extractor/bliptv.py

index 37141e6a0a9016a39c9ef0f254c3e0599109baa5..08b28c994272e3461bba5d99856928ff6adb6cf3 100644 (file)
--- a/youtube_dl/extractor/bliptv.py
+++ b/youtube_dl/extractor/bliptv.py
@@ -27,7 +27,7 @@ class BlipTVIE(InfoExtractor):
      _TEST = {
          u'url': u'http://blip.tv/cbr/cbr-exclusive-gotham-city-imposters-bats-vs-jokerz-short-3-5796352',
          u'file': u'5779306.m4v',
-        u'md5': u'b2d849efcf7ee18917e4b4d9ff37cafe',
+        u'md5': u'80baf1ec5c3d2019037c1c707d676b9f',
          u'info_dict': {
              u"upload_date": u"20111205", 
              u"description": u"md5:9bc31f227219cde65e47eeec8d2dc596", 
@@ -103,7 +103,12 @@ class BlipTVIE(InfoExtractor):
                      data = json_data
  
                  upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
-                video_url = data['media']['url']
+                if 'additionalMedia' in data:
+                    formats = sorted(data['additionalMedia'], key=lambda f: int(f['media_height']))
+                    best_format = formats[-1]
+                    video_url = best_format['url']
+                else:
+                    video_url = data['media']['url']
                  umobj = re.match(self._URL_EXT, video_url)
                  if umobj is None:
                      raise ValueError('Can not determine filename extension')
@@ -184,5 +189,5 @@ class BlipTVUserIE(InfoExtractor):
              pagenum += 1
  
          urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
-        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
+        url_entries = [self.url_result(vurl, 'BlipTV') for vurl in urls]
          return [self.playlist_result(url_entries, playlist_title = username)]
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py

new file mode 100644 (file)

index 0000000..f85acbb
--- /dev/null
+++ b/youtube_dl/extractor/brightcove.py
@@ -0,0 +1,32 @@
+import re
+import json
+
+from .common import InfoExtractor
+
+class BrightcoveIE(InfoExtractor):
+    _VALID_URL = r'http://.*brightcove\.com/.*\?(?P<query>.*videoPlayer=(?P<id>\d*).*)'
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        query = mobj.group('query')
+        video_id = mobj.group('id')
+
+        request_url = 'http://c.brightcove.com/services/viewer/htmlFederated?%s' % query
+        webpage = self._download_webpage(request_url, video_id)
+
+        self.report_extraction(video_id)
+        info = self._search_regex(r'var experienceJSON = ({.*?});', webpage, 'json')
+        info = json.loads(info)['data']
+        video_info = info['programmedContent']['videoPlayer']['mediaDTO']
+        renditions = video_info['renditions']
+        renditions = sorted(renditions, key=lambda r: r['size'])
+        best_format = renditions[-1]
+        
+        return {'id': video_id,
+                'title': video_info['displayName'],
+                'url': best_format['defaultURL'], 
+                'ext': 'mp4',
+                'description': video_info.get('shortDescription'),
+                'thumbnail': video_info.get('videoStillURL') or video_info.get('thumbnailURL'),
+                'uploader': video_info.get('publisherName'),
+                }
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index 655836ff6efca2ef95e80631d528901d6ae28d46..1d98222ce6518398fd2a1381100400d5bacb99c0 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -3,6 +3,7 @@ import os
  import re
  import socket
  import sys
+import netrc
  
  from ..utils import (
      compat_http_client,
@@ -36,6 +37,8 @@ class InfoExtractor(object):
      The following fields are optional:
  
      format:         The video format, defaults to ext (used for --get-format)
+    thumbnails:     A list of dictionaries (with the entries "resolution" and
+                    "url") for the varying thumbnails
      thumbnail:      Full URL to a video thumbnail image.
      description:    One-line video description.
      uploader:       Full name of the video uploader.
@@ -161,6 +164,10 @@ class InfoExtractor(object):
          """Report attempt to confirm age."""
          self.to_screen(u'Confirming age')
  
+    def report_login(self):
+        """Report attempt to log in."""
+        self.to_screen(u'Logging in')
+
      #Methods for following #608
      #They set the correct value of the '_type' key
      def video_result(self, video_info):
@@ -225,6 +232,36 @@ class InfoExtractor(object):
          else:
              return res
  
+    def _get_login_info(self):
+        """
+        Get the login info as (username, password)
+        It will look in the netrc file using the _NETRC_MACHINE value
+        If there's no info available, return (None, None)
+        """
+        if self._downloader is None:
+            return (None, None)
+
+        username = None
+        password = None
+        downloader_params = self._downloader.params
+
+        # Attempt to use provided username and password or .netrc data
+        if downloader_params.get('username', None) is not None:
+            username = downloader_params['username']
+            password = downloader_params['password']
+        elif downloader_params.get('usenetrc', False):
+            try:
+                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
+                if info is not None:
+                    username = info[0]
+                    password = info[2]
+                else:
+                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
+            except (IOError, netrc.NetrcParseError) as err:
+                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
+        
+        return (username, password)
+
  class SearchInfoExtractor(InfoExtractor):
      """
      Base class for paged search queries extractors.
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py

index 3297a8549e38e7ddb5be2bda2053964bf70802e7..5fd2221a798403ff4832bf6992b8724bdf74f964 100644 (file)
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -1,12 +1,11 @@
  import re
+import json
  
  from .common import InfoExtractor
  from ..utils import (
      compat_urllib_request,
-    compat_urllib_parse,
  
      ExtractorError,
-    unescapeHTML,
  )
  
  class DailymotionIE(InfoExtractor):
@@ -39,33 +38,10 @@ class DailymotionIE(InfoExtractor):
  
          # Extract URL, uploader and title from webpage
          self.report_extraction(video_id)
-        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract media URL')
-        flashvars = compat_urllib_parse.unquote(mobj.group(1))
  
-        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
-            if key in flashvars:
-                max_quality = key
-                self.to_screen(u'Using %s' % key)
-                break
-        else:
-            raise ExtractorError(u'Unable to extract video URL')
-
-        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract video URL')
-
-        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
-
-        # TODO: support choosing qualities
+        video_title = self._html_search_regex(r'<meta property="og:title" content="(.*?)" />',
+                                              webpage, 'title')
  
-        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract title')
-        video_title = unescapeHTML(mobj.group('title'))
-
-        video_uploader = None
          video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
                                               # Looking for official user
                                               r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
@@ -76,6 +52,25 @@ class DailymotionIE(InfoExtractor):
          if mobj is not None:
              video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
  
+        embed_url = 'http://www.dailymotion.com/embed/video/%s' % video_id
+        embed_page = self._download_webpage(embed_url, video_id,
+                                            u'Downloading embed page')
+        info = self._search_regex(r'var info = ({.*?}),', embed_page, 'video info')
+        info = json.loads(info)
+
+        # TODO: support choosing qualities
+
+        for key in ['stream_h264_hd1080_url','stream_h264_hd_url',
+                    'stream_h264_hq_url','stream_h264_url',
+                    'stream_h264_ld_url']:
+            if info.get(key):#key in info and info[key]:
+                max_quality = key
+                self.to_screen(u'Using %s' % key)
+                break
+        else:
+            raise ExtractorError(u'Unable to extract video URL')
+        video_url = info[max_quality]
+
          return [{
              'id':       video_id,
              'url':      video_url,
@@ -83,4 +78,5 @@ class DailymotionIE(InfoExtractor):
              'upload_date':  video_upload_date,
              'title':    video_title,
              'ext':      video_extension,
+            'thumbnail': info['thumbnail_url']
          }]
diff --git a/youtube_dl/extractor/dotsub.py b/youtube_dl/extractor/dotsub.py

new file mode 100644 (file)

index 0000000..0ee9a68
--- /dev/null
+++ b/youtube_dl/extractor/dotsub.py
@@ -0,0 +1,41 @@
+import re
+import json
+import time
+
+from .common import InfoExtractor
+
+
+class DotsubIE(InfoExtractor):
+    _VALID_URL = r'(?:http://)?(?:www\.)?dotsub\.com/view/([^/]+)'
+    _TEST = {
+        u'url': u'http://dotsub.com/view/aed3b8b2-1889-4df5-ae63-ad85f5572f27',
+        u'file': u'aed3b8b2-1889-4df5-ae63-ad85f5572f27.flv',
+        u'md5': u'0914d4d69605090f623b7ac329fea66e',
+        u'info_dict': {
+            u"title": u"Pyramids of Waste (2010), AKA The Lightbulb Conspiracy - Planned obsolescence documentary",
+            u"uploader": u"4v4l0n42",
+            u'description': u'Pyramids of Waste (2010) also known as "The lightbulb conspiracy" is a documentary about how our economic system based on consumerism  and planned obsolescence is breaking our planet down.\r\n\r\nSolutions to this can be found at:\r\nhttp://robotswillstealyourjob.com\r\nhttp://www.federicopistono.org\r\n\r\nhttp://opensourceecology.org\r\nhttp://thezeitgeistmovement.com',
+            u'thumbnail': u'http://dotsub.com/media/aed3b8b2-1889-4df5-ae63-ad85f5572f27/p',
+            u'upload_date': u'20101213',
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group(1)
+        info_url = "https://dotsub.com/api/media/%s/metadata" %(video_id)
+        webpage = self._download_webpage(info_url, video_id)
+        info = json.loads(webpage)
+        date = time.gmtime(info['dateCreated']/1000) # The timestamp is in milliseconds
+
+        return [{
+            'id':          video_id,
+            'url':         info['mediaURI'],
+            'ext':         'flv',
+            'title':       info['title'],
+            'thumbnail':   info['screenshotURI'],
+            'description': info['description'],
+            'uploader':    info['user'],
+            'view_count':  info['numberOfViews'],
+            'upload_date': u'%04i%02i%02i' % (date.tm_year, date.tm_mon, date.tm_mday),
+        }]
diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py

new file mode 100644 (file)

index 0000000..847f733
--- /dev/null
+++ b/youtube_dl/extractor/dreisat.py
@@ -0,0 +1,84 @@
+# coding: utf-8
+
+import re
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    unified_strdate,
+)
+
+
+class DreiSatIE(InfoExtractor):
+    IE_NAME = '3sat'
+    _VALID_URL = r'(?:http://)?(?:www\.)?3sat.de/mediathek/index.php\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$'
+    _TEST = {
+        u"url": u"http://www.3sat.de/mediathek/index.php?obj=36983",
+        u'file': u'36983.webm',
+        u'md5': u'57c97d0469d71cf874f6815aa2b7c944',
+        u'info_dict': {
+            u"title": u"Kaffeeland Schweiz",
+            u"description": u"Über 80 Kaffeeröstereien liefern in der Schweiz das Getränk, in das das Land so vernarrt ist: Mehr als 1000 Tassen trinkt ein Schweizer pro Jahr. SCHWEIZWEIT nimmt die Kaffeekultur unter die...", 
+            u"uploader": u"3sat",
+            u"upload_date": u"20130622"
+        }
+    }
+
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id
+        details_xml = self._download_webpage(details_url, video_id, note=u'Downloading video details')
+        details_doc = xml.etree.ElementTree.fromstring(details_xml.encode('utf-8'))
+
+        thumbnail_els = details_doc.findall('.//teaserimage')
+        thumbnails = [{
+            'width': te.attrib['key'].partition('x')[0],
+            'height': te.attrib['key'].partition('x')[2],
+            'url': te.text,
+        } for te in thumbnail_els]
+
+        information_el = details_doc.find('.//information')
+        video_title = information_el.find('./title').text
+        video_description = information_el.find('./detail').text
+
+        details_el = details_doc.find('.//details')
+        video_uploader = details_el.find('./channel').text
+        upload_date = unified_strdate(details_el.find('./airtime').text)
+
+        format_els = details_doc.findall('.//formitaet')
+        formats = [{
+            'format_id': fe.attrib['basetype'],
+            'width': int(fe.find('./width').text),
+            'height': int(fe.find('./height').text),
+            'url': fe.find('./url').text,
+            'filesize': int(fe.find('./filesize').text),
+            'video_bitrate': int(fe.find('./videoBitrate').text),
+            '3sat_qualityname': fe.find('./quality').text,
+        } for fe in format_els
+            if not fe.find('./url').text.startswith('http://www.metafilegenerator.de/')]
+
+        def _sortkey(format):
+            qidx = ['low', 'med', 'high', 'veryhigh'].index(format['3sat_qualityname'])
+            prefer_http = 1 if 'rtmp' in format['url'] else 0
+            return (qidx, prefer_http, format['video_bitrate'])
+        formats.sort(key=_sortkey)
+
+        info = {
+            'id': video_id,
+            'title': video_title,
+            'formats': formats,
+            'description': video_description,
+            'thumbnails': thumbnails,
+            'thumbnail': thumbnails[-1]['url'],
+            'uploader': video_uploader,
+            'upload_date': upload_date,
+        }
+
+        # TODO: Remove when #980 has been merged
+        info['url'] = formats[-1]['url']
+        info['ext'] = determine_ext(formats[-1]['url'])
+
+        return self.video_result(info)
+\ No newline at end of file
diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py

index cec3b7ac863247e8ddc2c2add953372bd809eed4..7585b70618d1e4f92e8297fbf4d1397359a5224b 100644 (file)
--- a/youtube_dl/extractor/gamespot.py
+++ b/youtube_dl/extractor/gamespot.py
@@ -4,14 +4,15 @@ import xml.etree.ElementTree
  from .common import InfoExtractor
  from ..utils import (
      unified_strdate,
+    compat_urllib_parse,
  )
  
  class GameSpotIE(InfoExtractor):
-    _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/([^/]+)/videos/([^/]+)-([^/d]+)/'
+    _VALID_URL = r'(?:http://)?(?:www\.)?gamespot\.com/.*-(?P<page_id>\d+)/?'
      _TEST = {
          u"url": u"http://www.gamespot.com/arma-iii/videos/arma-iii-community-guide-sitrep-i-6410818/",
          u"file": u"6410818.mp4",
-        u"md5": u"5569d64ca98db01f0177c934fe8c1e9b",
+        u"md5": u"b2a30deaa8654fcccd43713a6b6a4825",
          u"info_dict": {
              u"title": u"Arma III - Community Guide: SITREP I",
              u"upload_date": u"20130627", 
@@ -21,13 +22,22 @@ class GameSpotIE(InfoExtractor):
  
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group(3).split("-")[-1]
-        info_url = "http://www.gamespot.com/pages/video_player/xml.php?id="+str(video_id)
+        page_id = mobj.group('page_id')
+        webpage = self._download_webpage(url, page_id)
+        video_id = self._html_search_regex([r'"og:video" content=".*?\?id=(\d+)"',
+                                            r'http://www\.gamespot\.com/videoembed/(\d+)'],
+                                           webpage, 'video id')
+        data = compat_urllib_parse.urlencode({'id': video_id, 'newplayer': '1'})
+        info_url = 'http://www.gamespot.com/pages/video_player/xml.php?' + data
          info_xml = self._download_webpage(info_url, video_id)
          doc = xml.etree.ElementTree.fromstring(info_xml)
          clip_el = doc.find('./playList/clip')
  
-        video_url = clip_el.find('./URI').text
+        http_urls = [{'url': node.find('filePath').text,
+                      'rate': int(node.find('rate').text)}
+            for node in clip_el.find('./httpURI')]
+        best_quality = sorted(http_urls, key=lambda f: f['rate'])[-1]
+        video_url = best_quality['url']
          title = clip_el.find('./title').text
          ext = video_url.rpartition('.')[2]
          thumbnail_url = clip_el.find('./screenGrabURI').text
diff --git a/youtube_dl/extractor/tudou.py b/youtube_dl/extractor/tudou.py

index 4681a6f797ce17bdfc96381308991d46a6e96cd6..1405b73f76ad5166d45d9a9eb9687c49fa8a0bde 100644 (file)
--- a/youtube_dl/extractor/tudou.py
+++ b/youtube_dl/extractor/tudou.py
@@ -1,24 +1,34 @@
+# coding: utf-8
+
  import re
+import json
  
  from .common import InfoExtractor
  
  
  class TudouIE(InfoExtractor):
-    _VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/(?:listplay|programs)/(?:view|(.+?))/(?:([^/]+)|([^/]+)\.html)'
+    _VALID_URL = r'(?:http://)?(?:www\.)?tudou\.com/(?:listplay|programs)/(?:view|(.+?))/(?:([^/]+)|([^/]+))(?:\.html)?'
      _TEST = {
          u'url': u'http://www.tudou.com/listplay/zzdE77v6Mmo/2xN2duXMxmw.html',
-        u'file': u'159447792.f4v',
-        u'md5': u'ad7c358a01541e926a1e413612c6b10a',
+        u'file': u'159448201.f4v',
+        u'md5': u'140a49ed444bd22f93330985d8475fcb',
          u'info_dict': {
-            u"title": u"\u5361\u9a6c\u4e54\u56fd\u8db3\u5f00\u5927\u811a\u957f\u4f20\u51b2\u540a\u96c6\u9526"
+            u"title": u"卡马乔国足开大脚长传冲吊集锦"
          }
      }
  
+    def _url_for_id(self, id, quality = None):
+        info_url = "http://v2.tudou.com/f?id="+str(id)
+        if quality:
+            info_url += '&hd' + quality
+        webpage = self._download_webpage(info_url, id, "Opening the info webpage")
+        final_url = self._html_search_regex('>(.+?)</f>',webpage, 'video url')
+        return final_url
+
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group(2).replace('.html','')
+        video_id = mobj.group(2)
          webpage = self._download_webpage(url, video_id)
-        video_id = re.search('"k":(.+?),',webpage).group(1)
          title = re.search(",kw:\"(.+)\"",webpage)
          if title is None:
              title = re.search(",kw: \'(.+)\'",webpage)
@@ -27,14 +37,27 @@ class TudouIE(InfoExtractor):
          if thumbnail_url is None:
              thumbnail_url = re.search(",pic:\"(.+?)\"",webpage)
          thumbnail_url = thumbnail_url.group(1)
-        info_url = "http://v2.tudou.com/f?id="+str(video_id)
-        webpage = self._download_webpage(info_url, video_id, "Opening the info webpage")
-        final_url = re.search('\>(.+?)\<\/f\>',webpage).group(1)
-        ext = (final_url.split('?')[0]).split('.')[-1]
-        return [{
-            'id':        video_id,
-            'url':       final_url,
-            'ext':       ext,
-            'title':     title,
-            'thumbnail': thumbnail_url,
-        }]
+
+        segs_json = self._search_regex(r'segs: \'(.*)\'', webpage, 'segments')
+        segments = json.loads(segs_json)
+        # It looks like the keys are the arguments that have to be passed as
+        # the hd field in the request url, we pick the higher
+        quality = sorted(segments.keys())[-1]
+        parts = segments[quality]
+        result = []
+        len_parts = len(parts)
+        if len_parts > 1:
+            self.to_screen(u'%s: found %s parts' % (video_id, len_parts))
+        for part in parts:
+            part_id = part['k']
+            final_url = self._url_for_id(part_id, quality)
+            ext = (final_url.split('?')[0]).split('.')[-1]
+            part_info = {'id': part_id,
+                          'url': final_url,
+                          'ext': ext,
+                          'title': title,
+                          'thumbnail': thumbnail_url,
+                          }
+            result.append(part_info)
+
+        return result
diff --git a/youtube_dl/extractor/veoh.py b/youtube_dl/extractor/veoh.py

new file mode 100644 (file)

index 0000000..00672c9
--- /dev/null
+++ b/youtube_dl/extractor/veoh.py
@@ -0,0 +1,47 @@
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+)
+
+class VeohIE(InfoExtractor):
+    _VALID_URL = r'http://www\.veoh\.com/watch/v(?P<id>\d*)'
+
+    _TEST = {
+        u'url': u'http://www.veoh.com/watch/v56314296nk7Zdmz3',
+        u'file': u'56314296.mp4',
+        u'md5': u'620e68e6a3cff80086df3348426c9ca3',
+        u'info_dict': {
+            u'title': u'Straight Backs Are Stronger',
+            u'uploader': u'LUMOback',
+            u'description': u'At LUMOback, we believe straight backs are stronger.  The LUMOback Posture & Movement Sensor:  It gently vibrates when you slouch, inspiring improved posture and mobility.  Use the app to track your data and improve your posture over time. ',
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        webpage = self._download_webpage(url, video_id)
+
+        m_youtube = re.search(r'http://www\.youtube\.com/v/(.*?)(\&|")', webpage)
+        if m_youtube is not None:
+            youtube_id = m_youtube.group(1)
+            self.to_screen(u'%s: detected Youtube video.' % video_id)
+            return self.url_result(youtube_id, 'Youtube')
+
+        self.report_extraction(video_id)
+        info = self._search_regex(r'videoDetailsJSON = \'({.*?})\';', webpage, 'info')
+        info = json.loads(info)
+        video_url =  info.get('fullPreviewHashHighPath') or info.get('fullPreviewHashLowPath')
+
+        return {'id': info['videoId'], 
+                'title': info['title'],
+                'ext': determine_ext(video_url),
+                'url': video_url,
+                'uploader': info['username'],
+                'thumbnail': info.get('highResImage') or info.get('medResImage'),
+                'description': info['description'],
+                'view_count': info['views'],
+                }
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py

index 7c4562790d1452f8bdad25a09156607b56e552fa..ac32043c1e651abaadf1a223c18e4983b9301ba7 100644 (file)
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -17,6 +17,7 @@ class VimeoIE(InfoExtractor):
  
      # _VALID_URL matches Vimeo URLs
      _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)(?:[?].*)?$'
+    _NETRC_MACHINE = 'vimeo'
      IE_NAME = u'vimeo'
      _TEST = {
          u'url': u'http://vimeo.com/56015672',
@@ -31,6 +32,25 @@ class VimeoIE(InfoExtractor):
          }
      }
  
+    def _login(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            return
+        self.report_login()
+        login_url = 'https://vimeo.com/log_in'
+        webpage = self._download_webpage(login_url, None, False)
+        token = re.search(r'xsrft: \'(.*?)\'', webpage).group(1)
+        data = compat_urllib_parse.urlencode({'email': username,
+                                              'password': password,
+                                              'action': 'login',
+                                              'service': 'vimeo',
+                                              'token': token,
+                                              })
+        login_request = compat_urllib_request.Request(login_url, data)
+        login_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+        login_request.add_header('Cookie', 'xsrft=%s' % token)
+        self._download_webpage(login_request, None, False, u'Wrong login info')
+
      def _verify_video_password(self, url, video_id, webpage):
          password = self._downloader.params.get('videopassword', None)
          if password is None:
@@ -50,6 +70,9 @@ class VimeoIE(InfoExtractor):
                                 u'Verifying the password',
                                 u'Wrong password')
  
+    def _real_initialize(self):
+        self._login()
+
      def _real_extract(self, url, new_video=True):
          # Extract ID from URL
          mobj = re.match(self._VALID_URL, url)
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py

index 14a8bd6ea9ead38b15ae7fe41366248e6507882b..afb655c04f8f5495b7872d4ce72fbe7d8a7db583 100644 (file)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -4,6 +4,7 @@ import json
  import netrc
  import re
  import socket
+import itertools
  
  from .common import InfoExtractor, SearchInfoExtractor
  from ..utils import (
@@ -19,6 +20,7 @@ from ..utils import (
      ExtractorError,
      unescapeHTML,
      unified_strdate,
+    orderedSet,
  )
  
  
@@ -115,24 +117,32 @@ class YoutubeIE(InfoExtractor):
                  u"uploader": u"IconaPop",
                  u"uploader_id": u"IconaPop"
              }
-        }
+        },
+        {
+            u"url":  u"https://www.youtube.com/watch?v=07FYdnEawAQ",
+            u"file":  u"07FYdnEawAQ.mp4",
+            u"note": u"Test VEVO video with age protection (#956)",
+            u"info_dict": {
+                u"upload_date": u"20130703",
+                u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
+                u"description": u"md5:64249768eec3bc4276236606ea996373",
+                u"uploader": u"justintimberlakeVEVO",
+                u"uploader_id": u"justintimberlakeVEVO"
+            }
+        },
      ]
  
  
      @classmethod
      def suitable(cls, url):
          """Receives a URL and returns True if suitable for this IE."""
-        if YoutubePlaylistIE.suitable(url): return False
+        if YoutubePlaylistIE.suitable(url) or YoutubeSubscriptionsIE.suitable(url): return False
          return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
  
      def report_lang(self):
          """Report attempt to set language."""
          self.to_screen(u'Setting language')
  
-    def report_login(self):
-        """Report attempt to log in."""
-        self.to_screen(u'Logging in')
-
      def report_video_webpage_download(self, video_id):
          """Report attempt to download video webpage."""
          self.to_screen(u'%s: Downloading video webpage' % video_id)
@@ -180,7 +190,7 @@ class YoutubeIE(InfoExtractor):
          elif len(s) == 84:
              return s[83:36:-1] + s[2] + s[35:26:-1] + s[3] + s[25:3:-1] + s[26]
          elif len(s) == 83:
-            return s[52] + s[81:55:-1] + s[2] + s[54:52:-1] + s[82] + s[51:36:-1] + s[55] + s[35:2:-1] + s[36]
+            return s[:81]
          elif len(s) == 82:
              return s[36] + s[79:67:-1] + s[81] + s[66:40:-1] + s[33] + s[39:36:-1] + s[40] + s[35] + s[0] + s[67] + s[32:0:-1] + s[34]
  
@@ -294,26 +304,6 @@ class YoutubeIE(InfoExtractor):
          if self._downloader is None:
              return
  
-        username = None
-        password = None
-        downloader_params = self._downloader.params
-
-        # Attempt to use provided username and password or .netrc data
-        if downloader_params.get('username', None) is not None:
-            username = downloader_params['username']
-            password = downloader_params['password']
-        elif downloader_params.get('usenetrc', False):
-            try:
-                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
-                if info is not None:
-                    username = info[0]
-                    password = info[2]
-                else:
-                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
-            except (IOError, netrc.NetrcParseError) as err:
-                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
-                return
-
          # Set language
          request = compat_urllib_request.Request(self._LANG_URL)
          try:
@@ -323,6 +313,8 @@ class YoutubeIE(InfoExtractor):
              self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
              return
  
+        (username, password) = self._get_login_info()
+
          # No authentication to be performed
          if username is None:
              return
@@ -430,15 +422,35 @@ class YoutubeIE(InfoExtractor):
  
          # Get video info
          self.report_video_info_webpage_download(video_id)
-        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
-            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
-                    % (video_id, el_type))
+        if re.search(r'player-age-gate-content">', video_webpage) is not None:
+            self.report_age_confirmation()
+            age_gate = True
+            # We simulate the access to the video from www.youtube.com/v/{video_id}
+            # this can be viewed without login into Youtube
+            data = compat_urllib_parse.urlencode({'video_id': video_id,
+                                                  'el': 'embedded',
+                                                  'gl': 'US',
+                                                  'hl': 'en',
+                                                  'eurl': 'https://youtube.googleapis.com/v/' + video_id,
+                                                  'asv': 3,
+                                                  'sts':'1588',
+                                                  })
+            video_info_url = 'https://www.youtube.com/get_video_info?' + data
              video_info_webpage = self._download_webpage(video_info_url, video_id,
                                      note=False,
                                      errnote='unable to download video info webpage')
              video_info = compat_parse_qs(video_info_webpage)
-            if 'token' in video_info:
-                break
+        else:
+            age_gate = False
+            for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
+                video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
+                        % (video_id, el_type))
+                video_info_webpage = self._download_webpage(video_info_url, video_id,
+                                        note=False,
+                                        errnote='unable to download video info webpage')
+                video_info = compat_parse_qs(video_info_webpage)
+                if 'token' in video_info:
+                    break
          if 'token' not in video_info:
              if 'reason' in video_info:
                  raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
@@ -471,7 +483,12 @@ class YoutubeIE(InfoExtractor):
          video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
  
          # thumbnail image
-        if 'thumbnail_url' not in video_info:
+        # We try first to get a high quality image:
+        m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
+                            video_webpage, re.DOTALL)
+        if m_thumb is not None:
+            video_thumbnail = m_thumb.group(1)
+        elif 'thumbnail_url' not in video_info:
              self._downloader.report_warning(u'unable to extract video thumbnail')
              video_thumbnail = ''
          else:   # don't panic if we can't find it
@@ -560,9 +577,15 @@ class YoutubeIE(InfoExtractor):
                      elif 's' in url_data:
                          if self._downloader.params.get('verbose'):
                              s = url_data['s'][0]
-                            player = self._search_regex(r'html5player-(.+?)\.js', video_webpage,
-                                'html5 player', fatal=False)
-                            self.to_screen('encrypted signature length %d (%d.%d), itag %s, html5 player %s' %
+                            if age_gate:
+                                player_version = self._search_regex(r'ad3-(.+?)\.swf',
+                                    video_info['ad3_module'][0], 'flash player',
+                                    fatal=False)
+                                player = 'flash player %s' % player_version
+                            else:
+                                player = u'html5 player %s' % self._search_regex(r'html5player-(.+?)\.js', video_webpage,
+                                    'html5 player', fatal=False)
+                            self.to_screen('encrypted signature length %d (%d.%d), itag %s, %s' %
                                  (len(s), len(s.split('.')[0]), len(s.split('.')[1]), url_data['itag'][0], player))
                          signature = self._decrypt_signature(url_data['s'][0])
                          url += '&signature=' + signature
@@ -690,7 +713,7 @@ class YoutubePlaylistIE(InfoExtractor):
  
          videos = [v[1] for v in sorted(videos)]
  
-        url_results = [self.url_result(url, 'Youtube') for url in videos]
+        url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
          return [self.playlist_result(url_results, playlist_id, playlist_title)]
  
  
@@ -748,7 +771,7 @@ class YoutubeChannelIE(InfoExtractor):
          self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
  
          urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
-        url_entries = [self.url_result(url, 'Youtube') for url in urls]
+        url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
          return [self.playlist_result(url_entries, channel_id)]
  
  
@@ -805,7 +828,7 @@ class YoutubeUserIE(InfoExtractor):
              pagenum += 1
  
          urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
-        url_results = [self.url_result(url, 'Youtube') for url in urls]
+        url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
          return [self.playlist_result(url_results, playlist_title = username)]
  
  class YoutubeSearchIE(SearchInfoExtractor):
@@ -864,3 +887,40 @@ class YoutubeShowIE(InfoExtractor):
          m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
          self.to_screen(u'%s: Found %s seasons' % (show_name, len(m_seasons)))
          return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist') for season in m_seasons]
+
+
+class YoutubeSubscriptionsIE(YoutubeIE):
+    """It's a subclass of YoutubeIE because we need to login"""
+    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
+    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
+    IE_NAME = u'youtube:subscriptions'
+    _FEED_TEMPLATE = 'http://www.youtube.com/feed_ajax?action_load_system_feed=1&feed_name=subscriptions&paging=%s'
+    _PAGING_STEP = 30
+
+    # Overwrite YoutubeIE properties we don't want
+    _TESTS = []
+    @classmethod
+    def suitable(cls, url):
+        return re.match(cls._VALID_URL, url) is not None
+
+    def _real_initialize(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            raise ExtractorError(u'No login info available, needed for downloading the Youtube subscriptions.', expected=True)
+        super(YoutubeSubscriptionsIE, self)._real_initialize()
+
+    def _real_extract(self, url):
+        feed_entries = []
+        # The step argument is available only in 2.7 or higher
+        for i in itertools.count(0):
+            paging = i*self._PAGING_STEP
+            info = self._download_webpage(self._FEED_TEMPLATE % paging, 'feed',
+                                          u'Downloading page %s' % i)
+            info = json.loads(info)
+            feed_html = info['feed_html']
+            m_ids = re.finditer(r'"/watch\?v=(.*?)"', feed_html)
+            ids = orderedSet(m.group(1) for m in m_ids)
+            feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
+            if info['paging'] is None:
+                break
+        return self.playlist_result(feed_entries, playlist_title='Youtube Subscriptions')
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py

index 9137a4f70be1423c20b9c2107f15db2c91abdada..b9bff5fde87d91a5956e978c98880f05034ac6ab 100644 (file)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -623,7 +623,7 @@ def unified_strdate(date_str):
      date_str = date_str.replace(',',' ')
      # %z (UTC offset) is only supported in python>=3.2
      date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
-    format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S']
+    format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y', '%Y-%m-%d', '%d/%m/%Y', '%Y/%m/%d %H:%M:%S', '%d.%m.%Y %H:%M']
      for expression in format_expressions:
          try:
              upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
@@ -631,6 +631,13 @@ def unified_strdate(date_str):
              pass
      return upload_date
  
+def determine_ext(url):
+    guess = url.partition(u'?')[0].rpartition(u'.')[2]
+    if re.match(r'^[A-Za-z0-9]+$', guess):
+        return guess
+    else:
+        return u'unknown_video'
+
  def date_from_str(date_str):
      """
      Return a datetime object from a string in the format YYYYMMDD or
diff --git a/youtube_dl/version.py b/youtube_dl/version.py

index bc4ad90be20ab33bd81971ad37c2f40b639410f5..2f20826c24a316f575e004ae2f02adf93e942b9a 100644 (file)
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,2 +1,2 @@
  
-__version__ = '2013.07.02'
+__version__ = '2013.07.10'
author	Rogério Brito <rbrito@ime.usp.br>
	Wed, 10 Jul 2013 21:15:07 +0000 (18:15 -0300)
committer	Rogério Brito <rbrito@ime.usp.br>
	Wed, 10 Jul 2013 21:15:07 +0000 (18:15 -0300)
devscripts/youtube_genalgo.py		patch \| blob \| history
test/test_all_urls.py		patch \| blob \| history
test/test_youtube_sig.py		patch \| blob \| history
youtube-dl		patch \| blob \| history
youtube_dl/extractor/__init__.py		patch \| blob \| history
youtube_dl/extractor/archiveorg.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/ard.py		patch \| blob \| history
youtube_dl/extractor/arte.py		patch \| blob \| history
youtube_dl/extractor/auengine.py		patch \| blob \| history
youtube_dl/extractor/bliptv.py		patch \| blob \| history
youtube_dl/extractor/brightcove.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/common.py		patch \| blob \| history
youtube_dl/extractor/dailymotion.py		patch \| blob \| history
youtube_dl/extractor/dotsub.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/dreisat.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/gamespot.py		patch \| blob \| history
youtube_dl/extractor/tudou.py		patch \| blob \| history
youtube_dl/extractor/veoh.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/vimeo.py		patch \| blob \| history
youtube_dl/extractor/youtube.py		patch \| blob \| history
youtube_dl/utils.py		patch \| blob \| history
youtube_dl/version.py		patch \| blob \| history