Imported Upstream version 2013.08.02

author Rogério Brito <rbrito@ime.usp.br>

Tue, 6 Aug 2013 20:36:01 +0000 (17:36 -0300)

committer Rogério Brito <rbrito@ime.usp.br>

Tue, 6 Aug 2013 20:36:01 +0000 (17:36 -0300)
author Rogério Brito <rbrito@ime.usp.br>
Tue, 6 Aug 2013 20:36:01 +0000 (17:36 -0300)
committer Rogério Brito <rbrito@ime.usp.br>
Tue, 6 Aug 2013 20:36:01 +0000 (17:36 -0300)
diff --git a/README.md b/README.md

index b246d3c53317848166351daba7abba9eddbc5359..560bcdca185494cc096c7e1ec7f5a55cf0c34732 100644 (file)
--- a/README.md
+++ b/README.md
@@ -16,7 +16,9 @@ which means you can modify it, redistribute it or use it however you like.
  # OPTIONS
      -h, --help                 print this help text and exit
      --version                  print program version and exit
-    -U, --update               update this program to latest version
+    -U, --update               update this program to latest version. Make sure
+                               that you have sufficient permissions (run with
+                               sudo if needed)
      -i, --ignore-errors        continue on download errors
      --dump-user-agent          display the current browser identification
      --user-agent UA            specify a custom user agent
diff --git a/README.txt b/README.txt

index 8f08dd27b55ed119a45f7458f15bbac767e56fab..b13711a0f6c1ccc9aa2918147e31f16209e34353 100644 (file)
--- a/README.txt
+++ b/README.txt
@@ -23,7 +23,9 @@ OPTIONS
  
      -h, --help                 print this help text and exit
      --version                  print program version and exit
-    -U, --update               update this program to latest version
+    -U, --update               update this program to latest version. Make sure
+                               that you have sufficient permissions (run with
+                               sudo if needed)
      -i, --ignore-errors        continue on download errors
      --dump-user-agent          display the current browser identification
      --user-agent UA            specify a custom user agent
diff --git a/devscripts/youtube_genalgo.py b/devscripts/youtube_genalgo.py

index 150c88d1754c4cfcb6f79b20ef559406f0dc2937..31d6ec95295f97a1382edc0fd866348a3456fca8 100644 (file)
--- a/devscripts/youtube_genalgo.py
+++ b/devscripts/youtube_genalgo.py
@@ -5,27 +5,45 @@
  import sys
  
  tests = [
+    # 92 - vflQw-fB4 2013/07/17
+    ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'`~\"",
+     "mrtyuioplkjhgfdsazxcvbnq1234567890QWERTY}IOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]\"|:;"),
+    # 90
+    ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'`",
+     "mrtyuioplkjhgfdsazxcvbne1234567890QWER[YUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={`]}|"),
      # 88
      ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<",
       "J:|}][{=+-_)(*&;%$#@>MNBVCXZASDFGH^KLPOIUYTREWQ0987654321mnbvcxzasdfghrklpoiuytej"),
-    # 87
+    # 87 - vflART1Nf 2013/07/24
      ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$^&*()_-+={[]}|:;?/>.<",
-     "!?;:|}][{=+-_)(*&^$#@/MNBVCXZASqFGHJKLPOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytr"),
-    # 86 - vfl_ymO4Z 2013/06/27
+     "tyuioplkjhgfdsazxcv<nm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$^&*()_-+={[]}|:;?/>"),
+    # 86 - vflm_D8eE 2013/07/31
      ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<",
-     "ertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!/#$%^&*()_-+={[|};?@"),
-    # 85
+     ">.1}|[{=+-_)(*&^%$#@!MNBVCXZASDFGHJK<POIUYTREW509876L432/mnbvcxzasdfghjklpoiuytre"),
+    # 85 - vflSAFCP9 2013/07/19
      ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?/>.<",
-     "{>/?;}[.=+-_)(*&^%$#@!MqBVCXZASDFwHJKLPOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytr"),
+     "ertyuiqplkjhgfdsazx$vbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#<%^&*()_-+={[};?/c"),
      # 84
      ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?>.<",
       "<.>?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOIUYTREWe098765432rmnbvcxzasdfghjklpoiuyt1"),
-    # 83 - vfl26ng3K 2013/07/10
+    # 83 - vflTWC9KW 2013/08/01
      ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<",
-     "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>"),
+     "qwertyuioplkjhg>dsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/f"),
      # 82
      ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.<",
       "Q>/?;}[{=+-(*<^%$#@!MNBVCXZASDFGHKLPOIUY8REWT0q&7654321mnbvcxzasdfghjklpoiuytrew9"),
+    # 81 - vflLC8JvQ 2013/07/25
+    ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.",
+     "C>/?;}[{=+-(*&^%$#@!MNBVYXZASDFGHKLPOIU.TREWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"),
+    # 79 - vflLC8JvQ 2013/07/25 (sporadic)
+    ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/",
+     "Z?;}[{=+-(*&^%$#@!MNBVCXRASDFGHKLPOIUYT/EWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"),
+]
+
+tests_age_gate = [
+    # 86 - vflqinMWD
+    ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<",
+     "ertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!/#$%^&*()_-+={[|};?@"),
  ]
  
  def find_matching(wrong, right):
@@ -78,6 +96,8 @@ def genall(tests):
  
  def main():
      print(genall(tests))
+    print(u'    Age gate:')
+    print(genall(tests_age_gate))
  
  if __name__ == '__main__':
      main()
diff --git a/test/test_playlists.py b/test/test_playlists.py

new file mode 100644 (file)

index 0000000..65de3a5
--- /dev/null
+++ b/test/test_playlists.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python
+
+import sys
+import unittest
+import json
+
+# Allow direct execution
+import os
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from youtube_dl.extractor import DailymotionPlaylistIE, VimeoChannelIE
+from youtube_dl.utils import *
+
+from helper import FakeYDL
+
+class TestPlaylists(unittest.TestCase):
+    def assertIsPlaylist(self, info):
+        """Make sure the info has '_type' set to 'playlist'"""
+        self.assertEqual(info['_type'], 'playlist')
+
+    def test_dailymotion_playlist(self):
+        dl = FakeYDL()
+        ie = DailymotionPlaylistIE(dl)
+        result = ie.extract('http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q')
+        self.assertIsPlaylist(result)
+        self.assertEqual(result['title'], u'SPORT')
+        self.assertTrue(len(result['entries']) > 20)
+
+    def test_vimeo_channel(self):
+        dl = FakeYDL()
+        ie = VimeoChannelIE(dl)
+        result = ie.extract('http://vimeo.com/channels/tributes')
+        self.assertIsPlaylist(result)
+        self.assertEqual(result['title'], u'Vimeo Tributes')
+        self.assertTrue(len(result['entries']) > 24)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/test_utils.py b/test/test_utils.py

index c4b71362e354bf3d748dc5d109611566e18edbaa..be1069105209ddb705ec6d63b2179a315577a1a4 100644 (file)
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -4,6 +4,7 @@
  
  import sys
  import unittest
+import xml.etree.ElementTree
  
  # Allow direct execution
  import os
@@ -16,6 +17,7 @@ from youtube_dl.utils import unescapeHTML
  from youtube_dl.utils import orderedSet
  from youtube_dl.utils import DateRange
  from youtube_dl.utils import unified_strdate
+from youtube_dl.utils import find_xpath_attr
  
  if sys.version_info < (3, 0):
      _compat_str = lambda b: b.decode('unicode-escape')
@@ -112,5 +114,18 @@ class TestUtil(unittest.TestCase):
          self.assertEqual(unified_strdate('Dec 14, 2012'), '20121214')
          self.assertEqual(unified_strdate('2012/10/11 01:56:38 +0000'), '20121011')
  
+    def test_find_xpath_attr(self):
+        testxml = u'''<root>
+            <node/>
+            <node x="a"/>
+            <node x="a" y="c" />
+            <node x="b" y="d" />
+        </root>'''
+        doc = xml.etree.ElementTree.fromstring(testxml)
+
+        self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n', 'v'), None)
+        self.assertEqual(find_xpath_attr(doc, './/node', 'x', 'a'), doc[1])
+        self.assertEqual(find_xpath_attr(doc, './/node', 'y', 'c'), doc[2])
+
  if __name__ == '__main__':
      unittest.main()
diff --git a/test/test_youtube_sig.py b/test/test_youtube_sig.py

old mode 100755 (executable)

new mode 100644 (file)

index e766042..d645c08
--- a/test/test_youtube_sig.py
+++ b/test/test_youtube_sig.py
@@ -10,12 +10,19 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  from youtube_dl.extractor.youtube import YoutubeIE
  from helper import FakeYDL
  
-sig = YoutubeIE(FakeYDL())._decrypt_signature
+ie = YoutubeIE(FakeYDL())
+sig = ie._decrypt_signature
+sig_age_gate = ie._decrypt_signature_age_gate
  
  class TestYoutubeSig(unittest.TestCase):
-    def test_43_43(self):
-        wrong = '5AEEAE0EC39677BC65FD9021CCD115F1F2DBD5A59E4.C0B243A3E2DED6769199AF3461781E75122AE135135'
-        right = '931EA22157E1871643FA9519676DED253A342B0C.4E95A5DBD2F1F511DCC1209DF56CB77693CE0EAE'
+    def test_92(self):
+        wrong = "F9F9B6E6FD47029957AB911A964CC20D95A181A5D37A2DBEFD67D403DB0E8BE4F4910053E4E8A79.0B70B.0B80B8"
+        right = "69B6E6FD47029957AB911A9F4CC20D95A181A5D3.A2DBEFD67D403DB0E8BE4F4910053E4E8A7980B7"
+        self.assertEqual(sig(wrong), right)
+
+    def test_90(self):
+        wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'`"
+        right = "mrtyuioplkjhgfdsazxcvbne1234567890QWER[YUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={`]}|"
          self.assertEqual(sig(wrong), right)
  
      def test_88(self):
@@ -25,17 +32,17 @@ class TestYoutubeSig(unittest.TestCase):
  
      def test_87(self):
          wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$^&*()_-+={[]}|:;?/>.<"
-        right = "!?;:|}][{=+-_)(*&^$#@/MNBVCXZASqFGHJKLPOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytr"
+        right = "tyuioplkjhgfdsazxcv<nm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$^&*()_-+={[]}|:;?/>"
          self.assertEqual(sig(wrong), right)
  
      def test_86(self):
          wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<"
-        right = "ertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!/#$%^&*()_-+={[|};?@"
+        right = ">.1}|[{=+-_)(*&^%$#@!MNBVCXZASDFGHJK<POIUYTREW509876L432/mnbvcxzasdfghjklpoiuytre"
          self.assertEqual(sig(wrong), right)
  
      def test_85(self):
          wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?/>.<"
-        right = "{>/?;}[.=+-_)(*&^%$#@!MqBVCXZASDFwHJKLPOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytr"
+        right = "ertyuiqplkjhgfdsazx$vbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#<%^&*()_-+={[};?/c"
          self.assertEqual(sig(wrong), right)
  
      def test_84(self):
@@ -45,7 +52,7 @@ class TestYoutubeSig(unittest.TestCase):
  
      def test_83(self):
          wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<"
-        right = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>"
+        right = "qwertyuioplkjhg>dsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/f"
          self.assertEqual(sig(wrong), right)
  
      def test_82(self):
@@ -53,5 +60,20 @@ class TestYoutubeSig(unittest.TestCase):
          right = "Q>/?;}[{=+-(*<^%$#@!MNBVCXZASDFGHKLPOIUY8REWT0q&7654321mnbvcxzasdfghjklpoiuytrew9"
          self.assertEqual(sig(wrong), right)
  
+    def test_81(self):
+        wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>."
+        right = "C>/?;}[{=+-(*&^%$#@!MNBVYXZASDFGHKLPOIU.TREWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"
+        self.assertEqual(sig(wrong), right)
+
+    def test_79(self):
+        wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/"
+        right = "Z?;}[{=+-(*&^%$#@!MNBVCXRASDFGHKLPOIUYT/EWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"
+        self.assertEqual(sig(wrong), right)
+    
+    def test_86_age_gate(self):
+        wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<"
+        right = "ertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!/#$%^&*()_-+={[|};?@"
+        self.assertEqual(sig_age_gate(wrong), right)
+
  if __name__ == '__main__':
      unittest.main()
diff --git a/youtube-dl b/youtube-dl

index 982454368362308e844a4969f985c4ba52357b12..39baeee938d5f6bcdbd4a1f049eb2b7619d5598a 100755 (executable)

Binary files a/youtube-dl and b/youtube-dl differ
diff --git a/youtube-dl.1 b/youtube-dl.1

index 001c05e2fc14dfce138a17a03e4a5c41fa5f9f1f..e2ea5b8c6304983868a61453d047c2865a3be4ad 100644 (file)
--- a/youtube-dl.1
+++ b/youtube-dl.1
@@ -20,7 +20,9 @@ redistribute it or use it however you like.
  \f[C]
  \-h,\ \-\-help\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ print\ this\ help\ text\ and\ exit
  \-\-version\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ print\ program\ version\ and\ exit
-\-U,\ \-\-update\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ update\ this\ program\ to\ latest\ version
+\-U,\ \-\-update\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ update\ this\ program\ to\ latest\ version.\ Make\ sure
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ that\ you\ have\ sufficient\ permissions\ (run\ with
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ sudo\ if\ needed)
  \-i,\ \-\-ignore\-errors\ \ \ \ \ \ \ \ continue\ on\ download\ errors
  \-\-dump\-user\-agent\ \ \ \ \ \ \ \ \ \ display\ the\ current\ browser\ identification
  \-\-user\-agent\ UA\ \ \ \ \ \ \ \ \ \ \ \ specify\ a\ custom\ user\ agent
diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py

index 155895fe26bb13c11d0e5ae5cec5379a911460df..ea6b9d626efa7a18eafe20afa8c473d1afee315b 100644 (file)
--- a/youtube_dl/FileDownloader.py
+++ b/youtube_dl/FileDownloader.py
@@ -329,6 +329,35 @@ class FileDownloader(object):
              self.report_error(u'mplayer exited with code %d' % retval)
              return False
  
+    def _download_m3u8_with_ffmpeg(self, filename, url):
+        self.report_destination(filename)
+        tmpfilename = self.temp_name(filename)
+
+        args = ['ffmpeg', '-y', '-i', url, '-f', 'mp4', tmpfilename]
+        # Check for ffmpeg first
+        try:
+            subprocess.call(['ffmpeg', '-h'], stdout=(open(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
+        except (OSError, IOError):
+            self.report_error(u'm3u8 download detected but "%s" could not be run' % args[0] )
+            return False
+
+        retval = subprocess.call(args)
+        if retval == 0:
+            fsize = os.path.getsize(encodeFilename(tmpfilename))
+            self.to_screen(u'\r[%s] %s bytes' % (args[0], fsize))
+            self.try_rename(tmpfilename, filename)
+            self._hook_progress({
+                'downloaded_bytes': fsize,
+                'total_bytes': fsize,
+                'filename': filename,
+                'status': 'finished',
+            })
+            return True
+        else:
+            self.to_stderr(u"\n")
+            self.report_error(u'ffmpeg exited with code %d' % retval)
+            return False
+
  
      def _do_download(self, filename, info_dict):
          url = info_dict['url']
@@ -354,6 +383,10 @@ class FileDownloader(object):
          if url.startswith('mms') or url.startswith('rtsp'):
              return self._download_with_mplayer(filename, url)
  
+        # m3u8 manifests are downloaded with ffmpeg
+        if determine_ext(url) == u'm3u8':
+            return self._download_m3u8_with_ffmpeg(filename, url)
+
          tmpfilename = self.temp_name(filename)
          stream = None
  
diff --git a/youtube_dl/PostProcessor.py b/youtube_dl/PostProcessor.py

index 8c5e5399177458d9cb7e0fe8ffa5b3c873634729..fddf58606015b92cc21a9f89818c90852c365e83 100644 (file)
--- a/youtube_dl/PostProcessor.py
+++ b/youtube_dl/PostProcessor.py
@@ -100,7 +100,8 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):
          self._nopostoverwrites = nopostoverwrites
  
      def get_audio_codec(self, path):
-        if not self._exes['ffprobe'] and not self._exes['avprobe']: return None
+        if not self._exes['ffprobe'] and not self._exes['avprobe']:
+            raise PostProcessingError(u'ffprobe or avprobe not found. Please install one.')
          try:
              cmd = [self._exes['avprobe'] or self._exes['ffprobe'], '-show_streams', encodeFilename(self._ffmpeg_filename_argument(path))]
              handle = subprocess.Popen(cmd, stderr=compat_subprocess_get_DEVNULL(), stdout=subprocess.PIPE)
@@ -208,7 +209,7 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):
              try:
                  os.utime(encodeFilename(new_path), (time.time(), information['filetime']))
              except:
-                self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
+                self._downloader.report_warning(u'Cannot update utime of audio file')
  
          information['filepath'] = new_path
          return self._nopostoverwrites,information
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py

index d3281fed25c8a6b7d2476b662167691c93322c3a..4968669002a8edd0297a5cc3ba1ddddd3d59df49 100644 (file)
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -348,6 +348,7 @@ class YoutubeDL(object):
  
          result_type = ie_result.get('_type', 'video') # If not given we suppose it's a video, support the default old system
          if result_type == 'video':
+            ie_result.update(extra_info)
              if 'playlist' not in ie_result:
                  # It isn't part of a playlist
                  ie_result['playlist'] = None
@@ -528,10 +529,8 @@ class YoutubeDL(object):
                  return
  
          if self.params.get('writethumbnail', False):
-            if 'thumbnail' in info_dict:
-                thumb_format = info_dict['thumbnail'].rpartition(u'/')[2].rpartition(u'.')[2]
-                if not thumb_format:
-                    thumb_format = 'jpg'
+            if info_dict.get('thumbnail') is not None:
+                thumb_format = determine_ext(info_dict['thumbnail'], u'jpg')
                  thumb_filename = filename.rpartition('.')[0] + u'.' + thumb_format
                  self.to_screen(u'[%s] %s: Downloading thumbnail ...' %
                                 (info_dict['extractor'], info_dict['id']))
@@ -595,7 +594,7 @@ class YoutubeDL(object):
                          # No clear decision yet, let IE decide
                          keep_video = keep_video_wish
              except PostProcessingError as e:
-                self.to_stderr(u'ERROR: ' + e.msg)
+                self.report_error(e.msg)
          if keep_video is False and not self.params.get('keepvideo', False):
              try:
                  self.to_screen(u'Deleting original file %s (pass -k to keep)' % filename)
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py

index db63d0adb7a68f3f1f1eb39e5da7dd4a38937ebf..bf040aacd79fb16a19c4d4e688c1d628faf65a9f 100644 (file)
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -129,7 +129,7 @@ def parseOpts(overrideArguments=None):
      general.add_option('-v', '--version',
              action='version', help='print program version and exit')
      general.add_option('-U', '--update',
-            action='store_true', dest='update_self', help='update this program to latest version')
+            action='store_true', dest='update_self', help='update this program to latest version. Make sure that you have sufficient permissions (run with sudo if needed)')
      general.add_option('-i', '--ignore-errors',
              action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
      general.add_option('--dump-user-agent',
@@ -398,6 +398,8 @@ def _real_main(argv=None):
              batchurls = batchfd.readlines()
              batchurls = [x.strip() for x in batchurls]
              batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
+            if opts.verbose:
+                sys.stderr.write(u'[debug] Batch file urls: ' + repr(batchurls) + u'\n')
          except IOError:
              sys.exit(u'ERROR: batch file could not be read')
      all_urls = batchurls + args
@@ -580,7 +582,7 @@ def _real_main(argv=None):
          })
  
      if opts.verbose:
-        ydl.to_screen(u'[debug] youtube-dl version ' + __version__)
+        sys.stderr.write(u'[debug] youtube-dl version ' + __version__ + u'\n')
          try:
              sp = subprocess.Popen(
                  ['git', 'rev-parse', '--short', 'HEAD'],
@@ -589,11 +591,14 @@ def _real_main(argv=None):
              out, err = sp.communicate()
              out = out.decode().strip()
              if re.match('[0-9a-f]+', out):
-                ydl.to_screen(u'[debug] Git HEAD: ' + out)
+                sys.stderr.write(u'[debug] Git HEAD: ' + out + u'\n')
          except:
-            sys.exc_clear()
-        ydl.to_screen(u'[debug] Python version %s - %s' %(platform.python_version(), platform.platform()))
-        ydl.to_screen(u'[debug] Proxy map: ' + str(proxy_handler.proxies))
+            try:
+                sys.exc_clear()
+            except:
+                pass
+        sys.stderr.write(u'[debug] Python version %s - %s' %(platform.python_version(), platform.platform()) + u'\n')
+        sys.stderr.write(u'[debug] Proxy map: ' + str(proxy_handler.proxies) + u'\n')
  
      ydl.add_default_info_extractors()
  
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py

index 934419c4343a9d078ceffa7c91be9a21b4600888..c20172a53a0372c09810b1a0ba1c0d99c8899d7c 100644 (file)
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -6,17 +6,23 @@ from .bandcamp import BandcampIE
  from .bliptv import BlipTVIE, BlipTVUserIE
  from .breakcom import BreakIE
  from .brightcove import BrightcoveIE
+from .canalplus import CanalplusIE
  from .collegehumor import CollegeHumorIE
  from .comedycentral import ComedyCentralIE
+from .condenast import CondeNastIE
+from .criterion import CriterionIE
  from .cspan import CSpanIE
-from .dailymotion import DailymotionIE
+from .dailymotion import DailymotionIE, DailymotionPlaylistIE
  from .depositfiles import DepositFilesIE
  from .dotsub import DotsubIE
  from .dreisat import DreiSatIE
+from .ehow import EHowIE
  from .eighttracks import EightTracksIE
  from .escapist import EscapistIE
+from .exfm import ExfmIE
  from .facebook import FacebookIE
  from .flickr import FlickrIE
+from .freesound import FreesoundIE
  from .funnyordie import FunnyOrDieIE
  from .gamespot import GameSpotIE
  from .gametrailers import GametrailersIE
@@ -26,13 +32,16 @@ from .googlesearch import GoogleSearchIE
  from .hotnewhiphop import HotNewHipHopIE
  from .howcast import HowcastIE
  from .hypem import HypemIE
+from .ign import IGNIE, OneUPIE
  from .ina import InaIE
  from .infoq import InfoQIE
  from .instagram import InstagramIE
  from .jukebox import JukeboxIE
  from .justintv import JustinTVIE
+from .kankan import KankanIE
  from .keek import KeekIE
  from .liveleak import LiveLeakIE
+from .livestream import LivestreamIE
  from .metacafe import MetacafeIE
  from .mixcloud import MixcloudIE
  from .mtv import MTVIE
@@ -44,6 +53,8 @@ from .pornotube import PornotubeIE
  from .rbmaradio import RBMARadioIE
  from .redtube import RedTubeIE
  from .ringtv import RingTVIE
+from .roxwel import RoxwelIE
+from .sina import SinaIE
  from .soundcloud import SoundcloudIE, SoundcloudSetIE
  from .spiegel import SpiegelIE
  from .stanfordoc import StanfordOpenClassroomIE
@@ -52,6 +63,7 @@ from .steam import SteamIE
  from .teamcoco import TeamcocoIE
  from .ted import TEDIE
  from .tf1 import TF1IE
+from .thisav import ThisAVIE
  from .traileraddict import TrailerAddictIE
  from .tudou import TudouIE
  from .tumblr import TumblrIE
@@ -60,9 +72,11 @@ from .ustream import UstreamIE
  from .vbox7 import Vbox7IE
  from .veoh import VeohIE
  from .vevo import VevoIE
-from .vimeo import VimeoIE
+from .vimeo import VimeoIE, VimeoChannelIE
  from .vine import VineIE
+from .c56 import C56IE
  from .wat import WatIE
+from .weibo import WeiboIE
  from .wimp import WimpIE
  from .worldstarhiphop import WorldStarHipHopIE
  from .xhamster import XHamsterIE
@@ -80,6 +94,9 @@ from .youtube import (
      YoutubeChannelIE,
      YoutubeShowIE,
      YoutubeSubscriptionsIE,
+    YoutubeRecommendedIE,
+    YoutubeWatchLaterIE,
+    YoutubeFavouritesIE,
  )
  from .zdf import ZDFIE
  
diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py

index 29cb9bdee1e032fc6c316a4b6806a22f55ffb662..7efd1d82324c5397bb6d6f10e1bfa993a2531584 100644 (file)
--- a/youtube_dl/extractor/archiveorg.py
+++ b/youtube_dl/extractor/archiveorg.py
@@ -48,6 +48,7 @@ class ArchiveOrgIE(InfoExtractor):
          formats.sort(key=lambda fdata: fdata['file_size'])
  
          info = {
+            '_type': 'video',
              'id': video_id,
              'title': title,
              'formats': formats,
@@ -63,4 +64,4 @@ class ArchiveOrgIE(InfoExtractor):
          info['url'] = formats[-1]['url']
          info['ext'] = determine_ext(formats[-1]['url'])
  
-        return self.video_result(info)
-\ No newline at end of file
+        return info
+\ No newline at end of file
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py

index e7a91a1eb5e835c9b6e8bd9f16302a9bc7a8bf90..18d5916589b239c9cf12a7caa629efeea784ce86 100644 (file)
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -5,6 +5,7 @@ import xml.etree.ElementTree
  from .common import InfoExtractor
  from ..utils import (
      ExtractorError,
+    find_xpath_attr,
      unified_strdate,
  )
  
@@ -97,7 +98,7 @@ class ArteTvIE(InfoExtractor):
                  l = 'F'
              elif lang == 'de':
                  l = 'A'
-            regexes = [r'VO?%s' % l, r'V%s-ST.' % l]
+            regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l]
              return any(re.match(r, f['versionCode']) for r in regexes)
          # Some formats may not be in the same language as the url
          formats = filter(_match_lang, formats)
@@ -119,7 +120,7 @@ class ArteTvIE(InfoExtractor):
          ref_xml_url = ref_xml_url.replace('.html', ',view,asPlayerXml.xml')
          ref_xml = self._download_webpage(ref_xml_url, video_id, note=u'Downloading metadata')
          ref_xml_doc = xml.etree.ElementTree.fromstring(ref_xml)
-        config_node = ref_xml_doc.find('.//video[@lang="%s"]' % lang)
+        config_node = find_xpath_attr(ref_xml_doc, './/video', 'lang', lang)
          config_xml_url = config_node.attrib['ref']
          config_xml = self._download_webpage(config_xml_url, video_id, note=u'Downloading configuration')
  
diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py

index 34f555e891e1b6c1c079ef54ab976c53694b5daa..53a898de3707ce9a2f235d95e1d7fa0be58edb20 100644 (file)
--- a/youtube_dl/extractor/breakcom.py
+++ b/youtube_dl/extractor/breakcom.py
@@ -1,6 +1,8 @@
  import re
+import json
  
  from .common import InfoExtractor
+from ..utils import determine_ext
  
  
  class BreakIE(InfoExtractor):
@@ -17,17 +19,20 @@ class BreakIE(InfoExtractor):
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
          video_id = mobj.group(1).split("-")[-1]
-        webpage = self._download_webpage(url, video_id)
-        video_url = re.search(r"videoPath: '(.+?)',",webpage).group(1)
-        key = re.search(r"icon: '(.+?)',",webpage).group(1)
-        final_url = str(video_url)+"?"+str(key)
-        thumbnail_url = re.search(r"thumbnailURL: '(.+?)'",webpage).group(1)
-        title = re.search(r"sVidTitle: '(.+)',",webpage).group(1)
-        ext = video_url.split('.')[-1]
+        embed_url = 'http://www.break.com/embed/%s' % video_id
+        webpage = self._download_webpage(embed_url, video_id)
+        info_json = self._search_regex(r'var embedVars = ({.*?});', webpage,
+                                       u'info json', flags=re.DOTALL)
+        info = json.loads(info_json)
+        video_url = info['videoUri']
+        m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', video_url)
+        if m_youtube is not None:
+            return self.url_result(m_youtube.group(1), 'Youtube')
+        final_url = video_url + '?' + info['AuthToken']
          return [{
              'id':        video_id,
              'url':       final_url,
-            'ext':       ext,
-            'title':     title,
-            'thumbnail': thumbnail_url,
+            'ext':       determine_ext(final_url),
+            'title':     info['contentName'],
+            'thumbnail': info['thumbUri'],
          }]
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py

index f85acbb5db3dcb8e68b1e6e19f4eb91095fa6cfd..71e3c7883338154eea0c3d369a0fdd0bee828e26 100644 (file)
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -1,28 +1,82 @@
  import re
  import json
+import xml.etree.ElementTree
  
  from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_parse,
+    find_xpath_attr,
+    compat_urlparse,
+)
  
  class BrightcoveIE(InfoExtractor):
-    _VALID_URL = r'http://.*brightcove\.com/.*\?(?P<query>.*videoPlayer=(?P<id>\d*).*)'
+    _VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*\?(?P<query>.*)'
+    _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s'
+    _PLAYLIST_URL_TEMPLATE = 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=%s'
+    
+    # There is a test for Brightcove in GenericIE; that way we test both the download
+    # and the detection of videos, and we don't have to find a URL that is always valid
+
+    @classmethod
+    def _build_brighcove_url(cls, object_str):
+        """
+        Build a Brightcove URL from an XML string containing
+        <object class="BrightcoveExperience">{params}</object>
+        """
+        object_doc = xml.etree.ElementTree.fromstring(object_str)
+        assert u'BrightcoveExperience' in object_doc.attrib['class']
+        params = {'flashID': object_doc.attrib['id'],
+                  'playerID': find_xpath_attr(object_doc, './param', 'name', 'playerID').attrib['value'],
+                  }
+        playerKey = find_xpath_attr(object_doc, './param', 'name', 'playerKey')
+        # Not all pages define this value
+        if playerKey is not None:
+            params['playerKey'] = playerKey.attrib['value']
+        videoPlayer = find_xpath_attr(object_doc, './param', 'name', '@videoPlayer')
+        if videoPlayer is not None:
+            params['@videoPlayer'] = videoPlayer.attrib['value']
+        data = compat_urllib_parse.urlencode(params)
+        return cls._FEDERATED_URL_TEMPLATE % data
  
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
-        query = mobj.group('query')
-        video_id = mobj.group('id')
+        query_str = mobj.group('query')
+        query = compat_urlparse.parse_qs(query_str)
+
+        videoPlayer = query.get('@videoPlayer')
+        if videoPlayer:
+            return self._get_video_info(videoPlayer[0], query_str)
+        else:
+            player_key = query['playerKey']
+            return self._get_playlist_info(player_key[0])
  
-        request_url = 'http://c.brightcove.com/services/viewer/htmlFederated?%s' % query
+    def _get_video_info(self, video_id, query):
+        request_url = self._FEDERATED_URL_TEMPLATE % query
          webpage = self._download_webpage(request_url, video_id)
  
          self.report_extraction(video_id)
          info = self._search_regex(r'var experienceJSON = ({.*?});', webpage, 'json')
          info = json.loads(info)['data']
          video_info = info['programmedContent']['videoPlayer']['mediaDTO']
+
+        return self._extract_video_info(video_info)
+
+    def _get_playlist_info(self, player_key):
+        playlist_info = self._download_webpage(self._PLAYLIST_URL_TEMPLATE % player_key,
+                                               player_key, u'Downloading playlist information')
+
+        playlist_info = json.loads(playlist_info)['videoList']
+        videos = [self._extract_video_info(video_info) for video_info in playlist_info['mediaCollectionDTO']['videoDTOs']]
+
+        return self.playlist_result(videos, playlist_id=playlist_info['id'],
+                                    playlist_title=playlist_info['mediaCollectionDTO']['displayName'])
+
+    def _extract_video_info(self, video_info):
          renditions = video_info['renditions']
          renditions = sorted(renditions, key=lambda r: r['size'])
          best_format = renditions[-1]
-        
-        return {'id': video_id,
+
+        return {'id': video_info['id'],
                  'title': video_info['displayName'],
                  'url': best_format['defaultURL'], 
                  'ext': 'mp4',
diff --git a/youtube_dl/extractor/c56.py b/youtube_dl/extractor/c56.py

new file mode 100644 (file)

index 0000000..4c8a8af
--- /dev/null
+++ b/youtube_dl/extractor/c56.py
@@ -0,0 +1,36 @@
+# coding: utf-8
+
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import determine_ext
+
+class C56IE(InfoExtractor):
+    """Information extractor for videos hosted on 56.com."""
+
+    _VALID_URL = r'https?://((www|player)\.)?56\.com/(.+?/)?(v_|(play_album.+-))(?P<textid>.+?)\.(html|swf)'
+    IE_NAME = u'56.com'
+
+    _TEST = {
+        u'url': u'http://www.56.com/u39/v_OTM0NDA3MTY.html',
+        u'file': u'93440716.mp4',
+        u'md5': u'9dc07b5c8e978112a6441f9e75d2b59e',
+        u'info_dict': {
+            u'title': u'网事知多少 第32期：车怒',
+        },
+    }
+
+    def _real_extract(self, url):
+        """Return the info dict of the video that `url` points to.
+
+        The text id taken from the URL is used to download a JSON document
+        describing the video; the entry of ``rfiles`` with the largest
+        ``filesize`` provides the download URL.
+        """
+        # The pattern has neither whitespace nor '#', so re.VERBOSE does not
+        # alter how it matches.
+        text_id = re.match(self._VALID_URL, url, flags=re.VERBOSE).group('textid')
+        json_url = 'http://vxml.56.com/json/%s/' % text_id
+        info_page = self._download_webpage(json_url, text_id,
+                                           u'Downloading video info')
+        info = json.loads(info_page)['info']
+
+        # Ascending sort by size: the biggest file ends up last.
+        by_size = sorted(info['rfiles'], key=lambda rfile: int(rfile['filesize']))
+        video_url = by_size[-1]['url']
+
+        return {
+            'id': info['vid'],
+            'title': info['Subject'],
+            'url': video_url,
+            'ext': determine_ext(video_url),
+            'thumbnail': info.get('bimg') or info.get('img'),
+        }
diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py

new file mode 100644 (file)

index 0000000..3b1c888
--- /dev/null
+++ b/youtube_dl/extractor/canalplus.py
@@ -0,0 +1,46 @@
+import re
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import unified_strdate
+
+class CanalplusIE(InfoExtractor):
+    """Information extractor for canalplus.fr pages that carry a ``vid`` query parameter."""
+
+    _VALID_URL = r'https?://www\.canalplus\.fr/.*?\?vid=(?P<id>\d+)'
+    _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s'
+    IE_NAME = u'canalplus.fr'
+
+    _TEST = {
+        u'url': u'http://www.canalplus.fr/c-divertissement/pid3351-c-le-petit-journal.html?vid=889861',
+        u'file': u'889861.flv',
+        u'md5': u'590a888158b5f0d6832f84001fbf3e99',
+        u'info_dict': {
+            u'title': u'Le Petit Journal 20/06/13 - La guerre des drone',
+            u'upload_date': u'20130620',
+        },
+        u'skip': u'Requires rtmpdump'
+    }
+
+    def _real_extract(self, url):
+        """Return the info dict of the video whose id is given in `url`.
+
+        An XML document is downloaded for the id; the child element whose
+        ``ID`` equals the requested id supplies title, date, thumbnail and
+        the video URL.
+        """
+        video_id = re.match(self._VALID_URL, url).group('id')
+        info_page = self._download_webpage(self._VIDEO_INFO_TEMPLATE % video_id,
+                                           video_id, u'Downloading video info')
+
+        self.report_extraction(video_id)
+        doc = xml.etree.ElementTree.fromstring(info_page.encode('utf-8'))
+        # The document may hold several video nodes; keep the one that was asked for.
+        matching = [node for node in doc if node.find('ID').text == video_id]
+        video_info = matching[0]
+        infos = video_info.find('INFOS')
+        media = video_info.find('MEDIA')
+
+        # All three variants are looked up first; of those present, the one
+        # listed last is used (presumably the order is lowest to highest quality).
+        variant_nodes = []
+        for variant in ['BAS_DEBIT', 'HAUT_DEBIT', 'HD']:
+            variant_nodes.append(media.find('VIDEOS/%s' % variant))
+        available_urls = [node.text for node in variant_nodes if node is not None]
+        video_url = available_urls[-1]
+
+        return {
+            'id': video_id,
+            'title': u'%s - %s' % (infos.find('TITRAGE/TITRE').text,
+                                   infos.find('TITRAGE/SOUS_TITRE').text),
+            'url': video_url,
+            'ext': 'flv',
+            'upload_date': unified_strdate(infos.find('PUBLICATION/DATE').text),
+            'thumbnail': media.find('IMAGES/GRAND').text,
+        }
diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py

index 7ae0972e501ef641ea5e1eb43ec9124fba5bc39c..5badde03a028b80c7ec19a6329da9753310a227a 100644 (file)
--- a/youtube_dl/extractor/collegehumor.py
+++ b/youtube_dl/extractor/collegehumor.py
@@ -1,26 +1,26 @@
  import re
-import socket
  import xml.etree.ElementTree
  
  from .common import InfoExtractor
  from ..utils import (
-    compat_http_client,
-    compat_str,
-    compat_urllib_error,
      compat_urllib_parse_urlparse,
-    compat_urllib_request,
  
      ExtractorError,
  )
  
  
  class CollegeHumorIE(InfoExtractor):
-    _WORKING = False
-    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
+    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/(video|embed|e)/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
  
-    def report_manifest(self, video_id):
-        """Report information extraction."""
-        self.to_screen(u'%s: Downloading XML manifest' % video_id)
+    _TEST = {
+        u'url': u'http://www.collegehumor.com/video/6902724/comic-con-cosplay-catastrophe',
+        u'file': u'6902724.mp4',
+        u'md5': u'1264c12ad95dca142a9f0bf7968105a0',
+        u'info_dict': {
+            u'title': u'Comic-Con Cosplay Catastrophe',
+            u'description': u'Fans get creative this year at San Diego.  Too creative.  And yes, that\'s really Joss Whedon.',
+        },
+    }
  
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
@@ -36,14 +36,16 @@ class CollegeHumorIE(InfoExtractor):
  
          self.report_extraction(video_id)
          xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
-        try:
-            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
+        metaXml = self._download_webpage(xmlUrl, video_id,
+                                         u'Downloading info XML',
+                                         u'Unable to download video info XML')
  
          mdoc = xml.etree.ElementTree.fromstring(metaXml)
          try:
              videoNode = mdoc.findall('./video')[0]
+            youtubeIdNode = videoNode.find('./youtubeID')
+            if youtubeIdNode is not None:
+                return self.url_result(youtubeIdNode.text, 'Youtube')
              info['description'] = videoNode.findall('./description')[0].text
              info['title'] = videoNode.findall('./caption')[0].text
              info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
@@ -52,11 +54,9 @@ class CollegeHumorIE(InfoExtractor):
              raise ExtractorError(u'Invalid metadata XML file')
  
          manifest_url += '?hdcore=2.10.3'
-        self.report_manifest(video_id)
-        try:
-            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
+        manifestXml = self._download_webpage(manifest_url, video_id,
+                                             u'Downloading XML manifest',
+                                             u'Unable to download video info XML')
  
          adoc = xml.etree.ElementTree.fromstring(manifestXml)
          try:
@@ -66,9 +66,8 @@ class CollegeHumorIE(InfoExtractor):
          except IndexError as err:
              raise ExtractorError(u'Invalid manifest file')
  
-        url_pr = compat_urllib_parse_urlparse(manifest_url)
-        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
+        url_pr = compat_urllib_parse_urlparse(info['thumbnail'])
  
-        info['url'] = url
-        info['ext'] = 'f4f'
+        info['url'] = url_pr.scheme + '://' + url_pr.netloc + video_id[:-2].replace('.csmil','').replace(',','')
+        info['ext'] = 'mp4'
          return [info]
diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py

index 93d9e3d5e6ab96404034b83c10bcc4a28a87d78c..bf8d711eea44c8d60855f458407391d66ef2664d 100644 (file)
--- a/youtube_dl/extractor/comedycentral.py
+++ b/youtube_dl/extractor/comedycentral.py
@@ -24,7 +24,9 @@ class ComedyCentralIE(InfoExtractor):
                           (full-episodes/(?P<episode>.*)|
                            (?P<clip>
                                (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
-                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
+                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))|
+                          (?P<interview>
+                              extended-interviews/(?P<interID>[0-9]+)/playlist_tds_extended_(?P<interview_title>.*?)/.*?)))
                       $"""
      _TEST = {
          u'url': u'http://www.thedailyshow.com/watch/thu-december-13-2012/kristen-stewart',
@@ -87,6 +89,9 @@ class ComedyCentralIE(InfoExtractor):
              else:
                  epTitle = mobj.group('cntitle')
              dlNewest = False
+        elif mobj.group('interview'):
+            epTitle = mobj.group('interview_title')
+            dlNewest = False
          else:
              dlNewest = not mobj.group('episode')
              if dlNewest:
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index 1d98222ce6518398fd2a1381100400d5bacb99c0..da50abfc1cd492b8d360ef601b44841a938c055b 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -14,6 +14,7 @@ from ..utils import (
      clean_html,
      compiled_regex_type,
      ExtractorError,
+    unescapeHTML,
  )
  
  class InfoExtractor(object):
@@ -125,6 +126,11 @@ class InfoExtractor(object):
  
      def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
          """ Returns a tuple (page content as string, URL handle) """
+
+        # Strip hashes from the URL (#1038)
+        if isinstance(url_or_request, (compat_str, str)):
+            url_or_request = url_or_request.partition('#')[0]
+
          urlh = self._request_webpage(url_or_request, video_id, note, errnote)
          content_type = urlh.headers.get('Content-Type', '')
          m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
@@ -169,11 +175,6 @@ class InfoExtractor(object):
          self.to_screen(u'Logging in')
  
      #Methods for following #608
-    #They set the correct value of the '_type' key
-    def video_result(self, video_info):
-        """Returns a video"""
-        video_info['_type'] = 'video'
-        return video_info
      def url_result(self, url, ie=None):
          """Returns a url that points to a page that should be processed"""
          #TODO: ie should be the class used for getting the info
@@ -262,6 +263,31 @@ class InfoExtractor(object):
          
          return (username, password)
  
+    # Helper functions for extracting OpenGraph info
+    @staticmethod
+    def _og_regex(prop):
+        """Return a regex that matches the <meta> tag of the OpenGraph property `prop`.
+
+        The content value is captured in group 1 when it is double-quoted
+        and in group 2 when it is single-quoted.
+        """
+        escaped_prop = re.escape(prop)
+        return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % escaped_prop
+
+    def _og_search_property(self, prop, html, name=None, **kargs):
+        """Search `html` for the OpenGraph property `prop` and return its unescaped content.
+
+        `name` is the label handed to _search_regex; it defaults to
+        'OpenGraph <prop>'.  Extra keyword arguments are forwarded to
+        _search_regex.  None is returned when the search yields None,
+        which the non-fatal wrappers (fatal=False) rely on.
+        """
+        if name is None:
+            name = 'OpenGraph %s' % prop
+        escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
+        # Nothing to unescape when a non-fatal search came back empty.
+        if escaped is None:
+            return None
+        return unescapeHTML(escaped)
+
+    def _og_search_thumbnail(self, html, **kargs):
+        """Look up og:image in `html`, labelled 'thumbnail url', passing fatal=False."""
+        thumbnail = self._og_search_property('image', html, u'thumbnail url',
+                                             fatal=False, **kargs)
+        return thumbnail
+
+    def _og_search_description(self, html, **kargs):
+        """Look up og:description in `html`, passing fatal=False."""
+        description = self._og_search_property('description', html,
+                                               fatal=False, **kargs)
+        return description
+
+    def _og_search_title(self, html, **kargs):
+        """Look up og:title in `html`; keyword arguments go to _og_search_property."""
+        title = self._og_search_property('title', html, **kargs)
+        return title
+
+    def _og_search_video_url(self, html, name='video url', **kargs):
+        """Search `html` for the OpenGraph video URL.
+
+        Two patterns are handed to _html_search_regex, in this order:
+        og:video:secure_url, then og:video.
+        """
+        secure_pattern = self._og_regex('video:secure_url')
+        plain_pattern = self._og_regex('video')
+        return self._html_search_regex([secure_pattern, plain_pattern],
+                                       html, name, **kargs)
+
  class SearchInfoExtractor(InfoExtractor):
      """
      Base class for paged search queries extractors.
diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py

new file mode 100644 (file)

index 0000000..f336a3c
--- /dev/null
+++ b/youtube_dl/extractor/condenast.py
@@ -0,0 +1,106 @@
+# coding: utf-8
+
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_parse,
+    orderedSet,
+    compat_urllib_parse_urlparse,
+    compat_urlparse,
+)
+
+
+class CondeNastIE(InfoExtractor):
+    """
+    Condé Nast is a media group, some of its sites use a custom HTML5 player
+    that works the same in all of them.
+    """
+
+    # The keys are the supported sites and the values are the name to be shown
+    # to the user and in the extractor description.
+    _SITES = {'wired': u'WIRED',
+              'gq': u'GQ',
+              'vogue': u'Vogue',
+              'glamour': u'Glamour',
+              'wmagazine': u'W Magazine',
+              'vanityfair': u'Vanity Fair',
+              }
+
+    # The dots of the host name are escaped: an unescaped "." matches any
+    # character and would accept hosts such as "videoXwired.com".
+    _VALID_URL = r'http://(video|www)\.(?P<site>%s)\.com/(?P<type>watch|series|video)/(?P<id>.+)' % '|'.join(_SITES.keys())
+    IE_DESC = u'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values()))
+
+    _TEST = {
+        u'url': u'http://video.wired.com/watch/3d-printed-speakers-lit-with-led',
+        u'file': u'5171b343c2b4c00dd0c1ccb3.mp4',
+        u'md5': u'1921f713ed48aabd715691f774c451f7',
+        u'info_dict': {
+            u'title': u'3D Printed Speakers Lit With LED',
+            u'description': u'Check out these beautiful 3D printed LED speakers.  You can\'t actually buy them, but LumiGeek is working on a board that will let you make you\'re own.',
+        }
+    }
+
+    def _extract_series(self, url, webpage):
+        """Return a playlist made of the videos linked from a series page.
+
+        `url` is only used for its scheme and host, against which the
+        relative "/watch/..." paths found in `webpage` are resolved.
+        Duplicate paths are dropped while keeping their order.
+        """
+        title = self._html_search_regex(r'<div class="cne-series-info">.*?<h1>(.+?)</h1>',
+                                        webpage, u'series title', flags=re.DOTALL)
+        url_object = compat_urllib_parse_urlparse(url)
+        base_url = '%s://%s' % (url_object.scheme, url_object.netloc)
+        m_paths = re.finditer(r'<p class="cne-thumb-title">.*?<a href="(/watch/.+?)["\?]',
+                              webpage, flags=re.DOTALL)
+        paths = orderedSet(m.group(1) for m in m_paths)
+        build_url = lambda path: compat_urlparse.urljoin(base_url, path)
+        entries = [self.url_result(build_url(path), 'CondeNast') for path in paths]
+        return self.playlist_result(entries, playlist_title=title)
+
+    def _extract_video(self, webpage):
+        """Return the info dict of the video embedded in `webpage`.
+
+        The player parameters (videoId, playerId, target) are read from the
+        page and sent to the player loader script, whose ``var video``
+        object describes the available sources.
+        """
+        description = self._html_search_regex([r'<div class="cne-video-description">(.+?)</div>',
+                                               r'<div class="video-post-content">(.+?)</div>',
+                                               ],
+                                              webpage, u'description',
+                                              fatal=False, flags=re.DOTALL)
+        params = self._search_regex(r'var params = {(.+?)}[;,]', webpage,
+                                    u'player params', flags=re.DOTALL)
+        video_id = self._search_regex(r'videoId: [\'"](.+?)[\'"]', params, u'video id')
+        player_id = self._search_regex(r'playerId: [\'"](.+?)[\'"]', params, u'player id')
+        target = self._search_regex(r'target: [\'"](.+?)[\'"]', params, u'target')
+        data = compat_urllib_parse.urlencode({'videoId': video_id,
+                                              'playerId': player_id,
+                                              'target': target,
+                                              })
+        # Fall back to the default loader location when the page does not
+        # announce one.
+        base_info_url = self._search_regex(r'url = [\'"](.+?)[\'"][,;]',
+                                           webpage, u'base info url',
+                                           default='http://player.cnevids.com/player/loader.js?')
+        info_url = base_info_url + data
+        info_page = self._download_webpage(info_url, video_id,
+                                           u'Downloading video info')
+        video_info = self._search_regex(r'var video = ({.+?});', info_page, u'video info')
+        video_info = json.loads(video_info)
+
+        def _formats_sort_key(f):
+            # Quality dominates; among equal qualities mp4 is preferred.
+            type_ord = 1 if f['type'] == 'video/mp4' else 0
+            quality_ord = 1 if f['quality'] == 'high' else 0
+            return (quality_ord, type_ord)
+        # The first element of 'sources' is the list of formats; after the
+        # ascending sort the preferred one is last.
+        best_format = sorted(video_info['sources'][0], key=_formats_sort_key)[-1]
+
+        return {'id': video_id,
+                'url': best_format['src'],
+                'ext': best_format['type'].split('/')[-1],
+                'title': video_info['title'],
+                'thumbnail': video_info['poster_frame'],
+                'description': description,
+                }
+
+    def _real_extract(self, url):
+        """Download the page behind `url` and extract a series playlist or a single video.
+
+        URLs of type 'series' yield a playlist; every other type is
+        handled as a single video.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        site = mobj.group('site')
+        url_type = mobj.group('type')
+        # Named display_id so that the builtin id() is not shadowed.
+        display_id = mobj.group('id')
+
+        self.to_screen(u'Extracting from %s with the Condé Nast extractor' % self._SITES[site])
+        webpage = self._download_webpage(url, display_id)
+
+        if url_type == 'series':
+            return self._extract_series(url, webpage)
+        else:
+            return self._extract_video(webpage)
diff --git a/youtube_dl/extractor/criterion.py b/youtube_dl/extractor/criterion.py

new file mode 100644 (file)

index 0000000..31fe3d5
--- /dev/null
+++ b/youtube_dl/extractor/criterion.py
@@ -0,0 +1,40 @@
+# -*- coding: utf-8 -*-
+
+import re
+
+from .common import InfoExtractor
+from ..utils import determine_ext
+
+class CriterionIE(InfoExtractor):
+    """Information extractor for film pages on criterion.com."""
+
+    _VALID_URL = r'https?://www\.criterion\.com/films/(\d*)-.+'
+    _TEST = {
+        u'url': u'http://www.criterion.com/films/184-le-samourai',
+        u'file': u'184.mp4',
+        u'md5': u'bc51beba55685509883a9a7830919ec3',
+        u'info_dict': {
+            u"title": u"Le Samouraï",
+            u"description" : u'md5:a2b4b116326558149bef81f76dcbb93f',
+        }
+    }
+
+    def _real_extract(self, url):
+        """Return the info dict of the film page at `url`.
+
+        The numeric prefix of the film slug is used as the video id.  Video
+        and thumbnail URLs are read from the page's so.addVariable(...)
+        calls, title and description from its <meta> tags.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group(1)
+        webpage = self._download_webpage(url, video_id)
+
+        # The dot in "so.addVariable" is escaped so that only the literal
+        # call matches instead of any character in that position.
+        final_url = self._search_regex(r'so\.addVariable\("videoURL", "(.+?)"\)\;',
+                                webpage, 'video url')
+        title = self._html_search_regex(r'<meta content="(.+?)" property="og:title" />',
+                                webpage, 'video title')
+        description = self._html_search_regex(r'<meta name="description" content="(.+?)" />',
+                                webpage, 'video description')
+        thumbnail = self._search_regex(r'so\.addVariable\("thumbnailURL", "(.+?)"\)\;',
+                                webpage, 'thumbnail url')
+
+        return {'id': video_id,
+                'url' : final_url,
+                'title': title,
+                'ext': determine_ext(final_url),
+                'description': description,
+                'thumbnail': thumbnail,
+                }
diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py

index a4853279bbfc0bce6517a2b1d032ea1eafe07482..7bf03c584c7388b162c9b3912a4aa0f410ed5b22 100644 (file)
--- a/youtube_dl/extractor/cspan.py
+++ b/youtube_dl/extractor/cspan.py
@@ -34,8 +34,6 @@ class CSpanIE(InfoExtractor):
          description = self._html_search_regex(r'<meta (?:property="og:|name=")description" content="(.*?)"',
                                                webpage, 'description',
                                                flags=re.MULTILINE|re.DOTALL)
-        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.*?)"',
-                                            webpage, 'thumbnail')
  
          url = self._search_regex(r'<string name="URL">(.*?)</string>',
                                   video_info, 'video url')
@@ -49,5 +47,5 @@ class CSpanIE(InfoExtractor):
                  'url': url,
                  'play_path': path,
                  'description': description,
-                'thumbnail': thumbnail,
+                'thumbnail': self._og_search_thumbnail(webpage),
                  }
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py

index 5fd2221a798403ff4832bf6992b8724bdf74f964..fa8c630d053168bf30d835952debd67536555c0c 100644 (file)
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -1,9 +1,12 @@
  import re
  import json
+import itertools
  
  from .common import InfoExtractor
  from ..utils import (
      compat_urllib_request,
+    get_element_by_attribute,
+    get_element_by_id,
  
      ExtractorError,
  )
@@ -39,9 +42,6 @@ class DailymotionIE(InfoExtractor):
          # Extract URL, uploader and title from webpage
          self.report_extraction(video_id)
  
-        video_title = self._html_search_regex(r'<meta property="og:title" content="(.*?)" />',
-                                              webpage, 'title')
-
          video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
                                               # Looking for official user
                                               r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
@@ -76,7 +76,35 @@ class DailymotionIE(InfoExtractor):
              'url':      video_url,
              'uploader': video_uploader,
              'upload_date':  video_upload_date,
-            'title':    video_title,
+            'title':    self._og_search_title(webpage),
              'ext':      video_extension,
              'thumbnail': info['thumbnail_url']
          }]
+
+
+class DailymotionPlaylistIE(InfoExtractor):
+    """Information extractor for Dailymotion playlist pages."""
+
+    _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>.+?)/'
+    _MORE_PAGES_INDICATOR = r'<div class="next">.*?<a.*?href="/playlist/.+?".*?>.*?</a>.*?</div>'
+
+    def _real_extract(self, url):
+        """Collect the video ids of all playlist pages and return a playlist dict.
+
+        Pages are requested one after another, starting at 1, until a page
+        no longer matches _MORE_PAGES_INDICATOR.  The playlist title is read
+        from the last page that was downloaded.
+        """
+        playlist_id = re.match(self._VALID_URL, url).group('id')
+        page_url_template = 'https://www.dailymotion.com/playlist/%s/%s'
+        video_ids = []
+
+        for pagenum in itertools.count(1):
+            webpage = self._download_webpage(page_url_template % (playlist_id, pagenum),
+                                             playlist_id, u'Downloading page %s' % pagenum)
+
+            playlist_el = get_element_by_attribute(u'class', u'video_list', webpage)
+            video_ids += re.findall(r'data-id="(.+?)" data-ext-id', playlist_el)
+
+            has_next_page = re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is not None
+            if not has_next_page:
+                break
+
+        # Each id becomes a URL result that is handed to the Dailymotion extractor.
+        entries = []
+        for video_id in video_ids:
+            video_url = 'http://www.dailymotion.com/video/%s' % video_id
+            entries.append(self.url_result(video_url, 'Dailymotion'))
+
+        return {
+            '_type': 'playlist',
+            'id': playlist_id,
+            'title': get_element_by_id(u'playlist_name', webpage),
+            'entries': entries,
+        }
diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py

index 847f733a78e44a423a7bfa8be3409f3fe01c9365..64b4658053cd98d0313071dccc05548384098ae7 100644 (file)
--- a/youtube_dl/extractor/dreisat.py
+++ b/youtube_dl/extractor/dreisat.py
@@ -67,6 +67,7 @@ class DreiSatIE(InfoExtractor):
          formats.sort(key=_sortkey)
  
          info = {
+            '_type': 'video',
              'id': video_id,
              'title': video_title,
              'formats': formats,
@@ -81,4 +82,4 @@ class DreiSatIE(InfoExtractor):
          info['url'] = formats[-1]['url']
          info['ext'] = determine_ext(formats[-1]['url'])
  
-        return self.video_result(info)
-\ No newline at end of file
+        return info
+\ No newline at end of file
diff --git a/youtube_dl/extractor/ehow.py b/youtube_dl/extractor/ehow.py

new file mode 100644 (file)

index 0000000..2bb77ae
--- /dev/null
+++ b/youtube_dl/extractor/ehow.py
@@ -0,0 +1,46 @@
+import re
+
+from ..utils import (
+    compat_urllib_parse,
+    determine_ext
+)
+from .common import InfoExtractor
+
+
+class EHowIE(InfoExtractor):
+    """Information extractor for eHow pages whose URL ends in ``_<numeric id>``."""
+
+    IE_NAME = u'eHow'
+    _VALID_URL = r'(?:https?://)?(?:www\.)?ehow\.com/[^/_?]*_(?P<id>[0-9]+)'
+    _TEST = {
+        u'url': u'http://www.ehow.com/video_12245069_hardwood-flooring-basics.html',
+        u'file': u'12245069.flv',
+        u'md5': u'9809b4e3f115ae2088440bcb4efbf371',
+        u'info_dict': {
+            u"title": u"Hardwood Flooring Basics",
+            u"description": u"Hardwood flooring may be time consuming, but its ultimately a pretty straightforward concept. Learn about hardwood flooring basics with help from a hardware flooring business owner in this free video...",
+            u"uploader": u"Erick Nathan"
+        }
+    }
+
+    def _real_extract(self, url):
+        """Return the info dict of the video embedded in the page at `url`.
+
+        The media URL is taken from a ``file=`` or ``source=`` parameter in
+        the page and percent-decoded; title, thumbnail and description come
+        from the OpenGraph tags.
+        """
+        video_id = re.match(self._VALID_URL, url).group('id')
+        webpage = self._download_webpage(url, video_id)
+
+        quoted_url = self._search_regex(r'(?:file|source)=(http[^\'"&]*)',
+                                        webpage, u'video URL')
+        final_url = compat_urllib_parse.unquote(quoted_url)
+        uploader = self._search_regex(r'<meta name="uploader" content="(.+?)" />',
+                                      webpage, u'uploader')
+        # The OpenGraph title carries the site name as a suffix; drop it.
+        og_title = self._og_search_title(webpage)
+        title = og_title.replace(' | eHow', '')
+        ext = determine_ext(final_url)
+
+        result = {
+            '_type': 'video',
+            'id': video_id,
+            'url': final_url,
+            'ext': ext,
+            'title': title,
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'description': self._og_search_description(webpage),
+            'uploader': uploader,
+        }
+        return result
+
diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py

index 794460e8459b65130b117b0806e4ef1630160685..3aa2da52c0117bc9926df9c250eeb70da6cc2299 100644 (file)
--- a/youtube_dl/extractor/escapist.py
+++ b/youtube_dl/extractor/escapist.py
@@ -36,11 +36,7 @@ class EscapistIE(InfoExtractor):
          videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
              webpage, u'description', fatal=False)
  
-        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
-            webpage, u'thumbnail', fatal=False)
-
-        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
-            webpage, u'player url')
+        playerUrl = self._og_search_video_url(webpage, name='player url')
  
          title = self._html_search_regex('<meta name="title" content="([^"]*)"',
              webpage, u'player url').split(' : ')[-1]
@@ -70,7 +66,7 @@ class EscapistIE(InfoExtractor):
              'upload_date': None,
              'title': title,
              'ext': 'mp4',
-            'thumbnail': imgUrl,
+            'thumbnail': self._og_search_thumbnail(webpage),
              'description': videoDesc,
              'player_url': playerUrl,
          }
diff --git a/youtube_dl/extractor/exfm.py b/youtube_dl/extractor/exfm.py

new file mode 100644 (file)

index 0000000..3443f19
--- /dev/null
+++ b/youtube_dl/extractor/exfm.py
@@ -0,0 +1,54 @@
+import re
+import json
+
+from .common import InfoExtractor
+
+
+class ExfmIE(InfoExtractor):
+    """Information extractor for songs on ex.fm."""
+
+    IE_NAME = u'exfm'
+    IE_DESC = u'ex.fm'
+    _VALID_URL = r'(?:http://)?(?:www\.)?ex\.fm/song/([^/]+)'
+    # Every dot of the host name is escaped so that only the literal host
+    # api.soundcloud.com is recognised.
+    _SOUNDCLOUD_URL = r'(?:http://)?(?:www\.)?api\.soundcloud\.com/tracks/([^/]+)/stream'
+    _TESTS = [
+        {
+            u'url': u'http://ex.fm/song/1bgtzg',
+            u'file': u'95223130.mp3',
+            u'md5': u'8a7967a3fef10e59a1d6f86240fd41cf',
+            u'info_dict': {
+                u"title": u"We Can't Stop - Miley Cyrus",
+                u"uploader": u"Miley Cyrus",
+                u'upload_date': u'20130603',
+                u'description': u'Download "We Can\'t Stop" \r\niTunes: http://smarturl.it/WeCantStop?IQid=SC\r\nAmazon: http://smarturl.it/WeCantStopAMZ?IQid=SC',
+            },
+            u'note': u'Soundcloud song',
+        },
+        {
+            u'url': u'http://ex.fm/song/wddt8',
+            u'file': u'wddt8.mp3',
+            u'md5': u'966bd70741ac5b8570d8e45bfaed3643',
+            u'info_dict': {
+                u'title': u'Safe and Sound',
+                u'uploader': u'Capital Cities',
+            },
+        },
+    ]
+
+    def _real_extract(self, url):
+        """Return the info of the song at `url`.
+
+        The song description is downloaded as JSON.  When the song URL is a
+        Soundcloud stream, a URL result for the Soundcloud extractor is
+        returned (with the trailing '/stream' removed); otherwise a
+        one-element list holding the info dict is returned.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        song_id = mobj.group(1)
+        info_url = "http://ex.fm/api/v3/song/%s" %(song_id)
+        webpage = self._download_webpage(info_url, song_id)
+        info = json.loads(webpage)
+        song_url = info['song']['url']
+        if re.match(self._SOUNDCLOUD_URL, song_url) is not None:
+            self.to_screen('Soundcloud song detected')
+            return self.url_result(song_url.replace('/stream',''), 'Soundcloud')
+        return [{
+            'id':          song_id,
+            'url':         song_url,
+            'ext':         'mp3',
+            'title':       info['song']['title'],
+            'thumbnail':   info['song']['image']['large'],
+            'uploader':    info['song']['artist'],
+            # NOTE(review): 'view_count' is filled from 'loved_count' - confirm
+            # that this mapping is intended.
+            'view_count':  info['song']['loved_count'],
+        }]
diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py

index bd97bff9a78a9098ae6e5a6d8aa8612683405012..80d96baf739522b97f933878faa8a4083a0e8959 100644 (file)
--- a/youtube_dl/extractor/flickr.py
+++ b/youtube_dl/extractor/flickr.py
@@ -47,21 +47,12 @@ class FlickrIE(InfoExtractor):
              raise ExtractorError(u'Unable to extract video url')
          video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
  
-        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
-            webpage, u'video title')
-
-        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
-            webpage, u'description', fatal=False)
-
-        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
-            webpage, u'thumbnail', fatal=False)
-
          return [{
              'id':          video_id,
              'url':         video_url,
              'ext':         'mp4',
-            'title':       video_title,
-            'description': video_description,
-            'thumbnail':   thumbnail,
+            'title':       self._og_search_title(webpage),
+            'description': self._og_search_description(webpage),
+            'thumbnail':   self._og_search_thumbnail(webpage),
              'uploader_id': video_uploader_id,
          }]
diff --git a/youtube_dl/extractor/freesound.py b/youtube_dl/extractor/freesound.py

new file mode 100644 (file)

index 0000000..de14b12
--- /dev/null
+++ b/youtube_dl/extractor/freesound.py
@@ -0,0 +1,36 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import determine_ext
+
+class FreesoundIE(InfoExtractor):
+    """Information extractor for sound pages on freesound.org."""
+
+    _VALID_URL = r'(?:https?://)?(?:www\.)?freesound\.org/people/([^/]+)/sounds/(?P<id>[^/]+)'
+    _TEST = {
+        u'url': u'http://www.freesound.org/people/miklovan/sounds/194503/',
+        u'file': u'194503.mp3',
+        u'md5': u'12280ceb42c81f19a515c745eae07650',
+        u'info_dict': {
+            u"title": u"gulls in the city.wav",
+            u"uploader" : u"miklovan",
+            u'description': u'the sounds of seagulls in the city',
+        }
+    }
+
+    def _real_extract(self, url):
+        """Return a one-element list with the info dict of the sound at `url`.
+
+        Title and description are read from the page markup; the audio URL
+        and the uploader come from the og:audio and og:audio:artist
+        properties.
+        """
+        music_id = re.match(self._VALID_URL, url).group('id')
+        webpage = self._download_webpage(url, music_id)
+
+        title_pattern = r'<div id="single_sample_header">.*?<a href="#">(.+?)</a>'
+        title = self._html_search_regex(title_pattern, webpage,
+                                        'music title', flags=re.DOTALL)
+        music_url = self._og_search_property('audio', webpage, 'music url')
+        description_pattern = r'<div id="sound_description">(.*?)</div>'
+        description = self._html_search_regex(description_pattern, webpage,
+                                              'description', fatal=False,
+                                              flags=re.DOTALL)
+
+        info = {
+            'id': music_id,
+            'title': title,
+            'url': music_url,
+            'uploader': self._og_search_property('audio:artist', webpage, 'music uploader'),
+            'ext': determine_ext(music_url),
+            'description': description,
+        }
+        return [info]
diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py

index 388aacf2f1b513c0797bc92d27a8217e49628f08..67a7e5f76fc604ae058a8e05842b02feb7fecffe 100644 (file)
--- a/youtube_dl/extractor/funnyordie.py
+++ b/youtube_dl/extractor/funnyordie.py
@@ -27,14 +27,11 @@ class FunnyOrDieIE(InfoExtractor):
          title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
              r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
  
-        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
-            webpage, u'description', fatal=False, flags=re.DOTALL)
-
          info = {
              'id': video_id,
              'url': video_url,
              'ext': 'mp4',
              'title': title,
-            'description': video_description,
+            'description': self._og_search_description(webpage),
          }
          return [info]
diff --git a/youtube_dl/extractor/gametrailers.py b/youtube_dl/extractor/gametrailers.py

index 3ce93b492eac0bb5e1595e453949658e4bf68140..3cc02d97e04aace34e0eb03cccab254f4927f77d 100644 (file)
--- a/youtube_dl/extractor/gametrailers.py
+++ b/youtube_dl/extractor/gametrailers.py
@@ -1,68 +1,36 @@
  import re
  
-from .common import InfoExtractor
-from ..utils import (
-    compat_urllib_parse,
+from .mtv import MTVIE, _media_xml_tag
  
-    ExtractorError,
-)
-
-class GametrailersIE(InfoExtractor):
+class GametrailersIE(MTVIE):
+    """
+    Gametrailers uses the same video system as MTVIE; it only changes the feed
+    URL, where the URI is found and the method used to get the thumbnails.
+    """
      _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
      _TEST = {
          u'url': u'http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer',
-        u'file': u'zbvr8i.flv',
-        u'md5': u'c3edbc995ab4081976e16779bd96a878',
+        u'file': u'70e9a5d7-cf25-4a10-9104-6f3e7342ae0d.mp4',
+        u'md5': u'4c8e67681a0ea7ec241e8c09b3ea8cf7',
          u'info_dict': {
-            u"title": u"E3 2013: Debut Trailer"
+            u'title': u'E3 2013: Debut Trailer',
+            u'description': u'Faith is back!  Check out the World Premiere trailer for Mirror\'s Edge 2 straight from the EA Press Conference at E3 2013!',
          },
-        u'skip': u'Requires rtmpdump'
      }
+    # Overwrite MTVIE properties we don't want
+    _TESTS = []
+
+    _FEED_URL = 'http://www.gametrailers.com/feeds/mrss'
+
+    def _get_thumbnail_url(self, uri, itemdoc):
+        search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail'))
+        return itemdoc.find(search_path).attrib['url']
  
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
          video_id = mobj.group('id')
-        video_type = mobj.group('type')
          webpage = self._download_webpage(url, video_id)
-        if video_type == 'full-episodes':
-            mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
-        else:
-            mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
-        mgid = self._search_regex(mgid_re, webpage, u'mgid')
-        data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})
-
-        info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
-                                           video_id, u'Downloading video info')
-        links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
-                                               video_id, u'Downloading video urls info')
-
-        self.report_extraction(video_id)
-        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
-                      <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
-                      <image>.*
-                        <url>(?P<thumb>.*?)</url>.*
-                      </image>'''
-
-        m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
-        if m_info is None:
-            raise ExtractorError(u'Unable to extract video info')
-        video_title = m_info.group('title')
-        video_description = m_info.group('description')
-        video_thumb = m_info.group('thumb')
-
-        m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
-        if m_urls is None or len(m_urls) == 0:
-            raise ExtractorError(u'Unable to extract video url')
-        # They are sorted from worst to best quality
-        video_url = m_urls[-1].group('url')
-
-        return {'url':         video_url,
-                'id':          video_id,
-                'title':       video_title,
-                # Videos are actually flv not mp4
-                'ext':         'flv',
-                'thumbnail':   video_thumb,
-                'description': video_description,
-                }
+        mgid = self._search_regex([r'data-video="(?P<mgid>mgid:.*?)"',
+                                   r'data-contentId=\'(?P<mgid>mgid:.*?)\''],
+                                  webpage, u'mgid')
+        return self._get_videos_info(mgid)
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py

index 20bc533300aa38d5d8b2d6a13eefee44fe439f72..b633e896c6eb3d5b2158585d01ffba9615fe8f31 100644 (file)
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -1,3 +1,5 @@
+# encoding: utf-8
+
  import os
  import re
  
@@ -9,20 +11,34 @@ from ..utils import (
  
      ExtractorError,
  )
+from .brightcove import BrightcoveIE
  
  class GenericIE(InfoExtractor):
      IE_DESC = u'Generic downloader that works on some sites'
      _VALID_URL = r'.*'
      IE_NAME = u'generic'
-    _TEST = {
-        u'url': u'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
-        u'file': u'13601338388002.mp4',
-        u'md5': u'85b90ccc9d73b4acd9138d3af4c27f89',
-        u'info_dict': {
-            u"uploader": u"www.hodiho.fr", 
-            u"title": u"R\u00e9gis plante sa Jeep"
-        }
-    }
+    _TESTS = [
+        {
+            u'url': u'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
+            u'file': u'13601338388002.mp4',
+            u'md5': u'85b90ccc9d73b4acd9138d3af4c27f89',
+            u'info_dict': {
+                u"uploader": u"www.hodiho.fr", 
+                u"title": u"R\u00e9gis plante sa Jeep"
+            }
+        },
+        {
+            u'url': u'http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/',
+            u'file': u'2371591881001.mp4',
+            u'md5': u'9e80619e0a94663f0bdc849b4566af19',
+            u'note': u'Test Brightcove downloads and detection in GenericIE',
+            u'info_dict': {
+                u'title': u'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”',
+                u'uploader': u'8TV',
+                u'description': u'md5:a950cc4285c43e44d763d036710cd9cd',
+            }
+        },
+    ]
  
      def report_download_webpage(self, video_id):
          """Report webpage download."""
@@ -103,6 +119,13 @@ class GenericIE(InfoExtractor):
              raise ExtractorError(u'Invalid URL: %s' % url)
  
          self.report_extraction(video_id)
+        # Look for Brightcove:
+        m_brightcove = re.search(r'<object.+?class=([\'"]).*?BrightcoveExperience.*?\1.+?</object>', webpage, re.DOTALL)
+        if m_brightcove is not None:
+            self.to_screen(u'Brightcove video detected.')
+            bc_url = BrightcoveIE._build_brighcove_url(m_brightcove.group())
+            return self.url_result(bc_url, 'Brightcove')
+
          # Start with something easy: JW Player in SWFObject
          mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
          if mobj is None:
diff --git a/youtube_dl/extractor/hotnewhiphop.py b/youtube_dl/extractor/hotnewhiphop.py

index ca3abb7d7fdaaf5c84869e1b4eda125d5076573a..ccca1d7e0bb41dae5694c2bd582728cc939b87da 100644 (file)
--- a/youtube_dl/extractor/hotnewhiphop.py
+++ b/youtube_dl/extractor/hotnewhiphop.py
@@ -33,16 +33,12 @@ class HotNewHipHopIE(InfoExtractor):
  
          video_title = self._html_search_regex(r"<title>(.*)</title>",
              webpage_src, u'title')
-        
-        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
-        thumbnail = self._html_search_regex(r'"og:image" content="(.*)"',
-            webpage_src, u'thumbnail', fatal=False)
  
          results = [{
                      'id': video_id,
                      'url' : video_url,
                      'title' : video_title,
-                    'thumbnail' : thumbnail,
+                    'thumbnail' : self._og_search_thumbnail(webpage_src),
                      'ext' : 'mp3',
                      }]
-        return results
-\ No newline at end of file
+        return results
diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py

new file mode 100644 (file)

index 0000000..62abab6
--- /dev/null
+++ b/youtube_dl/extractor/ign.py
@@ -0,0 +1,91 @@
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+)
+
+
+class IGNIE(InfoExtractor):
+    """
+    Extractor for some of the IGN sites, like www.ign.com, es.ign.com and de.ign.com.
+    Some videos of it.ign.com are also supported.
+    """
+
+    _VALID_URL = r'https?://.+?\.ign\.com/(?:videos|show_videos)(/.+)?/(?P<name_or_id>.+)'
+    IE_NAME = u'ign.com'
+
+    _CONFIG_URL_TEMPLATE = 'http://www.ign.com/videos/configs/id/%s.config'
+    _DESCRIPTION_RE = [r'<span class="page-object-description">(.+?)</span>',
+                       r'id="my_show_video">.*?<p>(.*?)</p>',
+                       ]
+
+    _TEST = {
+        u'url': u'http://www.ign.com/videos/2013/06/05/the-last-of-us-review',
+        u'file': u'8f862beef863986b2785559b9e1aa599.mp4',
+        u'md5': u'eac8bdc1890980122c3b66f14bdd02e9',
+        u'info_dict': {
+            u'title': u'The Last of Us Review',
+            u'description': u'md5:c8946d4260a4d43a00d5ae8ed998870c',
+        }
+    }
+
+    def _find_video_id(self, webpage):
+        res_id = [r'data-video-id="(.+?)"',
+                  r'<object id="vid_(.+?)"',
+                  r'<meta name="og:image" content=".*/(.+?)-(.+?)/.+.jpg"',
+                  ]
+        return self._search_regex(res_id, webpage, 'video id')
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        name_or_id = mobj.group('name_or_id')
+        webpage = self._download_webpage(url, name_or_id)
+        video_id = self._find_video_id(webpage)
+        result = self._get_video_info(video_id)
+        description = self._html_search_regex(self._DESCRIPTION_RE,
+                                              webpage, 'video description',
+                                              flags=re.DOTALL)
+        result['description'] = description
+        return result
+
+    def _get_video_info(self, video_id):
+        config_url = self._CONFIG_URL_TEMPLATE % video_id
+        config = json.loads(self._download_webpage(config_url, video_id,
+                            u'Downloading video info'))
+        media = config['playlist']['media']
+        video_url = media['url']
+
+        return {'id': media['metadata']['videoId'],
+                'url': video_url,
+                'ext': determine_ext(video_url),
+                'title': media['metadata']['title'],
+                'thumbnail': media['poster'][0]['url'].replace('{size}', 'grande'),
+                }
+
+
+class OneUPIE(IGNIE):
+    """Extractor for 1up.com, which uses the IGN video system."""
+
+    _VALID_URL = r'https?://gamevideos.1up.com/video/id/(?P<name_or_id>.+)'
+    IE_NAME = '1up.com'
+
+    _DESCRIPTION_RE = r'<div id="vid_summary">(.+?)</div>'
+
+    _TEST = {
+        u'url': u'http://gamevideos.1up.com/video/id/34976',
+        u'file': u'34976.mp4',
+        u'md5': u'68a54ce4ebc772e4b71e3123d413163d',
+        u'info_dict': {
+            u'title': u'Sniper Elite V2 - Trailer',
+            u'description': u'md5:5d289b722f5a6d940ca3136e9dae89cf',
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        id = mobj.group('name_or_id')
+        result = super(OneUPIE, self)._real_extract(url)
+        result['id'] = id
+        return result
diff --git a/youtube_dl/extractor/ina.py b/youtube_dl/extractor/ina.py

index 962c5921447e72a3f15ce20ca1f8e293acf26c44..652f19b7b8ea689d7861b04f6ff421c144c300d9 100644 (file)
--- a/youtube_dl/extractor/ina.py
+++ b/youtube_dl/extractor/ina.py
@@ -5,7 +5,7 @@ from .common import InfoExtractor
  
  class InaIE(InfoExtractor):
      """Information Extractor for Ina.fr"""
-    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
+    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I?[A-F0-9]+)/.*'
      _TEST = {
          u'url': u'www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html',
          u'file': u'I12055569.mp4',
diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py

index 6ae704efddce7a1b636cc9bd81b5244bbad95b2d..ddc42882a436a216cbd24b0b28d03da89ec27b0d 100644 (file)
--- a/youtube_dl/extractor/instagram.py
+++ b/youtube_dl/extractor/instagram.py
@@ -5,12 +5,13 @@ from .common import InfoExtractor
  class InstagramIE(InfoExtractor):
      _VALID_URL = r'(?:http://)?instagram.com/p/(.*?)/'
      _TEST = {
-        u'url': u'http://instagram.com/p/aye83DjauH/#',
+        u'url': u'http://instagram.com/p/aye83DjauH/?foo=bar#abc',
          u'file': u'aye83DjauH.mp4',
          u'md5': u'0d2da106a9d2631273e192b372806516',
          u'info_dict': {
              u"uploader_id": u"naomipq", 
-            u"title": u"Video by naomipq"
+            u"title": u"Video by naomipq",
+            u'description': u'md5:1f17f0ab29bd6fe2bfad705f58de3cb8',
          }
      }
  
@@ -18,25 +19,17 @@ class InstagramIE(InfoExtractor):
          mobj = re.match(self._VALID_URL, url)
          video_id = mobj.group(1)
          webpage = self._download_webpage(url, video_id)
-        video_url = self._html_search_regex(
-            r'<meta property="og:video" content="(.+?)"',
-            webpage, u'video URL')
-        thumbnail_url = self._html_search_regex(
-            r'<meta property="og:image" content="(.+?)" />',
-            webpage, u'thumbnail URL', fatal=False)
-        html_title = self._html_search_regex(
-            r'<title>(.+?)</title>',
-            webpage, u'title', flags=re.DOTALL)
-        title = re.sub(u'(?: *\(Videos?\))? \u2022 Instagram$', '', html_title).strip()
-        uploader_id = self._html_search_regex(r'content="(.*?)\'s video on Instagram',
-            webpage, u'uploader name', fatal=False)
-        ext = 'mp4'
+        uploader_id = self._search_regex(r'"owner":{"username":"(.+?)"',
+            webpage, u'uploader id', fatal=False)
+        desc = self._search_regex(r'"caption":"(.*?)"', webpage, u'description',
+            fatal=False)
  
          return [{
              'id':        video_id,
-            'url':       video_url,
-            'ext':       ext,
-            'title':     title,
-            'thumbnail': thumbnail_url,
-            'uploader_id' : uploader_id
+            'url':       self._og_search_video_url(webpage),
+            'ext':       'mp4',
+            'title':     u'Video by %s' % uploader_id,
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'uploader_id' : uploader_id,
+            'description': desc,
          }]
diff --git a/youtube_dl/extractor/kankan.py b/youtube_dl/extractor/kankan.py

new file mode 100644 (file)

index 0000000..8537ba5
--- /dev/null
+++ b/youtube_dl/extractor/kankan.py
@@ -0,0 +1,37 @@
+import re
+
+from .common import InfoExtractor
+from ..utils import determine_ext
+
+
+class KankanIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:.*?\.)?kankan\.com/.+?/(?P<id>\d+)\.shtml'
+    
+    _TEST = {
+        u'url': u'http://yinyue.kankan.com/vod/48/48863.shtml',
+        u'file': u'48863.flv',
+        u'md5': u'29aca1e47ae68fc28804aca89f29507e',
+        u'info_dict': {
+            u'title': u'Ready To Go',
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._search_regex(r'G_TITLE=[\'"](.+?)[\'"]', webpage, u'video title')
+        gcid = self._search_regex(r'lurl:[\'"]http://.+?/.+?/(.+?)/', webpage, u'gcid')
+
+        video_info_page = self._download_webpage('http://p2s.cl.kankan.com/getCdnresource_flv?gcid=%s' % gcid,
+                                                 video_id, u'Downloading video url info')
+        ip = self._search_regex(r'ip:"(.+?)"', video_info_page, u'video url ip')
+        path = self._search_regex(r'path:"(.+?)"', video_info_page, u'video url path')
+        video_url = 'http://%s%s' % (ip, path)
+
+        return {'id': video_id,
+                'title': title,
+                'url': video_url,
+                'ext': determine_ext(video_url),
+                }
diff --git a/youtube_dl/extractor/keek.py b/youtube_dl/extractor/keek.py

index 72ad6a3d00b25f30f8d56e06bf5a15da32b8a911..a7b88d2d96c728dab476d425cf0be3842dd57c6d 100644 (file)
--- a/youtube_dl/extractor/keek.py
+++ b/youtube_dl/extractor/keek.py
@@ -4,10 +4,10 @@ from .common import InfoExtractor
  
  
  class KeekIE(InfoExtractor):
-    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
+    _VALID_URL = r'https?://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
      IE_NAME = u'keek'
      _TEST = {
-        u'url': u'http://www.keek.com/ytdl/keeks/NODfbab',
+        u'url': u'https://www.keek.com/ytdl/keeks/NODfbab',
          u'file': u'NODfbab.mp4',
          u'md5': u'9b0636f8c0f7614afa4ea5e4c6e57e83',
          u'info_dict': {
@@ -24,8 +24,7 @@ class KeekIE(InfoExtractor):
          thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
          webpage = self._download_webpage(url, video_id)
  
-        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
-            webpage, u'title')
+        video_title = self._og_search_title(webpage)
  
          uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
              webpage, u'uploader', fatal=False)
diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py

index cf8a2c9312a53d8fe3f16b363cce31b2dd7c989d..dd062a14e736ba84b3aacb9d3bf426bca4c8f86f 100644 (file)
--- a/youtube_dl/extractor/liveleak.py
+++ b/youtube_dl/extractor/liveleak.py
@@ -33,11 +33,9 @@ class LiveLeakIE(InfoExtractor):
          video_url = self._search_regex(r'file: "(.*?)",',
              webpage, u'video URL')
  
-        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
-            webpage, u'title').replace('LiveLeak.com -', '').strip()
+        video_title = self._og_search_title(webpage).replace('LiveLeak.com -', '').strip()
  
-        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
-            webpage, u'description', fatal=False)
+        video_description = self._og_search_description(webpage)
  
          video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
              webpage, u'uploader', fatal=False)
diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py

new file mode 100644 (file)

index 0000000..3099210
--- /dev/null
+++ b/youtube_dl/extractor/livestream.py
@@ -0,0 +1,52 @@
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import compat_urllib_parse_urlparse, compat_urlparse
+
+
+class LivestreamIE(InfoExtractor):
+    _VALID_URL = r'http://new.livestream.com/.*?/(?P<event_name>.*?)(/videos/(?P<id>\d+))?/?$'
+    _TEST = {
+        u'url': u'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370',
+        u'file': u'4719370.mp4',
+        u'md5': u'0d2186e3187d185a04b3cdd02b828836',
+        u'info_dict': {
+            u'title': u'Live from Webster Hall NYC',
+            u'upload_date': u'20121012',
+        }
+    }
+
+    def _extract_video_info(self, video_data):
+        video_url = video_data.get('progressive_url_hd') or video_data.get('progressive_url')
+        return {'id': video_data['id'],
+                'url': video_url,
+                'ext': 'mp4',
+                'title': video_data['caption'],
+                'thumbnail': video_data['thumbnail_url'],
+                'upload_date': video_data['updated_at'].replace('-','')[:8],
+                }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        event_name = mobj.group('event_name')
+        webpage = self._download_webpage(url, video_id or event_name)
+
+        if video_id is None:
+            # This is an event page:
+            api_url = self._search_regex(r'event_design_eventId: \'(.+?)\'',
+                                         webpage, 'api url')
+            info = json.loads(self._download_webpage(api_url, event_name,
+                                                     u'Downloading event info'))
+            videos = [self._extract_video_info(video_data['data'])
+                for video_data in info['feed']['data'] if video_data['type'] == u'video']
+            return self.playlist_result(videos, info['id'], info['full_name'])
+        else:
+            og_video = self._og_search_video_url(webpage, name=u'player url')
+            query_str = compat_urllib_parse_urlparse(og_video).query
+            query = compat_urlparse.parse_qs(query_str)
+            api_url = query['play_url'][0].replace('.smil', '')
+            info = json.loads(self._download_webpage(api_url, video_id,
+                                                     u'Downloading video info'))
+            return self._extract_video_info(info)
diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py

index 4c3f81b989ed3ff51dda83909d7da88a2a2f6eb8..e38dc98b4c2702be6b488e2e516e1a6ea95c9d8d 100644 (file)
--- a/youtube_dl/extractor/metacafe.py
+++ b/youtube_dl/extractor/metacafe.py
@@ -9,7 +9,7 @@ from ..utils import (
      compat_urllib_parse,
      compat_urllib_request,
      compat_str,
-
+    determine_ext,
      ExtractorError,
  )
  
@@ -20,7 +20,7 @@ class MetacafeIE(InfoExtractor):
      _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
      _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
      IE_NAME = u'metacafe'
-    _TEST = {
+    _TESTS = [{
          u"add_ie": ["Youtube"],
          u"url":  u"http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/",
          u"file":  u"_aUehQsCQtM.flv",
@@ -31,7 +31,16 @@ class MetacafeIE(InfoExtractor):
              u"uploader": u"PBS",
              u"uploader_id": u"PBS"
          }
-    }
+    },
+    {
+        u"url": u"http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/the_andromeda_strain_1971_stop_the_bomb_part_3/",
+        u"file": u"an-dVVXnuY7Jh77J.mp4",
+        u"info_dict": {
+            u"title": u"The Andromeda Strain (1971): Stop the Bomb Part 3",
+            u"uploader": u"anyclip",
+            u"description": u"md5:38c711dd98f5bb87acf973d573442e67"
+        }
+    }]
  
  
      def report_disclaimer(self):
@@ -73,14 +82,16 @@ class MetacafeIE(InfoExtractor):
              return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
  
          # Retrieve video webpage to extract further information
-        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
+        req = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
+        req.headers['Cookie'] = 'flashVersion=0;'
+        webpage = self._download_webpage(req, video_id)
  
          # Extract URL, uploader and title from webpage
          self.report_extraction(video_id)
          mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
          if mobj is not None:
              mediaURL = compat_urllib_parse.unquote(mobj.group(1))
-            video_extension = mediaURL[-3:]
+            video_ext = mediaURL[-3:]
  
              # Extract gdaKey if available
              mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
@@ -90,34 +101,37 @@ class MetacafeIE(InfoExtractor):
                  gdaKey = mobj.group(1)
                  video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
          else:
-            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
-            if mobj is None:
-                raise ExtractorError(u'Unable to extract media URL')
-            vardict = compat_parse_qs(mobj.group(1))
-            if 'mediaData' not in vardict:
-                raise ExtractorError(u'Unable to extract media URL')
-            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
-            if mobj is None:
-                raise ExtractorError(u'Unable to extract media URL')
-            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
-            video_extension = mediaURL[-3:]
-            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
-
-        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract title')
-        video_title = mobj.group(1).decode('utf-8')
-
-        mobj = re.search(r'submitter=(.*?);', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract uploader nickname')
-        video_uploader = mobj.group(1)
-
-        return [{
-            'id':       video_id.decode('utf-8'),
-            'url':      video_url.decode('utf-8'),
-            'uploader': video_uploader.decode('utf-8'),
+            mobj = re.search(r'<video src="([^"]+)"', webpage)
+            if mobj:
+                video_url = mobj.group(1)
+                video_ext = 'mp4'
+            else:
+                mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
+                if mobj is None:
+                    raise ExtractorError(u'Unable to extract media URL')
+                vardict = compat_parse_qs(mobj.group(1))
+                if 'mediaData' not in vardict:
+                    raise ExtractorError(u'Unable to extract media URL')
+                mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
+                if mobj is None:
+                    raise ExtractorError(u'Unable to extract media URL')
+                mediaURL = mobj.group('mediaURL').replace('\\/', '/')
+                video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
+                video_ext = determine_ext(video_url)
+
+        video_title = self._html_search_regex(r'(?im)<title>(.*) - Video</title>', webpage, u'title')
+        description = self._og_search_description(webpage)
+        video_uploader = self._html_search_regex(
+                r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("channel","([^"]+)"\);',
+                webpage, u'uploader nickname', fatal=False)
+
+        return {
+            '_type':    'video',
+            'id':       video_id,
+            'url':      video_url,
+            'description': description,
+            'uploader': video_uploader,
              'upload_date':  None,
              'title':    video_title,
-            'ext':      video_extension.decode('utf-8'),
-        }]
+            'ext':      video_ext,
+        }
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py

index 969db71139b1f81d290ad6549ed3b3d8207da0c7..8f956571d54dc4a42a4f3726642929e4b2497f13 100644 (file)
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -1,28 +1,110 @@
  import re
-import socket
  import xml.etree.ElementTree
  
  from .common import InfoExtractor
  from ..utils import (
-    compat_http_client,
-    compat_str,
-    compat_urllib_error,
-    compat_urllib_request,
-
+    compat_urllib_parse,
      ExtractorError,
  )
  
+def _media_xml_tag(tag):
+    return '{http://search.yahoo.com/mrss/}%s' % tag
  
  class MTVIE(InfoExtractor):
-    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
-    _WORKING = False
+    _VALID_URL = r'^https?://(?:www\.)?mtv\.com/videos/.+?/(?P<videoid>[0-9]+)/[^/]+$'
+
+    _FEED_URL = 'http://www.mtv.com/player/embed/AS3/rss/'
+
+    _TESTS = [
+        {
+            u'url': u'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml',
+            u'file': u'853555.mp4',
+            u'md5': u'850f3f143316b1e71fa56a4edfd6e0f8',
+            u'info_dict': {
+                u'title': u'Taylor Swift - "Ours (VH1 Storytellers)"',
+                u'description': u'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.',
+            },
+        },
+        {
+            u'url': u'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml',
+            u'file': u'USCJY1331283.mp4',
+            u'md5': u'73b4e7fcadd88929292fe52c3ced8caf',
+            u'info_dict': {
+                u'title': u'Everything Has Changed',
+                u'upload_date': u'20130606',
+                u'uploader': u'Taylor Swift',
+            },
+            u'skip': u'VEVO is only available in some countries',
+        },
+    ]
+
+    @staticmethod
+    def _id_from_uri(uri):
+        return uri.split(':')[-1]
+
+    # This was originally implemented for ComedyCentral, but it also works here
+    @staticmethod
+    def _transform_rtmp_url(rtmp_video_url):
+        m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp\..+?/.*)$', rtmp_video_url)
+        if not m:
+            raise ExtractorError(u'Cannot transform RTMP url')
+        base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
+        return base + m.group('finalid')
+
+    def _get_thumbnail_url(self, uri, itemdoc):
+        return 'http://mtv.mtvnimages.com/uri/' + uri
+
+    def _extract_video_url(self, metadataXml):
+        if '/error_country_block.swf' in metadataXml:
+            raise ExtractorError(u'This video is not available from your country.', expected=True)
+        mdoc = xml.etree.ElementTree.fromstring(metadataXml.encode('utf-8'))
+        renditions = mdoc.findall('.//rendition')
+
+        # For now, always pick the highest quality.
+        rendition = renditions[-1]
+
+        try:
+            _,_,ext = rendition.attrib['type'].partition('/')
+            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
+            rtmp_video_url = rendition.find('./src').text
+        except KeyError:
+            raise ExtractorError('Invalid rendition field.')
+        video_url = self._transform_rtmp_url(rtmp_video_url)
+        return {'ext': ext, 'url': video_url, 'format': format}
+
+    def _get_video_info(self, itemdoc):
+        uri = itemdoc.find('guid').text
+        video_id = self._id_from_uri(uri)
+        self.report_extraction(video_id)
+        mediagen_url = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content'))).attrib['url']
+        if 'acceptMethods' not in mediagen_url:
+            mediagen_url += '&acceptMethods=fms'
+        mediagen_page = self._download_webpage(mediagen_url, video_id,
+                                               u'Downloading video urls')
+        video_info = self._extract_video_url(mediagen_page)
+
+        description_node = itemdoc.find('description')
+        if description_node is not None:
+            description = description_node.text
+        else:
+            description = None
+        video_info.update({'title': itemdoc.find('title').text,
+                           'id': video_id,
+                           'thumbnail': self._get_thumbnail_url(uri, itemdoc),
+                           'description': description,
+                           })
+        return video_info
+
+    def _get_videos_info(self, uri):
+        video_id = self._id_from_uri(uri)
+        data = compat_urllib_parse.urlencode({'uri': uri})
+        infoXml = self._download_webpage(self._FEED_URL +'?' + data, video_id,
+                                         u'Downloading info')
+        idoc = xml.etree.ElementTree.fromstring(infoXml.encode('utf-8'))
+        return [self._get_video_info(item) for item in idoc.findall('.//item')]
  
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
-        if not mobj.group('proto'):
-            url = 'http://' + url
          video_id = mobj.group('videoid')
  
          webpage = self._download_webpage(url, video_id)
@@ -35,46 +117,5 @@ class MTVIE(InfoExtractor):
              self.to_screen(u'Vevo video detected: %s' % vevo_id)
              return self.url_result('vevo:%s' % vevo_id, ie='Vevo')
  
-        #song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
-        #    webpage, u'song name', fatal=False)
-
-        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
-            webpage, u'title')
-
-        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
-            webpage, u'mtvn_uri', fatal=False)
-
-        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
-            webpage, u'content id', fatal=False)
-
-        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
-        self.report_extraction(video_id)
-        request = compat_urllib_request.Request(videogen_url)
-        try:
-            metadataXml = compat_urllib_request.urlopen(request).read()
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
-
-        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
-        renditions = mdoc.findall('.//rendition')
-
-        # For now, always pick the highest quality.
-        rendition = renditions[-1]
-
-        try:
-            _,_,ext = rendition.attrib['type'].partition('/')
-            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
-            video_url = rendition.find('./src').text
-        except KeyError:
-            raise ExtractorError('Invalid rendition field.')
-
-        info = {
-            'id': video_id,
-            'url': video_url,
-            'upload_date': None,
-            'title': video_title,
-            'ext': ext,
-            'format': format,
-        }
-
-        return [info]
+        uri = self._html_search_regex(r'/uri/(.*?)\?', webpage, u'uri')
+        return self._get_videos_info(uri)
diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py

index 122b7dd2628e3b1cd43ffb9dbb67035047745c9f..0f178905bfe0b049499dd58f71df42da1c419639 100644 (file)
--- a/youtube_dl/extractor/nba.py
+++ b/youtube_dl/extractor/nba.py
@@ -30,8 +30,7 @@ class NBAIE(InfoExtractor):
          video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
  
          shortened_video_id = video_id.rpartition('/')[2]
-        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
-            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
+        title = self._og_search_title(webpage, default=shortened_video_id).replace('NBA.com: ', '')
  
          # It isn't there in the HTML it returns to us
          # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
diff --git a/youtube_dl/extractor/roxwel.py b/youtube_dl/extractor/roxwel.py

new file mode 100644 (file)

index 0000000..d339e6c
--- /dev/null
+++ b/youtube_dl/extractor/roxwel.py
@@ -0,0 +1,49 @@
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import unified_strdate, determine_ext
+
+
+class RoxwelIE(InfoExtractor):
+    """Extractor for roxwel.com player pages."""
+    _VALID_URL = r'https?://www\.roxwel\.com/player/(?P<filename>.+?)(\.|\?|$)'
+
+    _TEST = {
+        u'url': u'http://www.roxwel.com/player/passionpittakeawalklive.html',
+        u'file': u'passionpittakeawalklive.flv',
+        u'md5': u'd9dea8360a1e7d485d2206db7fe13035',
+        u'info_dict': {
+            u'title': u'Take A Walk (live)',
+            u'uploader': u'Passion Pit',
+            u'description': u'Passion Pit performs "Take A Walk\" live at The Backyard in Austin, Texas. ',
+        },
+        u'skip': u'Requires rtmpdump',
+    }
+
+    def _real_extract(self, url):
+        """Query the video API, pick the highest flv rate and fetch its stream URL."""
+        filename = re.match(self._VALID_URL, url).group('filename')
+        info_page = self._download_webpage(
+            'http://www.roxwel.com/api/videos/%s' % filename, filename, u'Downloading video info')
+
+        self.report_extraction(filename)
+        info = json.loads(info_page)
+        # Only the 'flv_<rate>' entries of media_rates are considered.
+        flv_rates = [int(name.replace('flv_', ''))
+                     for name in info['media_rates'] if name.startswith('flv_')]
+        best_rate = sorted(flv_rates)[-1]
+        url_page_url = 'http://roxwel.com/pl_one_time.php?filename=%s&quality=%s' % (filename, best_rate)
+        rtmp_url = self._download_webpage(url_page_url, filename, u'Downloading video url')
+        if determine_ext(rtmp_url) == 'f4v':
+            rtmp_url = rtmp_url.replace(filename, 'mp4:%s' % filename)
+
+        return {
+            'id': filename, 'title': info['title'],
+            'url': rtmp_url, 'ext': 'flv',
+            'description': info['description'],
+            'thumbnail': info.get('player_image_url') or info.get('image_url_large'),
+            'uploader': info['artist'],
+            'uploader_id': info['artistname'],
+            'upload_date': unified_strdate(info['dbdate']),
+        }
diff --git a/youtube_dl/extractor/sina.py b/youtube_dl/extractor/sina.py

new file mode 100644 (file)

index 0000000..14b1c65
--- /dev/null
+++ b/youtube_dl/extractor/sina.py
@@ -0,0 +1,67 @@
+# coding: utf-8
+
+import re
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_request,
+    compat_urllib_parse,
+)
+
+
+class SinaIE(InfoExtractor):
+    """Extractor for video.sina.com.cn pages and the outplay.php player links."""
+    _VALID_URL = r'''https?://(.*?\.)?video\.sina\.com\.cn/
+                        (
+                            (.+?/(((?P<pseudo_id>\d+).html)|(.*?(\#|(vid=))(?P<id>\d+?)($|&))))
+                            |
+                            # This is used by external sites like Weibo
+                            (api/sinawebApi/outplay.php/(?P<token>.+?)\.swf)
+                        )
+                  '''
+
+    _TEST = {
+        u'url': u'http://video.sina.com.cn/news/vlist/zt/chczlj2013/?opsubject_id=top12#110028898',
+        u'file': u'110028898.flv',
+        u'md5': u'd65dd22ddcf44e38ce2bf58a10c3e71f',
+        u'info_dict': {
+            u'title': u'《中国新闻》 朝鲜要求巴拿马立即释放被扣船员',
+        }
+    }
+
+    @classmethod
+    def suitable(cls, url):
+        """Match with re.VERBOSE, since _VALID_URL is laid out over several lines."""
+        return bool(re.match(cls._VALID_URL, url, flags=re.VERBOSE))
+
+    def _extract_video(self, video_id):
+        """Download the stream and thumbnail descriptions of `video_id`; return its info dict."""
+        query = compat_urllib_parse.urlencode({'vid': video_id})
+        url_page = self._download_webpage(
+            'http://v.iask.com/v_play.php?%s' % query, video_id, u'Downloading video url')
+        image_page = self._download_webpage(
+            'http://interface.video.sina.com.cn/interface/common/getVideoImage.php?%s' % query,
+            video_id, u'Downloading thumbnail info')
+        url_doc = xml.etree.ElementTree.fromstring(url_page.encode('utf-8'))
+        video_url = url_doc.find('./durl/url').text
+        title = url_doc.find('./vname').text
+        thumbnail = image_page.split('=')[1]
+        return {'id': video_id, 'url': video_url, 'ext': 'flv', 'title': title, 'thumbnail': thumbnail}
+
+    def _real_extract(self, url):
+        """Find the numeric video id (from the URL, its page or its redirect) and extract it."""
+        mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
+        if mobj.group('token') is not None:
+            # The video id is in the redirected url
+            self.to_screen(u'Getting video id')
+            request = compat_urllib_request.Request(url)
+            request.get_method = lambda: 'HEAD'
+            _, redirected = self._download_webpage_handle(request, 'NA', False)
+            return self._real_extract(redirected.geturl())
+        video_id = mobj.group('id')
+        if video_id is None:
+            pseudo_id = mobj.group('pseudo_id')
+            webpage = self._download_webpage(url, pseudo_id)
+            video_id = self._search_regex(r'vid:\'(\d+?)\'', webpage, u'video id')
+        return self._extract_video(video_id)
diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py

index d47c49c03f6e5445e72255e79495baae3a16e115..7c9f1c6b65998d57515b65dea5e9120772e0b019 100644 (file)
--- a/youtube_dl/extractor/soundcloud.py
+++ b/youtube_dl/extractor/soundcloud.py
@@ -19,7 +19,11 @@ class SoundcloudIE(InfoExtractor):
         of the stream token and uid
       """
  
-    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)(?:[?].*)?$'
+    _VALID_URL = r'''^(?:https?://)?
+                    (?:(?:(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)/?(?:[?].*)?$)
+                       |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+))
+                    )
+                    '''
      IE_NAME = u'soundcloud'
      _TEST = {
          u'url': u'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy',
@@ -33,59 +37,65 @@ class SoundcloudIE(InfoExtractor):
          }
      }
  
+    _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28'
+
+    @classmethod
+    def suitable(cls, url):  # _VALID_URL is matched in re.VERBOSE mode
+        return bool(re.match(cls._VALID_URL, url, flags=re.VERBOSE))
+
      def report_resolve(self, video_id):
          """Report information extraction."""
          self.to_screen(u'%s: Resolving id' % video_id)
  
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
-
-        # extract uploader (which is in the url)
-        uploader = mobj.group(1)
-        # extract simple title (uploader + slug of song title)
-        slug_title =  mobj.group(2)
-        full_title = '%s/%s' % (uploader, slug_title)
-
-        self.report_resolve(full_title)
-
-        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
-        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
-        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
+    @classmethod
+    def _resolv_url(cls, url):  # build the resolve.json request URL for `url`
+        return ''.join(['http://api.soundcloud.com/resolve.json?url=', url, '&client_id=', cls._CLIENT_ID])
  
-        info = json.loads(info_json)
+    def _extract_info_dict(self, info, full_title=None):
          video_id = info['id']
-        self.report_extraction(full_title)
-
-        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
-        stream_json = self._download_webpage(streams_url, full_title,
-                                             u'Downloading stream definitions',
-                                             u'unable to download stream definitions')
-
-        streams = json.loads(stream_json)
-        mediaURL = streams['http_mp3_128_url']
-        upload_date = unified_strdate(info['created_at'])
+        name = full_title or video_id
+        self.report_extraction(name)
  
-        return [{
+        thumbnail = info['artwork_url']
+        if thumbnail is not None:
+            thumbnail = thumbnail.replace('-large', '-t500x500')
+        return {
              'id':       info['id'],
-            'url':      mediaURL,
+            'url':      info['stream_url'] + '?client_id=' + self._CLIENT_ID,
              'uploader': info['user']['username'],
-            'upload_date': upload_date,
+            'upload_date': unified_strdate(info['created_at']),
              'title':    info['title'],
              'ext':      u'mp3',
              'description': info['description'],
-        }]
+            'thumbnail': thumbnail,
+        }
  
-class SoundcloudSetIE(InfoExtractor):
-    """Information extractor for soundcloud.com sets
-       To access the media, the uid of the song and a stream token
-       must be extracted from the page source and the script must make
-       a request to media.soundcloud.com/crossdomain.xml. Then
-       the media can be grabbed by requesting from an url composed
-       of the stream token and uid
-     """
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
+        if mobj is None:
+            raise ExtractorError(u'Invalid URL: %s' % url)
+
+        track_id = mobj.group('track_id')
+        if track_id is not None:
+            info_json_url = 'http://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID
+            full_title = track_id
+        else:
+            # extract uploader (which is in the url)
+            uploader = mobj.group(1)
+            # extract simple title (uploader + slug of song title)
+            slug_title =  mobj.group(2)
+            full_title = '%s/%s' % (uploader, slug_title)
+    
+            self.report_resolve(full_title)
+    
+            url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
+            info_json_url = self._resolv_url(url)
+        info_json = self._download_webpage(info_json_url, full_title, u'Downloading info JSON')
  
+        info = json.loads(info_json)
+        return self._extract_info_dict(info, full_title)
+
+class SoundcloudSetIE(SoundcloudIE):
      _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)(?:[?].*)?$'
      IE_NAME = u'soundcloud:set'
      _TEST = {
@@ -153,10 +163,6 @@ class SoundcloudSetIE(InfoExtractor):
          ]
      }
  
-    def report_resolve(self, video_id):
-        """Report information extraction."""
-        self.to_screen(u'%s: Resolving id' % video_id)
-
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
          if mobj is None:
@@ -171,7 +177,7 @@ class SoundcloudSetIE(InfoExtractor):
          self.report_resolve(full_title)
  
          url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
-        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
+        resolv_url = self._resolv_url(url)
          info_json = self._download_webpage(resolv_url, full_title)
  
          videos = []
@@ -182,23 +188,8 @@ class SoundcloudSetIE(InfoExtractor):
              return
  
          self.report_extraction(full_title)
-        for track in info['tracks']:
-            video_id = track['id']
-
-            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
-            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
-
-            self.report_extraction(video_id)
-            streams = json.loads(stream_json)
-            mediaURL = streams['http_mp3_128_url']
-
-            videos.append({
-                'id':       video_id,
-                'url':      mediaURL,
-                'uploader': track['user']['username'],
-                'upload_date':  unified_strdate(track['created_at']),
-                'title':    track['title'],
-                'ext':      u'mp3',
-                'description': track['description'],
-            })
-        return videos
+        return {'_type': 'playlist',
+                'entries': [self._extract_info_dict(track) for track in info['tracks']],
+                'id': info['id'],
+                'title': info['title'],
+                }
diff --git a/youtube_dl/extractor/statigram.py b/youtube_dl/extractor/statigram.py

index ae9a63e8b4e018c1cc3625aa8bc75fe37d62922a..b8e6b3bf91a05a793db631db9325bb2bc605b8b9 100644 (file)
--- a/youtube_dl/extractor/statigram.py
+++ b/youtube_dl/extractor/statigram.py
@@ -18,12 +18,6 @@ class StatigramIE(InfoExtractor):
          mobj = re.match(self._VALID_URL, url)
          video_id = mobj.group(1)
          webpage = self._download_webpage(url, video_id)
-        video_url = self._html_search_regex(
-            r'<meta property="og:video:secure_url" content="(.+?)">',
-            webpage, u'video URL')
-        thumbnail_url = self._html_search_regex(
-            r'<meta property="og:image" content="(.+?)" />',
-            webpage, u'thumbnail URL', fatal=False)
          html_title = self._html_search_regex(
              r'<title>(.+?)</title>',
              webpage, u'title')
@@ -34,9 +28,9 @@ class StatigramIE(InfoExtractor):
  
          return [{
              'id':        video_id,
-            'url':       video_url,
+            'url':       self._og_search_video_url(webpage),
              'ext':       ext,
              'title':     title,
-            'thumbnail': thumbnail_url,
+            'thumbnail': self._og_search_thumbnail(webpage),
              'uploader_id' : uploader_id
          }]
diff --git a/youtube_dl/extractor/steam.py b/youtube_dl/extractor/steam.py

index ecac4ec40b48d34d7b4e849cb4af0a4ba565b3b9..91658f8925cac6199bda5f7aa05aa0a2a73e85e4 100644 (file)
--- a/youtube_dl/extractor/steam.py
+++ b/youtube_dl/extractor/steam.py
@@ -23,14 +23,16 @@ class SteamIE(InfoExtractor):
                  u"file": u"81300.flv",
                  u"md5": u"f870007cee7065d7c76b88f0a45ecc07",
                  u"info_dict": {
-                        u"title": u"Terraria 1.1 Trailer"
+                        u"title": u"Terraria 1.1 Trailer",
+                        u'playlist_index': 1,
                  }
              },
              {
                  u"file": u"80859.flv",
                  u"md5": u"61aaf31a5c5c3041afb58fb83cbb5751",
                  u"info_dict": {
-                    u"title": u"Terraria Trailer"
+                    u"title": u"Terraria Trailer",
+                    u'playlist_index': 2,
                  }
              }
          ]
diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py

index 1dd5e1b685e7aa99804d51d99a594945f30961a6..c910110ca9775d9ad03011238aacdc3c9ef4dae1 100644 (file)
--- a/youtube_dl/extractor/teamcoco.py
+++ b/youtube_dl/extractor/teamcoco.py
@@ -30,26 +30,17 @@ class TeamcocoIE(InfoExtractor):
  
          self.report_extraction(video_id)
  
-        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
-            webpage, u'title')
-
-        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
-            webpage, u'thumbnail', fatal=False)
-
-        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
-            webpage, u'description', fatal=False)
-
          data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
          data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
  
-        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
+        video_url = self._html_search_regex(r'<file [^>]*type="high".*?>(.*?)</file>',
              data, u'video URL')
  
          return [{
              'id':          video_id,
              'url':         video_url,
              'ext':         'mp4',
-            'title':       video_title,
-            'thumbnail':   thumbnail,
-            'description': video_description,
+            'title':       self._og_search_title(webpage),
+            'thumbnail':   self._og_search_thumbnail(webpage),
+            'description': self._og_search_description(webpage),
          }]
diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py

index 8b73b8340c40badad0023a53cc5b10b363e57b6a..4c11f7a03c37136c0c80677e55b66598c647edeb 100644 (file)
--- a/youtube_dl/extractor/ted.py
+++ b/youtube_dl/extractor/ted.py
@@ -67,7 +67,7 @@ class TEDIE(InfoExtractor):
          webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
          self.report_extraction(video_name)
          # If the url includes the language we get the title translated
-        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
+        title = self._html_search_regex(r'<span .*?id="altHeadline".+?>(?P<title>.*)</span>',
                                          webpage, 'title')
          json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                      webpage, 'json data')
diff --git a/youtube_dl/extractor/tf1.py b/youtube_dl/extractor/tf1.py

index e0ffeced50eabae1a9f7d0a9920e1df18c5fa4c6..772134a128e6f75d3a15d4fbb4ee37a776edfe10 100644 (file)
--- a/youtube_dl/extractor/tf1.py
+++ b/youtube_dl/extractor/tf1.py
@@ -6,19 +6,17 @@ import re
  from .common import InfoExtractor
  
  class TF1IE(InfoExtractor):
-    """
-    TF1 uses the wat.tv player, currently it can only download videos with the
-    html5 player enabled, it cannot download HD videos.
-    """
+    """TF1 uses the wat.tv player."""
      _VALID_URL = r'http://videos.tf1.fr/.*-(.*?).html'
      _TEST = {
          u'url': u'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',
          u'file': u'10635995.mp4',
-        u'md5': u'66789d3e91278d332f75e1feb7aea327',
+        u'md5': u'2e378cc28b9957607d5e88f274e637d8',
          u'info_dict': {
              u'title': u'Citroën Grand C4 Picasso 2013 : présentation officielle',
              u'description': u'Vidéo officielle du nouveau Citroën Grand C4 Picasso, lancé à l\'automne 2013.',
-        }
+        },
+        u'skip': u'Sometimes wat serves the whole file with the --test option',
      }
  
      def _real_extract(self, url):
diff --git a/youtube_dl/extractor/thisav.py b/youtube_dl/extractor/thisav.py

new file mode 100644 (file)

index 0000000..9dcfc28
--- /dev/null
+++ b/youtube_dl/extractor/thisav.py
@@ -0,0 +1,47 @@
+#coding: utf-8
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+)
+
+class ThisAVIE(InfoExtractor):
+    """Extractor for thisav.com video pages."""
+    _VALID_URL = r'https?://(?:www\.)?thisav\.com/video/(?P<id>[0-9]+)/.*'
+    _TEST = {
+        u"url": u"http://www.thisav.com/video/47734/%98%26sup1%3B%83%9E%83%82---just-fit.html",
+        u"file": u"47734.flv",
+        u"md5": u"0480f1ef3932d901f0e0e719f188f19b",
+        u"info_dict": {
+            u"title": u"高樹マリア - Just fit",
+            u"uploader": u"dj7970",
+            u"uploader_id": u"dj7970"
+        }
+    }
+
+    def _real_extract(self, url):
+        """Scrape the title, media URL and uploader fields from the video page."""
+        video_id = re.match(self._VALID_URL, url).group('id')
+        page = self._download_webpage(url, video_id)
+        def search(pattern, name, **kwargs):
+            return self._html_search_regex(pattern, page, name, **kwargs)
+
+        title = search(r'<h1>([^<]*)</h1>', u'title')
+        media_url = search(r"addVariable\('file','([^']+)'\);", u'video url')
+        uploader = search(
+            r': <a href="http://www.thisav.com/user/[0-9]+/(?:[^"]+)">([^<]+)</a>',
+            u'uploader name', fatal=False)
+        uploader_id = search(
+            r': <a href="http://www.thisav.com/user/[0-9]+/([^"]+)">(?:[^<]+)</a>',
+            u'uploader id', fatal=False)
+        return {
+            '_type':       'video',
+            'id':          video_id,
+            'url':         media_url,
+            'uploader':    uploader,
+            'uploader_id': uploader_id,
+            'title':       title,
+            'ext':         determine_ext(media_url),
+        }
diff --git a/youtube_dl/extractor/traileraddict.py b/youtube_dl/extractor/traileraddict.py

index 9dd26c1637f58e5a823a6cf75ba8fd4ee42f0750..35f89e9eecb0145988f8f503dcb6b18aea5d0e93 100644 (file)
--- a/youtube_dl/extractor/traileraddict.py
+++ b/youtube_dl/extractor/traileraddict.py
@@ -4,11 +4,11 @@ from .common import InfoExtractor
  
  
  class TrailerAddictIE(InfoExtractor):
-    _VALID_URL = r'(?:http://)?(?:www\.)?traileraddict\.com/trailer/([^/]+)/(?:trailer|feature-trailer)'
+    _VALID_URL = r'(?:http://)?(?:www\.)?traileraddict\.com/(?:trailer|clip)/(?P<movie>.+?)/(?P<trailer_name>.+)'
      _TEST = {
          u'url': u'http://www.traileraddict.com/trailer/prince-avalanche/trailer',
          u'file': u'76184.mp4',
-        u'md5': u'41365557f3c8c397d091da510e73ceb4',
+        u'md5': u'57e39dbcf4142ceb8e1f242ff423fd71',
          u'info_dict': {
              u"title": u"Prince Avalanche Trailer",
              u"description": u"Trailer for Prince Avalanche.Two highway road workers spend the summer of 1988 away from their city lives. The isolated landscape becomes a place of misadventure as the men find themselves at odds with each other and the women they left behind."
@@ -17,33 +17,36 @@ class TrailerAddictIE(InfoExtractor):
  
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group(1)
-        webpage = self._download_webpage(url, video_id)
-        
+        name = mobj.group('movie') + '/' + mobj.group('trailer_name')
+        webpage = self._download_webpage(url, name)
+
          title = self._search_regex(r'<title>(.+?)</title>',
                  webpage, 'video title').replace(' - Trailer Addict','')
          view_count = self._search_regex(r'Views: (.+?)<br />',
                  webpage, 'Views Count')
-        description = self._search_regex(r'<meta property="og:description" content="(.+?)" />',
-                webpage, 'video description')
-        video_id = self._search_regex(r'<meta property="og:video" content="(.+?)" />',
-                webpage, 'Video id').split('=')[1]
-        
-        info_url = "http://www.traileraddict.com/fvar.php?tid=%s" %(str(video_id))
+        video_id = self._og_search_property('video', webpage, 'Video id').split('=')[1]
+
+        # Presence of (no)watchplus function indicates HD quality is available
+        if re.search(r'function (no)?watchplus()', webpage):
+            fvar = "fvarhd"
+        else:
+            fvar = "fvar"
+
+        info_url = "http://www.traileraddict.com/%s.php?tid=%s" % (fvar, str(video_id))
          info_webpage = self._download_webpage(info_url, video_id , "Downloading the info webpage")
-        
+
          final_url = self._search_regex(r'&fileurl=(.+)',
                  info_webpage, 'Download url').replace('%3F','?')
          thumbnail_url = self._search_regex(r'&image=(.+?)&',
                  info_webpage, 'thumbnail url')
          ext = final_url.split('.')[-1].split('?')[0]
-        
+
          return [{
              'id'          : video_id,
              'url'         : final_url,
              'ext'         : ext,
              'title'       : title,
              'thumbnail'   : thumbnail_url,
-            'description' : description,
+            'description' : self._og_search_description(webpage),
              'view_count'  : view_count,
          }]
diff --git a/youtube_dl/extractor/tutv.py b/youtube_dl/extractor/tutv.py

index fcaa6ac01af6d778e43aa7b35d92d3dcc9478911..4e404fbf5912fd32b695c701466309a38179e799 100644 (file)
--- a/youtube_dl/extractor/tutv.py
+++ b/youtube_dl/extractor/tutv.py
@@ -22,8 +22,6 @@ class TutvIE(InfoExtractor):
          video_id = mobj.group('id')
  
          webpage = self._download_webpage(url, video_id)
-        title = self._html_search_regex(
-            r'<meta property="og:title" content="(.*?)">', webpage, u'title')
          internal_id = self._search_regex(r'codVideo=([0-9]+)', webpage, u'internal video ID')
  
          data_url = u'http://tu.tv/flvurl.php?codVideo=' + str(internal_id)
@@ -36,6 +34,6 @@ class TutvIE(InfoExtractor):
              'id': internal_id,
              'url': video_url,
              'ext': ext,
-            'title': title,
+            'title': self._og_search_title(webpage),
          }
          return [info]
diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py

index 3b16dcfbc160b34c787d7dd99e9b54fb2dea1c6b..67537eae5afecc158c13864c4dc494fd366076c3 100644 (file)
--- a/youtube_dl/extractor/vevo.py
+++ b/youtube_dl/extractor/vevo.py
@@ -35,12 +35,12 @@ class VevoIE(InfoExtractor):
  
          self.report_extraction(video_id)
          video_info = json.loads(info_json)
-        m_urls = list(re.finditer(r'<video src="(?P<ext>.*?):(?P<url>.*?)"', links_webpage))
+        m_urls = list(re.finditer(r'<video src="(?P<ext>.*?):/?(?P<url>.*?)"', links_webpage))
          if m_urls is None or len(m_urls) == 0:
              raise ExtractorError(u'Unable to extract video url')
          # They are sorted from worst to best quality
          m_url = m_urls[-1]
-        video_url = base_url + m_url.group('url')
+        video_url = base_url + '/' + m_url.group('url')
          ext = m_url.group('ext')
  
          return {'url': video_url,
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py

index ac32043c1e651abaadf1a223c18e4983b9301ba7..cc9c8d0188749761b79b90652d3c0c843a24eda3 100644 (file)
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -1,5 +1,6 @@
  import json
  import re
+import itertools
  
  from .common import InfoExtractor
  from ..utils import (
@@ -171,3 +172,31 @@ class VimeoIE(InfoExtractor):
              'thumbnail':    video_thumbnail,
              'description':  video_description,
          }]
+
+
+class VimeoChannelIE(InfoExtractor):
+    """Collect the clips listed on a vimeo.com/channels/<id> page into a playlist."""
+    IE_NAME = u'vimeo:channel'
+    _VALID_URL = r'(?:https?://)?vimeo\.com/channels/(?P<id>[^/]+)'
+    _MORE_PAGES_INDICATOR = r'<a.+?rel="next"'
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        channel_id = mobj.group('id')
+        video_ids = []
+
+        # Walk the numbered listing pages until one has no rel="next" link.
+        for pagenum in itertools.count(1):
+            webpage = self._download_webpage('http://vimeo.com/channels/%s/videos/page:%d' % (channel_id, pagenum),
+                                             channel_id, u'Downloading page %s' % pagenum)
+            video_ids.extend(re.findall(r'id="clip_(\d+?)"', webpage))
+            if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:
+                break
+
+        entries = [self.url_result('http://vimeo.com/%s' % video_id, 'Vimeo')
+                   for video_id in video_ids]
+        # channel_id is taken from the URL, so escape it before embedding it in a pattern.
+        channel_title = self._html_search_regex(r'<a href="/channels/%s">(.*?)</a>' % re.escape(channel_id),
+                                                webpage, u'channel title')
+        return {'_type': 'playlist', 'id': channel_id,
+                'title': channel_title, 'entries': entries}
diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py

index bdd3522ebf5a7385a80c54a6e85c808d54346cc4..c4ec1f06ffe3ccce17598aeb319047f0890f9a02 100644 (file)
--- a/youtube_dl/extractor/vine.py
+++ b/youtube_dl/extractor/vine.py
@@ -27,12 +27,6 @@ class VineIE(InfoExtractor):
          video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
              webpage, u'video URL')
  
-        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
-            webpage, u'title')
-
-        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
-            webpage, u'thumbnail', fatal=False)
-
          uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
              webpage, u'uploader', fatal=False, flags=re.DOTALL)
  
@@ -40,7 +34,7 @@ class VineIE(InfoExtractor):
              'id':        video_id,
              'url':       video_url,
              'ext':       'mp4',
-            'title':     video_title,
-            'thumbnail': thumbnail,
+            'title':     self._og_search_title(webpage),
+            'thumbnail': self._og_search_thumbnail(webpage),
              'uploader':  uploader,
          }]
diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py

index 0d1302cd20ab6cdaca814916a5440cf54bd11c92..7d228edac1fb8b291189487482d0c718b3707e08 100644 (file)
--- a/youtube_dl/extractor/wat.py
+++ b/youtube_dl/extractor/wat.py
@@ -17,11 +17,12 @@ class WatIE(InfoExtractor):
      _TEST = {
          u'url': u'http://www.wat.tv/video/world-war-philadelphia-vost-6bv55_2fjr7_.html',
          u'file': u'10631273.mp4',
-        u'md5': u'0a4fe7870f31eaeabb5e25fd8da8414a',
+        u'md5': u'd8b2231e1e333acd12aad94b80937e19',
          u'info_dict': {
              u'title': u'World War Z - Philadelphia VOST',
              u'description': u'La menace est partout. Que se passe-t-il à Philadelphia ?\r\nWORLD WAR Z, avec Brad Pitt, au cinéma le 3 juillet.\r\nhttp://www.worldwarz.fr',
-        }
+        },
+        u'skip': u'Sometimes wat serves the whole file with the --test option',
      }
      
      def download_video_info(self, real_id):
@@ -58,20 +59,8 @@ class WatIE(InfoExtractor):
  
          # Otherwise we can continue and extract just one part, we have to use
          # the short id for getting the video url
-        player_data = compat_urllib_parse.urlencode({'shortVideoId': short_id,
-                                                     'html5': '1'})
-        player_info = self._download_webpage('http://www.wat.tv/player?' + player_data,
-                                             real_id, u'Downloading player info')
-        player = json.loads(player_info)['player']
-        html5_player = self._html_search_regex(r'iframe src="(.*?)"', player,
-                                               'html5 player')
-        player_webpage = self._download_webpage(html5_player, real_id,
-                                                u'Downloading player webpage')
-
-        video_url = self._search_regex(r'urlhtml5 : "(.*?)"', player_webpage,
-                                       'video url')
          info = {'id': real_id,
-                'url': video_url,
+                'url': 'http://wat.tv/get/android5/%s.mp4' % real_id,
                  'ext': 'mp4',
                  'title': first_chapter['title'],
                  'thumbnail': first_chapter['preview'],
diff --git a/youtube_dl/extractor/weibo.py b/youtube_dl/extractor/weibo.py

new file mode 100644 (file)

index 0000000..0757495
--- /dev/null
+++ b/youtube_dl/extractor/weibo.py
@@ -0,0 +1,48 @@
+# coding: utf-8
+
+import re
+import json
+
+from .common import InfoExtractor
+
+class WeiboIE(InfoExtractor):
+    """
+    The videos in Weibo come from different sites, this IE just finds the link
+    to the external video and returns it.
+    """
+    _VALID_URL = r'https?://video\.weibo\.com/v/weishipin/t_(?P<id>.+?)\.htm'
+
+    _TEST = {
+        u'url': u'http://video.weibo.com/v/weishipin/t_zjUw2kZ.htm',
+        u'file': u'98322879.flv',
+        u'info_dict': {
+            u'title': u'魔声耳机最新广告“All Eyes On Us”',
+        },
+        u'note': u'Sina video',
+        u'params': {
+            u'skip_download': True,
+        },
+    }
+
+    # Additional example videos from different sites
+    # Youku: http://video.weibo.com/v/weishipin/t_zQGDWQ8.htm
+    # 56.com: http://video.weibo.com/v/weishipin/t_zQ44HxN.htm
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
+        video_id = mobj.group('id')
+        info_url = 'http://video.weibo.com/?s=v&a=play_list&format=json&mix_video_id=t_%s' % video_id
+        info_page = self._download_webpage(info_url, video_id)
+        info = json.loads(info_page)
+
+        videos_urls = map(lambda v: v['play_page_url'], info['result']['data'])
+        # Prefer Sina videos since they have thumbnails
+        videos_urls = sorted(videos_urls, key=lambda u: u'video.sina.com' in u)
+        player_url = videos_urls[-1]
+        m_sina = re.match(r'https?://video.sina.com.cn/v/b/(\d+)-\d+.html', player_url)
+        if m_sina is not None:
+            self.to_screen('Sina video detected')
+            sina_id = m_sina.group(1)
+            player_url = 'http://you.video.sina.com.cn/swf/quotePlayer.swf?vid=%s' % sina_id
+        return self.url_result(player_url)
+
diff --git a/youtube_dl/extractor/worldstarhiphop.py b/youtube_dl/extractor/worldstarhiphop.py

index 5b9779c05853ab815aa56d710476356c56c8449b..3237596a3ace9796001f8ab78921ca9b6c84d2d1 100644 (file)
--- a/youtube_dl/extractor/worldstarhiphop.py
+++ b/youtube_dl/extractor/worldstarhiphop.py
@@ -21,6 +21,13 @@ class WorldStarHipHopIE(InfoExtractor):
  
          webpage_src = self._download_webpage(url, video_id)
  
+        m_vevo_id = re.search(r'videoId=(.*?)&amp?',
+            webpage_src)
+        
+        if m_vevo_id is not None:
+            self.to_screen(u'Vevo video detected:')
+            return self.url_result('vevo:%s' % m_vevo_id.group(1), ie='Vevo')
+
          video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
              webpage_src, u'video URL')
  
diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py

index 6f022670cb9ef076a002c387e6357d99ec87d402..1265639e821bd873b74aeea08811f8c22e966ba1 100644 (file)
--- a/youtube_dl/extractor/youjizz.py
+++ b/youtube_dl/extractor/youjizz.py
@@ -40,8 +40,20 @@ class YouJizzIE(InfoExtractor):
          webpage = self._download_webpage(embed_page_url, video_id)
  
          # Get the video URL
-        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
-            webpage, u'video URL')
+        m_playlist = re.search(r'so.addVariable\("playlist", ?"(?P<playlist>.+?)"\);', webpage)
+        if m_playlist is not None:
+            playlist_url = m_playlist.group('playlist')
+            playlist_page = self._download_webpage(playlist_url, video_id,
+                                                   u'Downloading playlist page')
+            m_levels = list(re.finditer(r'<level bitrate="(\d+?)" file="(.*?)"', playlist_page))
+            if len(m_levels) == 0:
+                raise ExtractorError(u'Unable to extract video url')
+            videos = [(int(m.group(1)), m.group(2)) for m in m_levels]
+            (_, video_url) = sorted(videos)[0]
+            video_url = video_url.replace('%252F', '%2F')
+        else:
+            video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
+                                           webpage, u'video URL')
  
          info = {'id': video_id,
                  'url': video_url,
diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py

index eb98298019c04334276688a7d9c6a5db8bd90664..996d384784cb827ed4baa3304b61782542ea0767 100644 (file)
--- a/youtube_dl/extractor/youku.py
+++ b/youtube_dl/extractor/youku.py
@@ -13,7 +13,7 @@ from ..utils import (
  
  
  class YoukuIE(InfoExtractor):
-    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
+    _VALID_URL =  r'(?:http://)?(v|player)\.youku\.com/(v_show/id_|player\.php/sid/)(?P<ID>[A-Za-z0-9]+)(\.html|/v.swf)'
      _TEST =   {
          u"url": u"http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html",
          u"file": u"XNDgyMDQ2NTQw_part00.flv",
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py

index afb655c04f8f5495b7872d4ce72fbe7d8a7db583..bc89a14ffc0977eb51e48e8a2ea37a1ec8b33466 100644 (file)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -23,8 +23,114 @@ from ..utils import (
      orderedSet,
  )
  
+class YoutubeBaseInfoExtractor(InfoExtractor):
+    """Provide base functions for Youtube extractors"""
+    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
+    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
+    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
+    _NETRC_MACHINE = 'youtube'
+    # If True it will raise an error if no login info is provided
+    _LOGIN_REQUIRED = False
+
+    def report_lang(self):
+        """Report attempt to set language."""
+        self.to_screen(u'Setting language')
+
+    def _set_language(self):
+        request = compat_urllib_request.Request(self._LANG_URL)
+        try:
+            self.report_lang()
+            compat_urllib_request.urlopen(request).read()
+        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
+            return False
+        return True
+
+    def _login(self):
+        (username, password) = self._get_login_info()
+        # No authentication to be performed
+        if username is None:
+            if self._LOGIN_REQUIRED:
+                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
+            return False
+
+        request = compat_urllib_request.Request(self._LOGIN_URL)
+        try:
+            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
+        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
+            return False
+
+        galx = None
+        dsh = None
+        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
+        if match:
+          galx = match.group(1)
+        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
+        if match:
+          dsh = match.group(1)
+
+        # Log in
+        login_form_strs = {
+                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
+                u'Email': username,
+                u'GALX': galx,
+                u'Passwd': password,
+                u'PersistentCookie': u'yes',
+                u'_utf8': u'霱',
+                u'bgresponse': u'js_disabled',
+                u'checkConnection': u'',
+                u'checkedDomains': u'youtube',
+                u'dnConn': u'',
+                u'dsh': dsh,
+                u'pstMsg': u'0',
+                u'rmShown': u'1',
+                u'secTok': u'',
+                u'signIn': u'Sign in',
+                u'timeStmp': u'',
+                u'service': u'youtube',
+                u'uilel': u'3',
+                u'hl': u'en_US',
+        }
+        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
+        # chokes on unicode
+        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
+        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
+        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
+        try:
+            self.report_login()
+            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
+            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
+                self._downloader.report_warning(u'unable to log in: bad username or password')
+                return False
+        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
+            return False
+        return True
  
-class YoutubeIE(InfoExtractor):
+    def _confirm_age(self):
+        age_form = {
+                'next_url':     '/',
+                'action_confirm':   'Confirm',
+                }
+        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
+        try:
+            self.report_age_confirmation()
+            compat_urllib_request.urlopen(request).read().decode('utf-8')
+        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
+        return True
+
+    def _real_initialize(self):
+        if self._downloader is None:
+            return
+        if not self._set_language():
+            return
+        if not self._login():
+            return
+        self._confirm_age()
+
+class YoutubeIE(YoutubeBaseInfoExtractor):
      IE_DESC = u'YouTube.com'
      _VALID_URL = r"""^
                       (
@@ -45,14 +151,16 @@ class YoutubeIE(InfoExtractor):
                       ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                       (?(1).+)?                                                # if we found the ID, everything can follow
                       $"""
-    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
-    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
-    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
      _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
-    _NETRC_MACHINE = 'youtube'
      # Listed in order of quality
-    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
-    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
+    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13',
+                          '95', '94', '93', '92', '132', '151',
+                          '85', '84', '102', '83', '101', '82', '100',
+                          ]
+    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13',
+                                      '95', '94', '93', '92', '132', '151',
+                                      '85', '102', '84', '101', '83', '100', '82',
+                                      ]
      _video_extensions = {
          '13': '3gp',
          '17': 'mp4',
@@ -64,6 +172,24 @@ class YoutubeIE(InfoExtractor):
          '44': 'webm',
          '45': 'webm',
          '46': 'webm',
+
+        # 3d videos
+        '82': 'mp4',
+        '83': 'mp4',
+        '84': 'mp4',
+        '85': 'mp4',
+        '100': 'webm',
+        '101': 'webm',
+        '102': 'webm',
+        
+        # videos that use m3u8
+        '92': 'mp4',
+        '93': 'mp4',
+        '94': 'mp4',
+        '95': 'mp4',
+        '96': 'mp4',
+        '132': 'mp4',
+        '151': 'mp4',
      }
      _video_dimensions = {
          '5': '240x400',
@@ -80,7 +206,22 @@ class YoutubeIE(InfoExtractor):
          '44': '480x854',
          '45': '720x1280',
          '46': '1080x1920',
+        '82': '360p',
+        '83': '480p',
+        '84': '720p',
+        '85': '1080p',
+        '92': '240p',
+        '93': '360p',
+        '94': '480p',
+        '95': '720p',
+        '96': '1080p',
+        '100': '360p',
+        '101': '480p',
+        '102': '720p',        
+        '132': '240p',
+        '151': '72p',
      }
+    _3d_itags = ['85', '84', '102', '83', '101', '82', '100']
      IE_NAME = u'youtube'
      _TESTS = [
          {
@@ -130,6 +271,21 @@ class YoutubeIE(InfoExtractor):
                  u"uploader_id": u"justintimberlakeVEVO"
              }
          },
+        {
+            u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE',
+            u'file': u'TGi3HqYrWHE.mp4',
+            u'note': u'm3u8 video',
+            u'info_dict': {
+                u'title': u'Triathlon - Men - London 2012 Olympic Games',
+                u'description': u'- Men -  TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
+                u'uploader': u'olympic',
+                u'upload_date': u'20120807',
+                u'uploader_id': u'olympic',
+            },
+            u'params': {
+                u'skip_download': True,
+            },
+        },
      ]
  
  
@@ -139,10 +295,6 @@ class YoutubeIE(InfoExtractor):
          if YoutubePlaylistIE.suitable(url) or YoutubeSubscriptionsIE.suitable(url): return False
          return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
  
-    def report_lang(self):
-        """Report attempt to set language."""
-        self.to_screen(u'Setting language')
-
      def report_video_webpage_download(self, video_id):
          """Report attempt to download video webpage."""
          self.to_screen(u'%s: Downloading video webpage' % video_id)
@@ -179,24 +331,42 @@ class YoutubeIE(InfoExtractor):
      def _decrypt_signature(self, s):
          """Turn the encrypted s field into a working signature"""
  
-        if len(s) == 88:
+        if len(s) == 92:
+            return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
+        elif len(s) == 90:
+            return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
+        elif len(s) == 88:
              return s[48] + s[81:67:-1] + s[82] + s[66:62:-1] + s[85] + s[61:48:-1] + s[67] + s[47:12:-1] + s[3] + s[11:3:-1] + s[2] + s[12]
          elif len(s) == 87:
-            return s[62] + s[82:62:-1] + s[83] + s[61:52:-1] + s[0] + s[51:2:-1]
+            return s[4:23] + s[86] + s[24:85]
          elif len(s) == 86:
-            return s[2:63] + s[82] + s[64:82] + s[63]
+            return s[83:85] + s[26] + s[79:46:-1] + s[85] + s[45:36:-1] + s[30] + s[35:30:-1] + s[46] + s[29:26:-1] + s[82] + s[25:1:-1]
          elif len(s) == 85:
-            return s[76] + s[82:76:-1] + s[83] + s[75:60:-1] + s[0] + s[59:50:-1] + s[1] + s[49:2:-1]
+            return s[2:8] + s[0] + s[9:21] + s[65] + s[22:65] + s[84] + s[66:82] + s[21]
          elif len(s) == 84:
              return s[83:36:-1] + s[2] + s[35:26:-1] + s[3] + s[25:3:-1] + s[26]
          elif len(s) == 83:
-            return s[:81]
+            return s[:15] + s[80] + s[16:80] + s[15]
          elif len(s) == 82:
              return s[36] + s[79:67:-1] + s[81] + s[66:40:-1] + s[33] + s[39:36:-1] + s[40] + s[35] + s[0] + s[67] + s[32:0:-1] + s[34]
+        elif len(s) == 81:
+            return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
+        elif len(s) == 79:
+            return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
  
          else:
              raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
  
+    def _decrypt_signature_age_gate(self, s):
+        # The videos with age protection use another player, so the algorithms
+        # can be different.
+        if len(s) == 86:
+            return s[2:63] + s[82] + s[64:82] + s[63]
+        else:
+            # Fall back to the other algorithms
+            return self._decrypt_signature(s)
+
+
      def _get_available_subtitles(self, video_id):
          self.report_video_subtitles_download(video_id)
          request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
@@ -298,92 +468,9 @@ class YoutubeIE(InfoExtractor):
      def _print_formats(self, formats):
          print('Available formats:')
          for x in formats:
-            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
-
-    def _real_initialize(self):
-        if self._downloader is None:
-            return
-
-        # Set language
-        request = compat_urllib_request.Request(self._LANG_URL)
-        try:
-            self.report_lang()
-            compat_urllib_request.urlopen(request).read()
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
-            return
-
-        (username, password) = self._get_login_info()
-
-        # No authentication to be performed
-        if username is None:
-            return
-
-        request = compat_urllib_request.Request(self._LOGIN_URL)
-        try:
-            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
-            return
-
-        galx = None
-        dsh = None
-        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
-        if match:
-          galx = match.group(1)
-
-        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
-        if match:
-          dsh = match.group(1)
-
-        # Log in
-        login_form_strs = {
-                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
-                u'Email': username,
-                u'GALX': galx,
-                u'Passwd': password,
-                u'PersistentCookie': u'yes',
-                u'_utf8': u'霱',
-                u'bgresponse': u'js_disabled',
-                u'checkConnection': u'',
-                u'checkedDomains': u'youtube',
-                u'dnConn': u'',
-                u'dsh': dsh,
-                u'pstMsg': u'0',
-                u'rmShown': u'1',
-                u'secTok': u'',
-                u'signIn': u'Sign in',
-                u'timeStmp': u'',
-                u'service': u'youtube',
-                u'uilel': u'3',
-                u'hl': u'en_US',
-        }
-        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
-        # chokes on unicode
-        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
-        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
-        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
-        try:
-            self.report_login()
-            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
-            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
-                self._downloader.report_warning(u'unable to log in: bad username or password')
-                return
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
-            return
-
-        # Confirm age
-        age_form = {
-                'next_url':     '/',
-                'action_confirm':   'Confirm',
-                }
-        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
-        try:
-            self.report_age_confirmation()
-            compat_urllib_request.urlopen(request).read().decode('utf-8')
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
+            print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
+                                        self._video_dimensions.get(x, '???'),
+                                        ' (3D)' if x in self._3d_itags else ''))
  
      def _extract_id(self, url):
          mobj = re.match(self._VALID_URL, url, re.VERBOSE)
@@ -392,6 +479,57 @@ class YoutubeIE(InfoExtractor):
          video_id = mobj.group(2)
          return video_id
  
+    def _get_video_url_list(self, url_map):
+        """
+        Transform a dictionary in the format {itag:url} to a list of (itag, url)
+        with the requested formats.
+        """
+        req_format = self._downloader.params.get('format', None)
+        format_limit = self._downloader.params.get('format_limit', None)
+        available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
+        if format_limit is not None and format_limit in available_formats:
+            format_list = available_formats[available_formats.index(format_limit):]
+        else:
+            format_list = available_formats
+        existing_formats = [x for x in format_list if x in url_map]
+        if len(existing_formats) == 0:
+            raise ExtractorError(u'no known formats available for video')
+        if self._downloader.params.get('listformats', None):
+            self._print_formats(existing_formats)
+            return
+        if req_format is None or req_format == 'best':
+            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
+        elif req_format == 'worst':
+            video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
+        elif req_format in ('-1', 'all'):
+            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
+        else:
+            # Specific formats. We pick the first in a slash-delimited sequence.
+            # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
+            req_formats = req_format.split('/')
+            video_url_list = None
+            for rf in req_formats:
+                if rf in url_map:
+                    video_url_list = [(rf, url_map[rf])]
+                    break
+            if video_url_list is None:
+                raise ExtractorError(u'requested format not available')
+        return video_url_list
+
+    def _extract_from_m3u8(self, manifest_url, video_id):
+        url_map = {}
+        def _get_urls(_manifest):
+            lines = _manifest.split('\n')
+            urls = filter(lambda l: l and not l.startswith('#'),
+                            lines)
+            return urls
+        manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
+        formats_urls = _get_urls(manifest)
+        for format_url in formats_urls:
+            itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
+            url_map[itag] = format_url
+        return url_map
+
      def _real_extract(self, url):
          if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
              self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like  youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply  youtube-dl BaW_jenozKc  ).')
@@ -546,7 +684,6 @@ class YoutubeIE(InfoExtractor):
              video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
  
          # Decide which formats to download
-        req_format = self._downloader.params.get('format', None)
  
          try:
              mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
@@ -567,6 +704,8 @@ class YoutubeIE(InfoExtractor):
              self.report_rtmp_download()
              video_url_list = [(None, video_info['conn'][0])]
          elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
+            if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
+                raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
              url_map = {}
              for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
                  url_data = compat_parse_qs(url_data_str)
@@ -585,43 +724,28 @@ class YoutubeIE(InfoExtractor):
                              else:
                                  player = u'html5 player %s' % self._search_regex(r'html5player-(.+?)\.js', video_webpage,
                                      'html5 player', fatal=False)
-                            self.to_screen('encrypted signature length %d (%d.%d), itag %s, %s' %
-                                (len(s), len(s.split('.')[0]), len(s.split('.')[1]), url_data['itag'][0], player))
-                        signature = self._decrypt_signature(url_data['s'][0])
+                            parts_sizes = u'.'.join(compat_str(len(part)) for part in s.split('.'))
+                            self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
+                                (len(s), parts_sizes, url_data['itag'][0], player))
+                        encrypted_sig = url_data['s'][0]
+                        if age_gate:
+                            signature = self._decrypt_signature_age_gate(encrypted_sig)
+                        else:
+                            signature = self._decrypt_signature(encrypted_sig)
                          url += '&signature=' + signature
                      if 'ratebypass' not in url:
                          url += '&ratebypass=yes'
                      url_map[url_data['itag'][0]] = url
-
-            format_limit = self._downloader.params.get('format_limit', None)
-            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
-            if format_limit is not None and format_limit in available_formats:
-                format_list = available_formats[available_formats.index(format_limit):]
-            else:
-                format_list = available_formats
-            existing_formats = [x for x in format_list if x in url_map]
-            if len(existing_formats) == 0:
-                raise ExtractorError(u'no known formats available for video')
-            if self._downloader.params.get('listformats', None):
-                self._print_formats(existing_formats)
+            video_url_list = self._get_video_url_list(url_map)
+            if not video_url_list:
                  return
-            if req_format is None or req_format == 'best':
-                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
-            elif req_format == 'worst':
-                video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
-            elif req_format in ('-1', 'all'):
-                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
-            else:
-                # Specific formats. We pick the first in a slash-delimeted sequence.
-                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
-                req_formats = req_format.split('/')
-                video_url_list = None
-                for rf in req_formats:
-                    if rf in url_map:
-                        video_url_list = [(rf, url_map[rf])]
-                        break
-                if video_url_list is None:
-                    raise ExtractorError(u'requested format not available')
+        elif video_info.get('hlsvp'):
+            manifest_url = video_info['hlsvp'][0]
+            url_map = self._extract_from_m3u8(manifest_url, video_id)
+            video_url_list = self._get_video_url_list(url_map)
+            if not video_url_list:
+                return
+
          else:
              raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
  
@@ -630,8 +754,9 @@ class YoutubeIE(InfoExtractor):
              # Extension
              video_extension = self._video_extensions.get(format_param, 'flv')
  
-            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
-                                              self._video_dimensions.get(format_param, '???'))
+            video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
+                                              self._video_dimensions.get(format_param, '???'),
+                                              ' (3D)' if format_param in self._3d_itags else '')
  
              results.append({
                  'id':       video_id,
@@ -661,10 +786,10 @@ class YoutubePlaylistIE(InfoExtractor):
                             \? (?:.*?&)*? (?:p|a|list)=
                          |  p/
                          )
-                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
+                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                          .*
                       |
-                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
+                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
                       )"""
      _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
      _MAX_RESULTS = 50
@@ -683,11 +808,14 @@ class YoutubePlaylistIE(InfoExtractor):
  
          # Download playlist videos from API
          playlist_id = mobj.group(1) or mobj.group(2)
-        page_num = 1
          videos = []
  
-        while True:
-            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
+        for page_num in itertools.count(1):
+            start_index = self._MAX_RESULTS * (page_num - 1) + 1
+            if start_index >= 1000:
+                self._downloader.report_warning(u'Max number of results reached')
+                break
+            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
              page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
  
              try:
@@ -707,10 +835,6 @@ class YoutubePlaylistIE(InfoExtractor):
                  if 'media$group' in entry and 'media$player' in entry['media$group']:
                      videos.append((index, entry['media$group']['media$player']['url']))
  
-            if len(response['feed']['entry']) < self._MAX_RESULTS:
-                break
-            page_num += 1
-
          videos = [v[1] for v in sorted(videos)]
  
          url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
@@ -722,7 +846,7 @@ class YoutubeChannelIE(InfoExtractor):
      _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
      _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
      _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
-    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
+    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
      IE_NAME = u'youtube:channel'
  
      def extract_videos_from_page(self, page):
@@ -753,9 +877,7 @@ class YoutubeChannelIE(InfoExtractor):
  
          # Download any subsequent channel pages using the json-based channel_ajax query
          if self._MORE_PAGES_INDICATOR in page:
-            while True:
-                pagenum = pagenum + 1
-
+            for pagenum in itertools.count(1):
                  url = self._MORE_PAGES_URL % (pagenum, channel_id)
                  page = self._download_webpage(url, channel_id,
                                                u'Downloading page #%s' % pagenum)
@@ -798,9 +920,8 @@ class YoutubeUserIE(InfoExtractor):
          # all of them.
  
          video_ids = []
-        pagenum = 0
  
-        while True:
+        for pagenum in itertools.count(0):
              start_index = pagenum * self._GDATA_PAGE_SIZE + 1
  
              gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
@@ -825,8 +946,6 @@ class YoutubeUserIE(InfoExtractor):
              if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                  break
  
-            pagenum += 1
-
          urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
          url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
          return [self.playlist_result(url_results, playlist_title = username)]
@@ -889,38 +1008,75 @@ class YoutubeShowIE(InfoExtractor):
          return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist') for season in m_seasons]
  
  
-class YoutubeSubscriptionsIE(YoutubeIE):
-    """It's a subclass of YoutubeIE because we need to login"""
-    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
-    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
-    IE_NAME = u'youtube:subscriptions'
-    _FEED_TEMPLATE = 'http://www.youtube.com/feed_ajax?action_load_system_feed=1&feed_name=subscriptions&paging=%s'
+class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
+    """
+    Base class for extractors that fetch info from
+    http://www.youtube.com/feed_ajax
+    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
+    """
+    _LOGIN_REQUIRED = True
      _PAGING_STEP = 30
+    # use action_load_personal_feed instead of action_load_system_feed
+    _PERSONAL_FEED = False
  
-    # Overwrite YoutubeIE properties we don't want
-    _TESTS = []
-    @classmethod
-    def suitable(cls, url):
-        return re.match(cls._VALID_URL, url) is not None
+    @property
+    def _FEED_TEMPLATE(self):
+        action = 'action_load_system_feed'
+        if self._PERSONAL_FEED:
+            action = 'action_load_personal_feed'
+        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
+
+    @property
+    def IE_NAME(self):
+        return u'youtube:%s' % self._FEED_NAME
  
      def _real_initialize(self):
-        (username, password) = self._get_login_info()
-        if username is None:
-            raise ExtractorError(u'No login info available, needed for downloading the Youtube subscriptions.', expected=True)
-        super(YoutubeSubscriptionsIE, self)._real_initialize()
+        self._login()
  
      def _real_extract(self, url):
          feed_entries = []
          # The step argument is available only in 2.7 or higher
          for i in itertools.count(0):
              paging = i*self._PAGING_STEP
-            info = self._download_webpage(self._FEED_TEMPLATE % paging, 'feed',
+            info = self._download_webpage(self._FEED_TEMPLATE % paging,
+                                          u'%s feed' % self._FEED_NAME,
                                            u'Downloading page %s' % i)
              info = json.loads(info)
              feed_html = info['feed_html']
-            m_ids = re.finditer(r'"/watch\?v=(.*?)"', feed_html)
+            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
              ids = orderedSet(m.group(1) for m in m_ids)
              feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
              if info['paging'] is None:
                  break
-        return self.playlist_result(feed_entries, playlist_title='Youtube Subscriptions')
+        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
+
+class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
+    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
+    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
+    _FEED_NAME = 'subscriptions'
+    _PLAYLIST_TITLE = u'Youtube Subscriptions'
+
+class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
+    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
+    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
+    _FEED_NAME = 'recommended'
+    _PLAYLIST_TITLE = u'Youtube Recommended videos'
+
+class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
+    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
+    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
+    _FEED_NAME = 'watch_later'
+    _PLAYLIST_TITLE = u'Youtube Watch Later'
+    _PAGING_STEP = 100
+    _PERSONAL_FEED = True
+
+class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
+    IE_NAME = u'youtube:favorites'
+    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
+    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:o?rites)?'
+    _LOGIN_REQUIRED = True
+
+    def _real_extract(self, url):
+        webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
+        playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id')
+        return self.url_result(playlist_id, 'YoutubePlaylist')
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py

index b9bff5fde87d91a5956e978c98880f05034ac6ab..cf2ea654e892c5f8882a4ecaae19bdcb5afbbbfd 100644 (file)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -35,6 +35,11 @@ try:
  except ImportError: # Python 2
      from urlparse import urlparse as compat_urllib_parse_urlparse
  
+try:
+    import urllib.parse as compat_urlparse
+except ImportError: # Python 2
+    import urlparse as compat_urlparse
+
  try:
      import http.cookiejar as compat_cookiejar
  except ImportError: # Python 2
@@ -198,6 +203,20 @@ else:
          with open(fn, 'w', encoding='utf-8') as f:
              json.dump(obj, f)
  
+if sys.version_info >= (2,7):
+    def find_xpath_attr(node, xpath, key, val):
+        """ Find the xpath xpath[@key=val] """
+        assert re.match(r'^[a-zA-Z]+$', key)
+        assert re.match(r'^[a-zA-Z@]*$', val)
+        expr = xpath + u"[@%s='%s']" % (key, val)
+        return node.find(expr)
+else:
+    def find_xpath_attr(node, xpath, key, val):
+        for f in node.findall(xpath):
+            if f.attrib.get(key) == val:
+                return f
+        return None
+
  def htmlentity_transform(matchobj):
      """Transforms an HTML entity to a character.
  
@@ -631,12 +650,12 @@ def unified_strdate(date_str):
              pass
      return upload_date
  
-def determine_ext(url):
+def determine_ext(url, default_ext=u'unknown_video'):
      guess = url.partition(u'?')[0].rpartition(u'.')[2]
      if re.match(r'^[A-Za-z0-9]+$', guess):
          return guess
      else:
-        return u'unknown_video'
+        return default_ext
  
  def date_from_str(date_str):
      """
diff --git a/youtube_dl/version.py b/youtube_dl/version.py

index 2f20826c24a316f575e004ae2f02adf93e942b9a..4fc85fac1f6bf9c78142e94d25c7910ed802def0 100644 (file)
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,2 +1,2 @@
  
-__version__ = '2013.07.10'
+__version__ = '2013.08.02'
author	Rogério Brito <rbrito@ime.usp.br>
	Tue, 6 Aug 2013 20:36:01 +0000 (17:36 -0300)
committer	Rogério Brito <rbrito@ime.usp.br>
	Tue, 6 Aug 2013 20:36:01 +0000 (17:36 -0300)
README.md		patch \| blob \| history
README.txt		patch \| blob \| history
devscripts/youtube_genalgo.py		patch \| blob \| history
test/test_playlists.py	[new file with mode: 0644]	patch \| blob
test/test_utils.py		patch \| blob \| history
test/test_youtube_sig.py	[changed mode: 0755->0644]	patch \| blob \| history
youtube-dl		patch \| blob \| history
youtube-dl.1		patch \| blob \| history
youtube_dl/FileDownloader.py		patch \| blob \| history
youtube_dl/PostProcessor.py		patch \| blob \| history
youtube_dl/YoutubeDL.py		patch \| blob \| history
youtube_dl/__init__.py		patch \| blob \| history
youtube_dl/extractor/__init__.py		patch \| blob \| history
youtube_dl/extractor/archiveorg.py		patch \| blob \| history
youtube_dl/extractor/arte.py		patch \| blob \| history
youtube_dl/extractor/breakcom.py		patch \| blob \| history
youtube_dl/extractor/brightcove.py		patch \| blob \| history
youtube_dl/extractor/c56.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/canalplus.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/collegehumor.py		patch \| blob \| history
youtube_dl/extractor/comedycentral.py		patch \| blob \| history
youtube_dl/extractor/common.py		patch \| blob \| history
youtube_dl/extractor/condenast.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/criterion.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/cspan.py		patch \| blob \| history
youtube_dl/extractor/dailymotion.py		patch \| blob \| history
youtube_dl/extractor/dreisat.py		patch \| blob \| history
youtube_dl/extractor/ehow.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/escapist.py		patch \| blob \| history
youtube_dl/extractor/exfm.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/flickr.py		patch \| blob \| history
youtube_dl/extractor/freesound.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/funnyordie.py		patch \| blob \| history
youtube_dl/extractor/gametrailers.py		patch \| blob \| history
youtube_dl/extractor/generic.py		patch \| blob \| history
youtube_dl/extractor/hotnewhiphop.py		patch \| blob \| history
youtube_dl/extractor/ign.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/ina.py		patch \| blob \| history
youtube_dl/extractor/instagram.py		patch \| blob \| history
youtube_dl/extractor/kankan.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/keek.py		patch \| blob \| history
youtube_dl/extractor/liveleak.py		patch \| blob \| history
youtube_dl/extractor/livestream.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/metacafe.py		patch \| blob \| history
youtube_dl/extractor/mtv.py		patch \| blob \| history
youtube_dl/extractor/nba.py		patch \| blob \| history
youtube_dl/extractor/roxwel.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/sina.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/soundcloud.py		patch \| blob \| history
youtube_dl/extractor/statigram.py		patch \| blob \| history
youtube_dl/extractor/steam.py		patch \| blob \| history
youtube_dl/extractor/teamcoco.py		patch \| blob \| history
youtube_dl/extractor/ted.py		patch \| blob \| history
youtube_dl/extractor/tf1.py		patch \| blob \| history
youtube_dl/extractor/thisav.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/traileraddict.py		patch \| blob \| history
youtube_dl/extractor/tutv.py		patch \| blob \| history
youtube_dl/extractor/vevo.py		patch \| blob \| history
youtube_dl/extractor/vimeo.py		patch \| blob \| history
youtube_dl/extractor/vine.py		patch \| blob \| history
youtube_dl/extractor/wat.py		patch \| blob \| history
youtube_dl/extractor/weibo.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/worldstarhiphop.py		patch \| blob \| history
youtube_dl/extractor/youjizz.py		patch \| blob \| history
youtube_dl/extractor/youku.py		patch \| blob \| history
youtube_dl/extractor/youtube.py		patch \| blob \| history
youtube_dl/utils.py		patch \| blob \| history
youtube_dl/version.py		patch \| blob \| history