Imported Upstream version 2013.12.04

author Rogério Brito <rbrito@ime.usp.br>
Thu, 5 Dec 2013 03:41:08 +0000 (01:41 -0200)

committer Rogério Brito <rbrito@ime.usp.br>
Thu, 5 Dec 2013 03:41:08 +0000 (01:41 -0200)

diff --git a/README.md b/README.md

index 6632e5865585fa45eed03b25121b4e4528628fbc..029c418d16e332c73942bf6b60ac6470d8b8429f 100644 (file)
--- a/README.md
+++ b/README.md
@@ -30,7 +30,8 @@ which means you can modify it, redistribute it or use it however you like.
      --list-extractors          List all supported extractors and the URLs they
                                 would handle
      --extractor-descriptions   Output descriptions of all supported extractors
-    --proxy URL                Use the specified HTTP/HTTPS proxy
+    --proxy URL                Use the specified HTTP/HTTPS proxy. Pass in an
+                               empty string (--proxy "") for direct connection
      --no-check-certificate     Suppress HTTPS certificate validation.
      --cache-dir DIR            Location in the filesystem where youtube-dl can
                                 store downloaded information permanently. By
@@ -55,8 +56,9 @@ which means you can modify it, redistribute it or use it however you like.
      --dateafter DATE           download only videos uploaded after this date
      --no-playlist              download only the currently playing video
      --age-limit YEARS          download only videos suitable for the given age
-    --download-archive FILE    Download only videos not present in the archive
-                               file. Record all downloaded videos in it.
+    --download-archive FILE    Download only videos not listed in the archive
+                               file. Record the IDs of all downloaded videos in
+                               it.
  
  ## Download Options:
      -r, --rate-limit LIMIT     maximum download rate in bytes per second (e.g.
@@ -123,17 +125,18 @@ which means you can modify it, redistribute it or use it however you like.
      --get-description          simulate, quiet but print video description
      --get-filename             simulate, quiet but print output filename
      --get-format               simulate, quiet but print output format
+    -j, --dump-json            simulate, quiet but print JSON information
      --newline                  output progress bar as new lines
      --no-progress              do not print progress bar
      --console-title            display progress in console titlebar
      -v, --verbose              print various debugging information
      --dump-intermediate-pages  print downloaded pages to debug problems(very
                                 verbose)
-    --write-pages              Write downloaded pages to files in the current
-                               directory
+    --write-pages              Write downloaded intermediary pages to files in
+                               the current directory to debug problems
  
  ## Video Format Options:
-    -f, --format FORMAT        video format code, specifiy the order of
+    -f, --format FORMAT        video format code, specify the order of
                                 preference using slashes: "-f 22/17/18". "-f mp4"
                                 and "-f flv" are also supported
      --all-formats              download all available video formats
@@ -181,7 +184,7 @@ which means you can modify it, redistribute it or use it however you like.
  
  # CONFIGURATION
  
-You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl.conf`.
+You can configure youtube-dl by placing default arguments (such as `--extract-audio --no-mtime` to always extract the audio and not copy the mtime) into `/etc/youtube-dl.conf` and/or `~/.config/youtube-dl.conf`. On Windows, the configuration file locations are `%APPDATA%\youtube-dl\config.txt` and `C:\Users\<Yourname>\youtube-dl.conf`.
  
  # OUTPUT TEMPLATE
  
diff --git a/README.txt b/README.txt

index 4b400fd9fe5ad8bba8e112190f34ee7f3d050a93..fc9f17e558e28b26a7459ddd519fb1cf84185be8 100644 (file)
--- a/README.txt
+++ b/README.txt
@@ -37,7 +37,8 @@ OPTIONS
      --list-extractors          List all supported extractors and the URLs they
                                 would handle
      --extractor-descriptions   Output descriptions of all supported extractors
-    --proxy URL                Use the specified HTTP/HTTPS proxy
+    --proxy URL                Use the specified HTTP/HTTPS proxy. Pass in an
+                               empty string (--proxy "") for direct connection
      --no-check-certificate     Suppress HTTPS certificate validation.
      --cache-dir DIR            Location in the filesystem where youtube-dl can
                                 store downloaded information permanently. By
@@ -64,8 +65,9 @@ Video Selection:
      --dateafter DATE           download only videos uploaded after this date
      --no-playlist              download only the currently playing video
      --age-limit YEARS          download only videos suitable for the given age
-    --download-archive FILE    Download only videos not present in the archive
-                               file. Record all downloaded videos in it.
+    --download-archive FILE    Download only videos not listed in the archive
+                               file. Record the IDs of all downloaded videos in
+                               it.
  
  Download Options:
  -----------------
@@ -138,19 +140,20 @@ Verbosity / Simulation Options:
      --get-description          simulate, quiet but print video description
      --get-filename             simulate, quiet but print output filename
      --get-format               simulate, quiet but print output format
+    -j, --dump-json            simulate, quiet but print JSON information
      --newline                  output progress bar as new lines
      --no-progress              do not print progress bar
      --console-title            display progress in console titlebar
      -v, --verbose              print various debugging information
      --dump-intermediate-pages  print downloaded pages to debug problems(very
                                 verbose)
-    --write-pages              Write downloaded pages to files in the current
-                               directory
+    --write-pages              Write downloaded intermediary pages to files in
+                               the current directory to debug problems
  
  Video Format Options:
  ---------------------
  
-    -f, --format FORMAT        video format code, specifiy the order of
+    -f, --format FORMAT        video format code, specify the order of
                                 preference using slashes: "-f 22/17/18". "-f mp4"
                                 and "-f flv" are also supported
      --all-formats              download all available video formats
@@ -207,7 +210,9 @@ CONFIGURATION
  
  You can configure youtube-dl by placing default arguments (such as
  --extract-audio --no-mtime to always extract the audio and not copy the
-mtime) into /etc/youtube-dl.conf and/or ~/.config/youtube-dl.conf.
+mtime) into /etc/youtube-dl.conf and/or ~/.config/youtube-dl.conf. On
+Windows, the configuration file locations are
+%APPDATA%\youtube-dl\config.txt and C:\Users\<Yourname>\youtube-dl.conf.
  
  OUTPUT TEMPLATE
  ===============
diff --git a/devscripts/bash-completion.in b/devscripts/bash-completion.in

index ce893fcbe1a681e535452c35f5b833eea54b2d95..3af87a3783a83cddf364e5f39b2bffd10b480950 100644 (file)
--- a/devscripts/bash-completion.in
+++ b/devscripts/bash-completion.in
@@ -1,10 +1,21 @@
  __youtube_dl()
  {
-    local cur prev opts
+    local cur prev opts fileopts diropts keywords
      COMPREPLY=()
      cur="${COMP_WORDS[COMP_CWORD]}"
+    prev="${COMP_WORDS[COMP_CWORD-1]}"
      opts="{{flags}}"
-    keywords=":ytfavorites :ytrecommended :ytsubscriptions :ytwatchlater"
+    keywords=":ytfavorites :ytrecommended :ytsubscriptions :ytwatchlater :ythistory"
+    fileopts="-a|--batch-file|--download-archive|--cookies"
+    diropts="--cache-dir"
+
+    if [[ ${prev} =~ ${fileopts} ]]; then
+        COMPREPLY=( $(compgen -f -- ${cur}) )
+        return 0
+    elif [[ ${prev} =~ ${diropts} ]]; then
+        COMPREPLY=( $(compgen -d -- ${cur}) )
+        return 0
+    fi
  
      if [[ ${cur} =~ : ]]; then
          COMPREPLY=( $(compgen -W "${keywords}" -- ${cur}) )
diff --git a/setup.py b/setup.py

index aa7cfca0862b1f4ba2cfd220fd570ca63bcfda7e..8e24fe67918eeefa2f3f8b445ccfb480b8c841a8 100644 (file)
--- a/setup.py
+++ b/setup.py
@@ -48,7 +48,7 @@ else:
          'data_files': [  # Installing system-wide would require sudo...
              ('etc/bash_completion.d', ['youtube-dl.bash-completion']),
              ('share/doc/youtube_dl', ['README.txt']),
-            ('share/man/man1/', ['youtube-dl.1'])
+            ('share/man/man1', ['youtube-dl.1'])
          ]
      }
      if setuptools_available:
diff --git a/test/helper.py b/test/helper.py

index d7bf7a82802e58f0a80d788de83146d3a9d3fadf..b1f421ac58331bad23328502f42a0e1316df853d 100644 (file)
--- a/test/helper.py
+++ b/test/helper.py
@@ -12,10 +12,6 @@ from youtube_dl import YoutubeDL
  from youtube_dl.utils import preferredencoding
  
  
-def global_setup():
-    youtube_dl._setup_opener(timeout=10)
-
-
  def get_params(override=None):
      PARAMETERS_FILE = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                     "parameters.json")
diff --git a/test/parameters.json b/test/parameters.json

index f042880edbf0bcab661900b86809ade8ec7697af..487a46d56670c1ded91cd71ed35055e54232187b 100644 (file)
--- a/test/parameters.json
+++ b/test/parameters.json
@@ -39,5 +39,6 @@
      "writeinfojson": true, 
      "writesubtitles": false,
      "allsubtitles": false,
-    "listssubtitles": false
+    "listssubtitles": false,
+    "socket_timeout": 20
  }
diff --git a/test/test_age_restriction.py b/test/test_age_restriction.py

index d500c6edceb6018510b9226d925d9f407b72fcbd..c9cdb96cb30578d58724ddadb4328ad790316a39 100644 (file)
--- a/test/test_age_restriction.py
+++ b/test/test_age_restriction.py
@@ -6,8 +6,7 @@ import sys
  import unittest
  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  
-from test.helper import global_setup, try_rm
-global_setup()
+from test.helper import try_rm
  
  
  from youtube_dl import YoutubeDL
@@ -24,7 +23,7 @@ def _download_restricted(url, filename, age):
      }
      ydl = YoutubeDL(params)
      ydl.add_default_info_extractors()
-    json_filename = filename + '.info.json'
+    json_filename = os.path.splitext(filename)[0] + '.info.json'
      try_rm(json_filename)
      ydl.download([url])
      res = os.path.exists(json_filename)
diff --git a/test/test_all_urls.py b/test/test_all_urls.py

index 56e5f80e1f6ddb17fef3ee5c499c238996c12051..6b9764c67e98ba47a63b227d824bb97b82757b53 100644 (file)
--- a/test/test_all_urls.py
+++ b/test/test_all_urls.py
@@ -100,10 +100,15 @@ class TestAllURLsMatching(unittest.TestCase):
      def test_keywords(self):
          self.assertMatch(':ytsubs', ['youtube:subscriptions'])
          self.assertMatch(':ytsubscriptions', ['youtube:subscriptions'])
-        self.assertMatch(':thedailyshow', ['ComedyCentral'])
-        self.assertMatch(':tds', ['ComedyCentral'])
-        self.assertMatch(':colbertreport', ['ComedyCentral'])
-        self.assertMatch(':cr', ['ComedyCentral'])
+        self.assertMatch(':ythistory', ['youtube:history'])
+        self.assertMatch(':thedailyshow', ['ComedyCentralShows'])
+        self.assertMatch(':tds', ['ComedyCentralShows'])
+        self.assertMatch(':colbertreport', ['ComedyCentralShows'])
+        self.assertMatch(':cr', ['ComedyCentralShows'])
+
+    def test_vimeo_matching(self):
+        self.assertMatch('http://vimeo.com/channels/tributes', ['vimeo:channel'])
+        self.assertMatch('http://vimeo.com/user7108434', ['vimeo:user'])
  
  
  if __name__ == '__main__':
diff --git a/test/test_download.py b/test/test_download.py

index 16f2008094fca751f69ee01a38f16a6fa0c3fb53..dd5818dba91c166936e45f1c7d8779c752fa3b86 100644 (file)
--- a/test/test_download.py
+++ b/test/test_download.py
@@ -9,12 +9,10 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  from test.helper import (
      get_params,
      get_testcases,
-    global_setup,
      try_rm,
      md5,
      report_warning
  )
-global_setup()
  
  
  import hashlib
@@ -103,7 +101,7 @@ def generator(test_case):
                  tc_filename = get_tc_filename(tc)
                  try_rm(tc_filename)
                  try_rm(tc_filename + '.part')
-                try_rm(tc_filename + '.info.json')
+                try_rm(os.path.splitext(tc_filename)[0] + '.info.json')
          try_rm_tcs_files()
          try:
              try_num = 1
@@ -130,11 +128,12 @@ def generator(test_case):
                  if not test_case.get('params', {}).get('skip_download', False):
                      self.assertTrue(os.path.exists(tc_filename), msg='Missing file ' + tc_filename)
                      self.assertTrue(tc_filename in finished_hook_called)
-                self.assertTrue(os.path.exists(tc_filename + '.info.json'))
+                info_json_fn = os.path.splitext(tc_filename)[0] + '.info.json'
+                self.assertTrue(os.path.exists(info_json_fn))
                  if 'md5' in tc:
                      md5_for_file = _file_md5(tc_filename)
                      self.assertEqual(md5_for_file, tc['md5'])
-                with io.open(tc_filename + '.info.json', encoding='utf-8') as infof:
+                with io.open(info_json_fn, encoding='utf-8') as infof:
                      info_dict = json.load(infof)
                  for (info_field, expected) in tc.get('info_dict', {}).items():
                      if isinstance(expected, compat_str) and expected.startswith('md5:'):
diff --git a/test/test_playlists.py b/test/test_playlists.py

index 706b6bdca1399284263106b755fdf9278c5d17d5..00c950109111c709be0228f351220532c638467f 100644 (file)
--- a/test/test_playlists.py
+++ b/test/test_playlists.py
@@ -8,20 +8,23 @@ import sys
  import unittest
  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  
-from test.helper import FakeYDL, global_setup
-global_setup()
+from test.helper import FakeYDL
  
  
  from youtube_dl.extractor import (
      DailymotionPlaylistIE,
      DailymotionUserIE,
      VimeoChannelIE,
+    VimeoUserIE,
      UstreamChannelIE,
      SoundcloudSetIE,
      SoundcloudUserIE,
      LivestreamIE,
      NHLVideocenterIE,
      BambuserChannelIE,
+    BandcampAlbumIE,
+    SmotriCommunityIE,
+    SmotriUserIE
  )
  
  
@@ -54,6 +57,14 @@ class TestPlaylists(unittest.TestCase):
          self.assertEqual(result['title'], u'Vimeo Tributes')
          self.assertTrue(len(result['entries']) > 24)
  
+    def test_vimeo_user(self):
+        dl = FakeYDL()
+        ie = VimeoUserIE(dl)
+        result = ie.extract('http://vimeo.com/nkistudio/videos')
+        self.assertIsPlaylist(result)
+        self.assertEqual(result['title'], u'Nki')
+        self.assertTrue(len(result['entries']) > 65)
+
      def test_ustream_channel(self):
          dl = FakeYDL()
          ie = UstreamChannelIE(dl)
@@ -101,7 +112,33 @@ class TestPlaylists(unittest.TestCase):
          result = ie.extract('http://bambuser.com/channel/pixelversity')
          self.assertIsPlaylist(result)
          self.assertEqual(result['title'], u'pixelversity')
-        self.assertTrue(len(result['entries']) >= 66)
+        self.assertTrue(len(result['entries']) >= 60)
+
+    def test_bandcamp_album(self):
+        dl = FakeYDL()
+        ie = BandcampAlbumIE(dl)
+        result = ie.extract('http://mpallante.bandcamp.com/album/nightmare-night-ep')
+        self.assertIsPlaylist(result)
+        self.assertEqual(result['title'], u'Nightmare Night EP')
+        self.assertTrue(len(result['entries']) >= 4)
+        
+    def test_smotri_community(self):
+        dl = FakeYDL()
+        ie = SmotriCommunityIE(dl)
+        result = ie.extract('http://smotri.com/community/video/kommuna')
+        self.assertIsPlaylist(result)
+        self.assertEqual(result['id'], u'kommuna')
+        self.assertEqual(result['title'], u'КПРФ')
+        self.assertTrue(len(result['entries']) >= 4)
+        
+    def test_smotri_user(self):
+        dl = FakeYDL()
+        ie = SmotriUserIE(dl)
+        result = ie.extract('http://smotri.com/user/inspector')
+        self.assertIsPlaylist(result)
+        self.assertEqual(result['id'], u'inspector')
+        self.assertEqual(result['title'], u'Inspector')
+        self.assertTrue(len(result['entries']) >= 9)
  
  if __name__ == '__main__':
      unittest.main()
diff --git a/test/test_subtitles.py b/test/test_subtitles.py

index 06a304879d122b743f62703f08cdcf00301b0d39..23a6531248ccd162155d61bcc3ac5621fb08a8f2 100644 (file)
--- a/test/test_subtitles.py
+++ b/test/test_subtitles.py
@@ -6,8 +6,7 @@ import sys
  import unittest
  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  
-from test.helper import FakeYDL, global_setup, md5
-global_setup()
+from test.helper import FakeYDL, md5
  
  
  from youtube_dl.extractor import (
@@ -73,7 +72,7 @@ class TestYoutubeSubtitles(BaseTestSubtitles):
          self.DL.params['writesubtitles'] = True
          self.DL.params['subtitlesformat'] = 'vtt'
          subtitles = self.getSubtitles()
-        self.assertEqual(md5(subtitles['en']), '356cdc577fde0c6783b9b822e7206ff7')
+        self.assertEqual(md5(subtitles['en']), '3cb210999d3e021bd6c7f0ea751eab06')
  
      def test_youtube_list_subtitles(self):
          self.DL.expect_warning(u'Video doesn\'t have automatic captions')
diff --git a/test/test_utils.py b/test/test_utils.py

index f3fbff042ccc8193d8d08527fdc04421c9832305..e9e590e749f131a0950c79bcf4fee1e9fb9004c2 100644 (file)
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -24,6 +24,8 @@ from youtube_dl.utils import (
      xpath_with_ns,
      smuggle_url,
      unsmuggle_url,
+    shell_quote,
+    encodeFilename,
  )
  
  if sys.version_info < (3, 0):
@@ -170,6 +172,10 @@ class TestUtil(unittest.TestCase):
          self.assertEqual(res_url, url)
          self.assertEqual(res_data, None)
  
+    def test_shell_quote(self):
+        args = ['ffmpeg', '-i', encodeFilename(u'ñ€ß\'.mp4')]
+        self.assertEqual(shell_quote(args), u"""ffmpeg -i 'ñ€ß'"'"'.mp4'""")
+
  
  if __name__ == '__main__':
      unittest.main()
diff --git a/test/test_write_annotations.py b/test/test_write_annotations.py

index 35defb8953402a74ff71b7a9a14cec105a5f1703..eac53b285ab6740b368f278784aced9625abb9a6 100644 (file)
--- a/test/test_write_annotations.py
+++ b/test/test_write_annotations.py
@@ -7,8 +7,7 @@ import sys
  import unittest
  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  
-from test.helper import get_params, global_setup, try_rm
-global_setup()
+from test.helper import get_params, try_rm
  
  
  import io
diff --git a/test/test_write_info_json.py b/test/test_write_info_json.py

index a5b6f6972df48f6b7cdcfebc3ea32d11c6a27afa..d7177611b5e1a90aa3bdf612ae873336ff44d686 100644 (file)
--- a/test/test_write_info_json.py
+++ b/test/test_write_info_json.py
@@ -7,8 +7,7 @@ import sys
  import unittest
  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  
-from test.helper import get_params, global_setup
-global_setup()
+from test.helper import get_params
  
  
  import io
@@ -31,7 +30,7 @@ params = get_params({
  
  
  TEST_ID = 'BaW_jenozKc'
-INFO_JSON_FILE = TEST_ID + '.mp4.info.json'
+INFO_JSON_FILE = TEST_ID + '.info.json'
  DESCRIPTION_FILE = TEST_ID + '.mp4.description'
  EXPECTED_DESCRIPTION = u'''test chars:  "'/\ä↭𝕐
  
diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py

index 4b7a7847bd3a33a9a2bff3e99f9f4cff0de7eebf..95f07d129b61df97376c6699f05a656043be4773 100644 (file)
--- a/test/test_youtube_lists.py
+++ b/test/test_youtube_lists.py
@@ -6,8 +6,7 @@ import sys
  import unittest
  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  
-from test.helper import FakeYDL, global_setup
-global_setup()
+from test.helper import FakeYDL
  
  
  from youtube_dl.extractor import (
@@ -27,7 +26,7 @@ class TestYoutubeLists(unittest.TestCase):
      def test_youtube_playlist(self):
          dl = FakeYDL()
          ie = YoutubePlaylistIE(dl)
-        result = ie.extract('https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re')[0]
+        result = ie.extract('https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re')
          self.assertIsPlaylist(result)
          self.assertEqual(result['title'], 'ytdl test PL')
          ytie_results = [YoutubeIE()._extract_id(url['url']) for url in result['entries']]
@@ -44,13 +43,13 @@ class TestYoutubeLists(unittest.TestCase):
      def test_issue_673(self):
          dl = FakeYDL()
          ie = YoutubePlaylistIE(dl)
-        result = ie.extract('PLBB231211A4F62143')[0]
+        result = ie.extract('PLBB231211A4F62143')
          self.assertTrue(len(result['entries']) > 25)
  
      def test_youtube_playlist_long(self):
          dl = FakeYDL()
          ie = YoutubePlaylistIE(dl)
-        result = ie.extract('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')[0]
+        result = ie.extract('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')
          self.assertIsPlaylist(result)
          self.assertTrue(len(result['entries']) >= 799)
  
@@ -58,7 +57,7 @@ class TestYoutubeLists(unittest.TestCase):
          #651
          dl = FakeYDL()
          ie = YoutubePlaylistIE(dl)
-        result = ie.extract('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')[0]
+        result = ie.extract('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
          ytie_results = [YoutubeIE()._extract_id(url['url']) for url in result['entries']]
          self.assertFalse('pElCt5oNDuI' in ytie_results)
          self.assertFalse('KdPEApIVdWM' in ytie_results)
@@ -66,7 +65,7 @@ class TestYoutubeLists(unittest.TestCase):
      def test_youtube_playlist_empty(self):
          dl = FakeYDL()
          ie = YoutubePlaylistIE(dl)
-        result = ie.extract('https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx')[0]
+        result = ie.extract('https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx')
          self.assertIsPlaylist(result)
          self.assertEqual(len(result['entries']), 0)
  
@@ -74,7 +73,7 @@ class TestYoutubeLists(unittest.TestCase):
          dl = FakeYDL()
          ie = YoutubePlaylistIE(dl)
          # TODO find a > 100 (paginating?) videos course
-        result = ie.extract('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')[0]
+        result = ie.extract('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
          entries = result['entries']
          self.assertEqual(YoutubeIE()._extract_id(entries[0]['url']), 'j9WZyLZCBzs')
          self.assertEqual(len(entries), 25)
@@ -84,22 +83,22 @@ class TestYoutubeLists(unittest.TestCase):
          dl = FakeYDL()
          ie = YoutubeChannelIE(dl)
          #test paginated channel
-        result = ie.extract('https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w')[0]
+        result = ie.extract('https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w')
          self.assertTrue(len(result['entries']) > 90)
          #test autogenerated channel
-        result = ie.extract('https://www.youtube.com/channel/HCtnHdj3df7iM/videos')[0]
+        result = ie.extract('https://www.youtube.com/channel/HCtnHdj3df7iM/videos')
          self.assertTrue(len(result['entries']) >= 18)
  
      def test_youtube_user(self):
          dl = FakeYDL()
          ie = YoutubeUserIE(dl)
-        result = ie.extract('https://www.youtube.com/user/TheLinuxFoundation')[0]
+        result = ie.extract('https://www.youtube.com/user/TheLinuxFoundation')
          self.assertTrue(len(result['entries']) >= 320)
  
      def test_youtube_safe_search(self):
          dl = FakeYDL()
          ie = YoutubePlaylistIE(dl)
-        result = ie.extract('PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl')[0]
+        result = ie.extract('PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl')
          self.assertEqual(len(result['entries']), 2)
  
      def test_youtube_show(self):
@@ -108,5 +107,14 @@ class TestYoutubeLists(unittest.TestCase):
          result = ie.extract('http://www.youtube.com/show/airdisasters')
          self.assertTrue(len(result) >= 3)
  
+    def test_youtube_mix(self):
+        dl = FakeYDL()
+        ie = YoutubePlaylistIE(dl)
+        result = ie.extract('http://www.youtube.com/watch?v=lLJf9qJHR3E&list=RDrjFaenf1T-Y')
+        entries = result['entries']
+        self.assertTrue(len(entries) >= 20)
+        original_video = entries[0]
+        self.assertEqual(original_video['id'], 'rjFaenf1T-Y')
+
  if __name__ == '__main__':
      unittest.main()
diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py

index 5e1ff5eb0ede5bcb020cd027ca00d5b4159f9812..056700614b43fa0a3dbceeb82ef991e34fdb53f9 100644 (file)
--- a/test/test_youtube_signature.py
+++ b/test/test_youtube_signature.py
@@ -6,9 +6,6 @@ import sys
  import unittest
  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
  
-from test.helper import global_setup
-global_setup()
-
  
  import io
  import re
diff --git a/youtube-dl b/youtube-dl

index 0924a5d8a7fa976e2878e3607a5410a96559acf0..899ca28bd80ce245f7afd7cbadc2eae53755a5ff 100755 (executable)

Binary files a/youtube-dl and b/youtube-dl differ
diff --git a/youtube-dl.1 b/youtube-dl.1

index bdd5a2da1d902677247977f812a46598db8b59af..a172bf858dd95b9b5e55d353cd801b57643b308d 100644 (file)
--- a/youtube-dl.1
+++ b/youtube-dl.1
@@ -34,7 +34,8 @@ redistribute it or use it however you like.
  \-\-list\-extractors\ \ \ \ \ \ \ \ \ \ List\ all\ supported\ extractors\ and\ the\ URLs\ they
  \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ would\ handle
  \-\-extractor\-descriptions\ \ \ Output\ descriptions\ of\ all\ supported\ extractors
-\-\-proxy\ URL\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ Use\ the\ specified\ HTTP/HTTPS\ proxy
+\-\-proxy\ URL\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ Use\ the\ specified\ HTTP/HTTPS\ proxy.\ Pass\ in\ an
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ empty\ string\ (\-\-proxy\ "")\ for\ direct\ connection
  \-\-no\-check\-certificate\ \ \ \ \ Suppress\ HTTPS\ certificate\ validation.
  \-\-cache\-dir\ DIR\ \ \ \ \ \ \ \ \ \ \ \ Location\ in\ the\ filesystem\ where\ youtube\-dl\ can
  \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ store\ downloaded\ information\ permanently.\ By
@@ -63,8 +64,9 @@ redistribute it or use it however you like.
  \-\-dateafter\ DATE\ \ \ \ \ \ \ \ \ \ \ download\ only\ videos\ uploaded\ after\ this\ date
  \-\-no\-playlist\ \ \ \ \ \ \ \ \ \ \ \ \ \ download\ only\ the\ currently\ playing\ video
  \-\-age\-limit\ YEARS\ \ \ \ \ \ \ \ \ \ download\ only\ videos\ suitable\ for\ the\ given\ age
-\-\-download\-archive\ FILE\ \ \ \ Download\ only\ videos\ not\ present\ in\ the\ archive
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ file.\ Record\ all\ downloaded\ videos\ in\ it.
+\-\-download\-archive\ FILE\ \ \ \ Download\ only\ videos\ not\ listed\ in\ the\ archive
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ file.\ Record\ the\ IDs\ of\ all\ downloaded\ videos\ in
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ it.
  \f[]
  .fi
  .SS Download Options:
@@ -143,21 +145,22 @@ redistribute it or use it however you like.
  \-\-get\-description\ \ \ \ \ \ \ \ \ \ simulate,\ quiet\ but\ print\ video\ description
  \-\-get\-filename\ \ \ \ \ \ \ \ \ \ \ \ \ simulate,\ quiet\ but\ print\ output\ filename
  \-\-get\-format\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ simulate,\ quiet\ but\ print\ output\ format
+\-j,\ \-\-dump\-json\ \ \ \ \ \ \ \ \ \ \ \ simulate,\ quiet\ but\ print\ JSON\ information
  \-\-newline\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ output\ progress\ bar\ as\ new\ lines
  \-\-no\-progress\ \ \ \ \ \ \ \ \ \ \ \ \ \ do\ not\ print\ progress\ bar
  \-\-console\-title\ \ \ \ \ \ \ \ \ \ \ \ display\ progress\ in\ console\ titlebar
  \-v,\ \-\-verbose\ \ \ \ \ \ \ \ \ \ \ \ \ \ print\ various\ debugging\ information
  \-\-dump\-intermediate\-pages\ \ print\ downloaded\ pages\ to\ debug\ problems(very
  \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ verbose)
-\-\-write\-pages\ \ \ \ \ \ \ \ \ \ \ \ \ \ Write\ downloaded\ pages\ to\ files\ in\ the\ current
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ directory
+\-\-write\-pages\ \ \ \ \ \ \ \ \ \ \ \ \ \ Write\ downloaded\ intermediary\ pages\ to\ files\ in
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ the\ current\ directory\ to\ debug\ problems
  \f[]
  .fi
  .SS Video Format Options:
  .IP
  .nf
  \f[C]
-\-f,\ \-\-format\ FORMAT\ \ \ \ \ \ \ \ video\ format\ code,\ specifiy\ the\ order\ of
+\-f,\ \-\-format\ FORMAT\ \ \ \ \ \ \ \ video\ format\ code,\ specify\ the\ order\ of
  \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ preference\ using\ slashes:\ "\-f\ 22/17/18".\ "\-f\ mp4"
  \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ and\ "\-f\ flv"\ are\ also\ supported
  \-\-all\-formats\ \ \ \ \ \ \ \ \ \ \ \ \ \ download\ all\ available\ video\ formats
@@ -222,6 +225,9 @@ You can configure youtube\-dl by placing default arguments (such as
  \f[C]\-\-extract\-audio\ \-\-no\-mtime\f[] to always extract the audio
  and not copy the mtime) into \f[C]/etc/youtube\-dl.conf\f[] and/or
  \f[C]~/.config/youtube\-dl.conf\f[].
+On Windows, the configuration file locations are
+\f[C]%APPDATA%\\youtube\-dl\\config.txt\f[] and
+\f[C]C:\\Users\\<Yourname>\\youtube\-dl.conf\f[].
  .SH OUTPUT TEMPLATE
  .PP
  The \f[C]\-o\f[] option allows users to indicate a template for the
diff --git a/youtube-dl.bash-completion b/youtube-dl.bash-completion

index 0123996620626f55166ba05de7ba3e3e6885771e..9c30e7b529f0f38c996d1c4ddce6b0ff7791e221 100644 (file)
--- a/youtube-dl.bash-completion
+++ b/youtube-dl.bash-completion
@@ -1,10 +1,21 @@
  __youtube_dl()
  {
-    local cur prev opts
+    local cur prev opts fileopts diropts keywords
      COMPREPLY=()
      cur="${COMP_WORDS[COMP_CWORD]}"
-    opts="--help --version --update --ignore-errors --abort-on-error --dump-user-agent --user-agent --referer --list-extractors --extractor-descriptions --proxy --no-check-certificate --cache-dir --no-cache-dir --playlist-start --playlist-end --match-title --reject-title --max-downloads --min-filesize --max-filesize --date --datebefore --dateafter --no-playlist --age-limit --download-archive --rate-limit --retries --buffer-size --no-resize-buffer --test --title --id --literal --auto-number --output --autonumber-size --restrict-filenames --batch-file --no-overwrites --continue --no-continue --cookies --no-part --no-mtime --write-description --write-info-json --write-annotations --write-thumbnail --quiet --simulate --skip-download --get-url --get-title --get-id --get-thumbnail --get-description --get-filename --get-format --newline --no-progress --console-title --verbose --dump-intermediate-pages --write-pages --youtube-print-sig-code --format --all-formats --prefer-free-formats --max-quality --list-formats --write-sub --write-auto-sub --all-subs --list-subs --sub-format --sub-lang --username --password --netrc --video-password --extract-audio --audio-format --audio-quality --recode-video --keep-video --no-post-overwrites --embed-subs --add-metadata"
-    keywords=":ytfavorites :ytrecommended :ytsubscriptions :ytwatchlater"
+    prev="${COMP_WORDS[COMP_CWORD-1]}"
+    opts="--help --version --update --ignore-errors --abort-on-error --dump-user-agent --user-agent --referer --list-extractors --extractor-descriptions --proxy --no-check-certificate --cache-dir --no-cache-dir --socket-timeout --playlist-start --playlist-end --match-title --reject-title --max-downloads --min-filesize --max-filesize --date --datebefore --dateafter --no-playlist --age-limit --download-archive --rate-limit --retries --buffer-size --no-resize-buffer --test --title --id --literal --auto-number --output --autonumber-size --restrict-filenames --batch-file --no-overwrites --continue --no-continue --cookies --no-part --no-mtime --write-description --write-info-json --write-annotations --write-thumbnail --quiet --simulate --skip-download --get-url --get-title --get-id --get-thumbnail --get-description --get-filename --get-format --dump-json --newline --no-progress --console-title --verbose --dump-intermediate-pages --write-pages --youtube-print-sig-code --format --all-formats --prefer-free-formats --max-quality --list-formats --write-sub --write-auto-sub --all-subs --list-subs --sub-format --sub-lang --username --password --netrc --video-password --extract-audio --audio-format --audio-quality --recode-video --keep-video --no-post-overwrites --embed-subs --add-metadata"
+    keywords=":ytfavorites :ytrecommended :ytsubscriptions :ytwatchlater :ythistory"
+    fileopts="-a|--batch-file|--download-archive|--cookies"
+    diropts="--cache-dir"
+
+    if [[ ${prev} =~ ${fileopts} ]]; then
+        COMPREPLY=( $(compgen -f -- ${cur}) )
+        return 0
+    elif [[ ${prev} =~ ${diropts} ]]; then
+        COMPREPLY=( $(compgen -d -- ${cur}) )
+        return 0
+    fi
  
      if [[ ${cur} =~ : ]]; then
          COMPREPLY=( $(compgen -W "${keywords}" -- ${cur}) )
diff --git a/youtube_dl/FileDownloader.py b/youtube_dl/FileDownloader.py

index 088f595866372e360e425a3aeee196374f504b7c..3ff9716b33b22e39a0a6d925bfa33aba8fa092f9 100644 (file)
--- a/youtube_dl/FileDownloader.py
+++ b/youtube_dl/FileDownloader.py
@@ -1,19 +1,16 @@
-import math
  import os
  import re
  import subprocess
  import sys
  import time
  
-if os.name == 'nt':
-    import ctypes
-
  from .utils import (
      compat_urllib_error,
      compat_urllib_request,
      ContentTooShortError,
      determine_ext,
      encodeFilename,
+    format_bytes,
      sanitize_open,
      timeconvert,
  )
@@ -56,20 +53,6 @@ class FileDownloader(object):
          self._progress_hooks = []
          self.params = params
  
-    @staticmethod
-    def format_bytes(bytes):
-        if bytes is None:
-            return 'N/A'
-        if type(bytes) is str:
-            bytes = float(bytes)
-        if bytes == 0.0:
-            exponent = 0
-        else:
-            exponent = int(math.log(bytes, 1024.0))
-        suffix = ['B','KiB','MiB','GiB','TiB','PiB','EiB','ZiB','YiB'][exponent]
-        converted = float(bytes) / float(1024 ** exponent)
-        return '%.2f%s' % (converted, suffix)
-
      @staticmethod
      def format_seconds(seconds):
          (mins, secs) = divmod(seconds, 60)
@@ -120,7 +103,7 @@ class FileDownloader(object):
      def format_speed(speed):
          if speed is None:
              return '%10s' % '---b/s'
-        return '%10s' % ('%s/s' % FileDownloader.format_bytes(speed))
+        return '%10s' % ('%s/s' % format_bytes(speed))
  
      @staticmethod
      def best_block_size(elapsed_time, bytes):
@@ -151,16 +134,8 @@ class FileDownloader(object):
      def to_stderr(self, message):
          self.ydl.to_screen(message)
  
-    def to_cons_title(self, message):
-        """Set console/terminal window title to message."""
-        if not self.params.get('consoletitle', False):
-            return
-        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
-            # c_wchar_p() might not be necessary if `message` is
-            # already of type unicode()
-            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
-        elif 'TERM' in os.environ:
-            self.to_screen('\033]0;%s\007' % message, skip_eol=True)
+    def to_console_title(self, message):
+        self.ydl.to_console_title(message)
  
      def trouble(self, *args, **kargs):
          self.ydl.trouble(*args, **kargs)
@@ -249,7 +224,7 @@ class FileDownloader(object):
          else:
              self.to_screen(u'\r%s[download] %s of %s at %s ETA %s' %
                  (clear_line, percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
-        self.to_cons_title(u'youtube-dl - %s of %s at %s ETA %s' %
+        self.to_console_title(u'youtube-dl - %s of %s at %s ETA %s' %
                  (percent_str.strip(), data_len_str.strip(), speed_str.strip(), eta_str.strip()))
  
      def report_resuming_byte(self, resume_len):
@@ -281,6 +256,61 @@ class FileDownloader(object):
                  (clear_line, data_len_str, self.format_seconds(tot_time)))
  
      def _download_with_rtmpdump(self, filename, url, player_url, page_url, play_path, tc_url, live):
+        def run_rtmpdump(args):
+            start = time.time()
+            resume_percent = None
+            resume_downloaded_data_len = None
+            proc = subprocess.Popen(args, stderr=subprocess.PIPE)
+            cursor_in_new_line = True
+            proc_stderr_closed = False
+            while not proc_stderr_closed:
+                # read line from stderr
+                line = u''
+                while True:
+                    char = proc.stderr.read(1)
+                    if not char:
+                        proc_stderr_closed = True
+                        break
+                    if char in [b'\r', b'\n']:
+                        break
+                    line += char.decode('ascii', 'replace')
+                if not line:
+                    # proc_stderr_closed is True
+                    continue
+                mobj = re.search(r'([0-9]+\.[0-9]{3}) kB / [0-9]+\.[0-9]{2} sec \(([0-9]{1,2}\.[0-9])%\)', line)
+                if mobj:
+                    downloaded_data_len = int(float(mobj.group(1))*1024)
+                    percent = float(mobj.group(2))
+                    if not resume_percent:
+                        resume_percent = percent
+                        resume_downloaded_data_len = downloaded_data_len
+                    eta = self.calc_eta(start, time.time(), 100-resume_percent, percent-resume_percent)
+                    speed = self.calc_speed(start, time.time(), downloaded_data_len-resume_downloaded_data_len)
+                    data_len = None
+                    if percent > 0:
+                        data_len = int(downloaded_data_len * 100 / percent)
+                    data_len_str = u'~' + format_bytes(data_len)
+                    self.report_progress(percent, data_len_str, speed, eta)
+                    cursor_in_new_line = False
+                    self._hook_progress({
+                        'downloaded_bytes': downloaded_data_len,
+                        'total_bytes': data_len,
+                        'tmpfilename': tmpfilename,
+                        'filename': filename,
+                        'status': 'downloading',
+                        'eta': eta,
+                        'speed': speed,
+                    })
+                elif self.params.get('verbose', False):
+                    if not cursor_in_new_line:
+                        self.to_screen(u'')
+                    cursor_in_new_line = True
+                    self.to_screen(u'[rtmpdump] '+line)
+            proc.wait()
+            if not cursor_in_new_line:
+                self.to_screen(u'')
+            return proc.returncode
+
          self.report_destination(filename)
          tmpfilename = self.temp_name(filename)
          test = self.params.get('test', False)
@@ -291,12 +321,11 @@ class FileDownloader(object):
          except (OSError, IOError):
              self.report_error(u'RTMP download detected but "rtmpdump" could not be run')
              return False
-        verbosity_option = '--verbose' if self.params.get('verbose', False) else '--quiet'
  
          # Download using rtmpdump. rtmpdump returns exit code 2 when
          # the connection was interrupted and resuming appears to be
          # possible. This is part of rtmpdump's normal usage, AFAIK.
-        basic_args = ['rtmpdump', verbosity_option, '-r', url, '-o', tmpfilename]
+        basic_args = ['rtmpdump', '--verbose', '-r', url, '-o', tmpfilename]
          if player_url is not None:
              basic_args += ['--swfVfy', player_url]
          if page_url is not None:
@@ -310,30 +339,48 @@ class FileDownloader(object):
          if live:
              basic_args += ['--live']
          args = basic_args + [[], ['--resume', '--skip', '1']][self.params.get('continuedl', False)]
+
+        if sys.platform == 'win32' and sys.version_info < (3, 0):
+            # Windows subprocess module does not actually support Unicode
+            # on Python 2.x
+            # See http://stackoverflow.com/a/9951851/35070
+            subprocess_encoding = sys.getfilesystemencoding()
+            args = [a.encode(subprocess_encoding, 'ignore') for a in args]
+        else:
+            subprocess_encoding = None
+
          if self.params.get('verbose', False):
+            if subprocess_encoding:
+                str_args = [
+                    a.decode(subprocess_encoding) if isinstance(a, bytes) else a
+                    for a in args]
+            else:
+                str_args = args
              try:
                  import pipes
-                shell_quote = lambda args: ' '.join(map(pipes.quote, args))
+                shell_quote = lambda args: ' '.join(map(pipes.quote, str_args))
              except ImportError:
                  shell_quote = repr
-            self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(args))
-        retval = subprocess.call(args)
+            self.to_screen(u'[debug] rtmpdump command line: ' + shell_quote(str_args))
+
+        retval = run_rtmpdump(args)
+
          while (retval == 2 or retval == 1) and not test:
              prevsize = os.path.getsize(encodeFilename(tmpfilename))
-            self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True)
+            self.to_screen(u'[rtmpdump] %s bytes' % prevsize)
              time.sleep(5.0) # This seems to be needed
-            retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
+            retval = run_rtmpdump(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
              cursize = os.path.getsize(encodeFilename(tmpfilename))
              if prevsize == cursize and retval == 1:
                  break
               # Some rtmp streams seem to abort after ~ 99.8%. Don't complain for those
              if prevsize == cursize and retval == 2 and cursize > 1024:
-                self.to_screen(u'\r[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
+                self.to_screen(u'[rtmpdump] Could not download the whole video. This can happen for some advertisements.')
                  retval = 0
                  break
          if retval == 0 or (test and retval == 2):
              fsize = os.path.getsize(encodeFilename(tmpfilename))
-            self.to_screen(u'\r[rtmpdump] %s bytes' % fsize)
+            self.to_screen(u'[rtmpdump] %s bytes' % fsize)
              self.try_rename(tmpfilename, filename)
              self._hook_progress({
                  'downloaded_bytes': fsize,
@@ -536,7 +583,7 @@ class FileDownloader(object):
                  self.to_screen(u'\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len))
                  return False
  
-        data_len_str = self.format_bytes(data_len)
+        data_len_str = format_bytes(data_len)
          byte_counter = 0 + resume_len
          block_size = self.params.get('buffersize', 1024)
          start = time.time()
diff --git a/youtube_dl/PostProcessor.py b/youtube_dl/PostProcessor.py

index 13b56ede5fdb3d66064a8072cdda87787eee1bae..69aedf87a44c72060e2af135cd95f6f820e9ab0c 100644 (file)
--- a/youtube_dl/PostProcessor.py
+++ b/youtube_dl/PostProcessor.py
@@ -501,7 +501,7 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
  
          options = ['-c', 'copy']
          for (name, value) in metadata.items():
-            options.extend(['-metadata', '%s="%s"' % (name, value)])
+            options.extend(['-metadata', '%s=%s' % (name, value)])
          options.extend(['-f', ext])
  
          self._downloader.to_screen(u'[ffmpeg] Adding metadata to \'%s\'' % filename)
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py

index 5253c39e1c8c3fdfe083d62d69c6b7552ef303b6..b68b110a461f50af2d163997d1acfa18ece3afaf 100644 (file)
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -5,17 +5,53 @@ from __future__ import absolute_import
  
  import errno
  import io
+import json
  import os
+import platform
  import re
  import shutil
+import subprocess
  import socket
  import sys
  import time
  import traceback
  
-from .utils import *
+if os.name == 'nt':
+    import ctypes
+
+from .utils import (
+    compat_cookiejar,
+    compat_http_client,
+    compat_print,
+    compat_str,
+    compat_urllib_error,
+    compat_urllib_request,
+    ContentTooShortError,
+    date_from_str,
+    DateRange,
+    determine_ext,
+    DownloadError,
+    encodeFilename,
+    ExtractorError,
+    format_bytes,
+    locked_file,
+    make_HTTPS_handler,
+    MaxDownloadsReached,
+    PostProcessingError,
+    platform_name,
+    preferredencoding,
+    SameFileError,
+    sanitize_filename,
+    subtitles_filename,
+    takewhile_inclusive,
+    UnavailableVideoError,
+    write_json_file,
+    write_string,
+    YoutubeDLHandler,
+)
  from .extractor import get_info_extractor, gen_extractors
  from .FileDownloader import FileDownloader
+from .version import __version__
  
  
  class YoutubeDL(object):
@@ -57,6 +93,7 @@ class YoutubeDL(object):
      forcethumbnail:    Force printing thumbnail URL.
      forcedescription:  Force printing description.
      forcefilename:     Force printing final filename.
+    forcejson:         Force printing info_dict as JSON.
      simulate:          Do not download the video files.
      format:            Video format code.
      format_limit:      Highest quality format to try.
@@ -68,6 +105,7 @@ class YoutubeDL(object):
      playlistend:       Playlist item to end at.
      matchtitle:        Download only matching titles.
      rejecttitle:       Reject downloads for matching titles.
+    logger:            Log messages to a logging.Logger instance.
      logtostderr:       Log messages to stderr instead of stdout.
      writedescription:  Write the video description to a .description file
      writeinfojson:     Write the video metadata to a .info.json file
@@ -88,9 +126,13 @@ class YoutubeDL(object):
      noplaylist:        Download single video instead of a playlist if in doubt.
      age_limit:         An integer representing the user's age in years.
                         Unsuitable videos for the given age are skipped.
-    downloadarchive:   File name of a file where all downloads are recorded.
+    download_archive:  File name of a file where all downloads are recorded.
                         Videos already present in the file are not downloaded
                         again.
+    cookiefile:        File name where cookies should be read from and dumped to.
+    nocheckcertificate:Do not verify SSL certificates
+    proxy:             URL of the proxy server to use
+    socket_timeout:    Time to wait for unresponsive hosts, in seconds
  
      The following parameters are not used by YoutubeDL itself, they are used by
      the FileDownloader:
@@ -105,7 +147,7 @@ class YoutubeDL(object):
      _num_downloads = None
      _screen_file = None
  
-    def __init__(self, params):
+    def __init__(self, params=None):
          """Create a FileDownloader object with the given options."""
          self._ies = []
          self._ies_instances = {}
@@ -114,6 +156,7 @@ class YoutubeDL(object):
          self._download_retcode = 0
          self._num_downloads = 0
          self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
+        self.params = {} if params is None else params
  
          if (sys.version_info >= (3,) and sys.platform != 'win32' and
                  sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
@@ -123,14 +166,15 @@ class YoutubeDL(object):
                  u'Assuming --restrict-filenames since file system encoding '
                  u'cannot encode all charactes. '
                  u'Set the LC_ALL environment variable to fix this.')
-            params['restrictfilenames'] = True
+            self.params['restrictfilenames'] = True
  
-        self.params = params
          self.fd = FileDownloader(self, self.params)
  
-        if '%(stitle)s' in self.params['outtmpl']:
+        if '%(stitle)s' in self.params.get('outtmpl', ''):
              self.report_warning(u'%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.')
  
+        self._setup_opener()
+
      def add_info_extractor(self, ie):
          """Add an InfoExtractor object to the end of the list."""
          self._ies.append(ie)
@@ -163,7 +207,9 @@ class YoutubeDL(object):
  
      def to_screen(self, message, skip_eol=False):
          """Print message to stdout if not in quiet mode."""
-        if not self.params.get('quiet', False):
+        if self.params.get('logger'):
+            self.params['logger'].debug(message)
+        elif not self.params.get('quiet', False):
              terminator = [u'\n', u''][skip_eol]
              output = message + terminator
              write_string(output, self._screen_file)
@@ -171,14 +217,47 @@ class YoutubeDL(object):
      def to_stderr(self, message):
          """Print message to stderr."""
          assert type(message) == type(u'')
-        output = message + u'\n'
-        if 'b' in getattr(self._screen_file, 'mode', '') or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
-            output = output.encode(preferredencoding())
-        sys.stderr.write(output)
+        if self.params.get('logger'):
+            self.params['logger'].error(message)
+        else:
+            output = message + u'\n'
+            if 'b' in getattr(self._screen_file, 'mode', '') or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
+                output = output.encode(preferredencoding())
+            sys.stderr.write(output)
+
+    def to_console_title(self, message):
+        if not self.params.get('consoletitle', False):
+            return
+        if os.name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
+            # c_wchar_p() might not be necessary if `message` is
+            # already of type unicode()
+            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
+        elif 'TERM' in os.environ:
+            write_string(u'\033]0;%s\007' % message, self._screen_file)
+
+    def save_console_title(self):
+        if not self.params.get('consoletitle', False):
+            return
+        if 'TERM' in os.environ:
+            # Save the title on stack
+            write_string(u'\033[22;0t', self._screen_file)
+
+    def restore_console_title(self):
+        if not self.params.get('consoletitle', False):
+            return
+        if 'TERM' in os.environ:
+            # Restore the title from stack
+            write_string(u'\033[23;0t', self._screen_file)
  
-    def fixed_template(self):
-        """Checks if the output template is fixed."""
-        return (re.search(u'(?u)%\\(.+?\\)s', self.params['outtmpl']) is None)
+    def __enter__(self):
+        self.save_console_title()
+        return self
+
+    def __exit__(self, *args):
+        self.restore_console_title()
+    
+        if self.params.get('cookiefile') is not None:
+            self.cookiejar.save()
  
      def trouble(self, message=None, tb=None):
          """Determine action to take when a download problem appears.
@@ -254,7 +333,7 @@ class YoutubeDL(object):
          """Report file has already been fully downloaded."""
          try:
              self.to_screen(u'[download] %s has already been downloaded' % file_name)
-        except (UnicodeEncodeError) as err:
+        except UnicodeEncodeError:
              self.to_screen(u'[download] The file has already been downloaded')
  
      def increment_downloads(self):
@@ -295,15 +374,17 @@ class YoutubeDL(object):
      def _match_entry(self, info_dict):
          """ Returns None iff the file should be downloaded """
  
-        title = info_dict['title']
-        matchtitle = self.params.get('matchtitle', False)
-        if matchtitle:
-            if not re.search(matchtitle, title, re.IGNORECASE):
-                return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
-        rejecttitle = self.params.get('rejecttitle', False)
-        if rejecttitle:
-            if re.search(rejecttitle, title, re.IGNORECASE):
-                return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
+        if 'title' in info_dict:
+            # This can happen when we're just evaluating the playlist
+            title = info_dict['title']
+            matchtitle = self.params.get('matchtitle', False)
+            if matchtitle:
+                if not re.search(matchtitle, title, re.IGNORECASE):
+                    return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
+            rejecttitle = self.params.get('rejecttitle', False)
+            if rejecttitle:
+                if re.search(rejecttitle, title, re.IGNORECASE):
+                    return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
          date = info_dict.get('upload_date', None)
          if date is not None:
              dateRange = self.params.get('daterange', DateRange())
@@ -314,8 +395,8 @@ class YoutubeDL(object):
              if age_limit < info_dict.get('age_limit', 0):
                  return u'Skipping "' + title + '" because it is age restricted'
          if self.in_download_archive(info_dict):
-            return (u'%(title)s has already been recorded in archive'
-                    % info_dict)
+            return (u'%s has already been recorded in archive'
+                    % info_dict.get('title', info_dict.get('id', u'video')))
          return None
  
      @staticmethod
@@ -385,7 +466,7 @@ class YoutubeDL(object):
          result_type = ie_result.get('_type', 'video') # If not given we suppose it's a video, support the default old system
          if result_type == 'video':
              self.add_extra_info(ie_result, extra_info)
-            return self.process_video_result(ie_result)
+            return self.process_video_result(ie_result, download=download)
          elif result_type == 'url':
              # We have to add extra_info to the results because it may be
              # contained in a playlist
@@ -394,7 +475,7 @@ class YoutubeDL(object):
                                       ie_key=ie_result.get('ie_key'),
                                       extra_info=extra_info)
          elif result_type == 'playlist':
-            self.add_extra_info(ie_result, extra_info)
+
              # We process each entry in the playlist
              playlist = ie_result.get('title', None) or ie_result.get('id', None)
              self.to_screen(u'[download] Downloading playlist: %s' % playlist)
@@ -424,6 +505,12 @@ class YoutubeDL(object):
                      'webpage_url': ie_result['webpage_url'],
                      'extractor_key': ie_result['extractor_key'],
                  }
+
+                reason = self._match_entry(entry)
+                if reason is not None:
+                    self.to_screen(u'[download] ' + reason)
+                    continue
+
                  entry_result = self.process_ie_result(entry,
                                                        download=download,
                                                        extra_info=extra)
@@ -579,7 +666,7 @@ class YoutubeDL(object):
  
          # Forced printings
          if self.params.get('forcetitle', False):
-            compat_print(info_dict['title'])
+            compat_print(info_dict['fulltitle'])
          if self.params.get('forceid', False):
              compat_print(info_dict['id'])
          if self.params.get('forceurl', False):
@@ -593,6 +680,8 @@ class YoutubeDL(object):
              compat_print(filename)
          if self.params.get('forceformat', False):
              compat_print(info_dict['format'])
+        if self.params.get('forcejson', False):
+            compat_print(json.dumps(info_dict))
  
          # Do nothing else if in simulate mode
          if self.params.get('simulate', False):
@@ -640,7 +729,7 @@ class YoutubeDL(object):
              # subtitles download errors are already managed as troubles in relevant IE
              # that way it will silently go on when used with unsupporting IE
              subtitles = info_dict['subtitles']
-            sub_format = self.params.get('subtitlesformat')
+            sub_format = self.params.get('subtitlesformat', 'srt')
              for sub_lang in subtitles.keys():
                  sub = subtitles[sub_lang]
                  if sub is None:
@@ -655,7 +744,7 @@ class YoutubeDL(object):
                      return
  
          if self.params.get('writeinfojson', False):
-            infofn = filename + u'.info.json'
+            infofn = os.path.splitext(filename)[0] + u'.info.json'
              self.report_writeinfojson(infofn)
              try:
                  json_info_dict = dict((k, v) for k, v in info_dict.items() if not k in ['urlhandle'])
@@ -706,13 +795,15 @@ class YoutubeDL(object):
  
      def download(self, url_list):
          """Download a given list of URLs."""
-        if len(url_list) > 1 and self.fixed_template():
+        if (len(url_list) > 1 and
+                '%' not in self.params['outtmpl']
+                and self.params.get('max_downloads') != 1):
              raise SameFileError(self.params['outtmpl'])
  
          for url in url_list:
              try:
                  #It also downloads the videos
-                videos = self.extract_info(url)
+                self.extract_info(url)
              except UnavailableVideoError:
                  self.report_error(u'unable to download video')
              except MaxDownloadsReached:
@@ -744,11 +835,26 @@ class YoutubeDL(object):
              except (IOError, OSError):
                  self.report_warning(u'Unable to remove downloaded video file')
  
+    def _make_archive_id(self, info_dict):
+        # Future-proof against any change in case
+        # and backwards compatibility with prior versions
+        extractor = info_dict.get('extractor_key')
+        if extractor is None:
+            if 'id' in info_dict:
+                extractor = info_dict.get('ie_key')  # key in a playlist
+        if extractor is None:
+            return None  # Incomplete video information
+        return extractor.lower() + u' ' + info_dict['id']
+
      def in_download_archive(self, info_dict):
          fn = self.params.get('download_archive')
          if fn is None:
              return False
-        vid_id = info_dict['extractor'] + u' ' + info_dict['id']
+
+        vid_id = self._make_archive_id(info_dict)
+        if vid_id is None:
+            return False  # Incomplete video information
+
          try:
              with locked_file(fn, 'r', encoding='utf-8') as archive_file:
                  for line in archive_file:
@@ -763,12 +869,15 @@ class YoutubeDL(object):
          fn = self.params.get('download_archive')
          if fn is None:
              return
-        vid_id = info_dict['extractor'] + u' ' + info_dict['id']
+        vid_id = self._make_archive_id(info_dict)
+        assert vid_id
          with locked_file(fn, 'a', encoding='utf-8') as archive_file:
              archive_file.write(vid_id + u'\n')
  
      @staticmethod
      def format_resolution(format, default='unknown'):
+        if format.get('vcodec') == 'none':
+            return 'audio only'
          if format.get('_resolution') is not None:
              return format['_resolution']
          if format.get('height') is not None:
@@ -781,23 +890,124 @@ class YoutubeDL(object):
          return res
  
      def list_formats(self, info_dict):
-        def line(format):
-            return (u'%-20s%-10s%-12s%s' % (
+        def format_note(fdict):
+            res = u''
+            if fdict.get('format_note') is not None:
+                res += fdict['format_note'] + u' '
+            if (fdict.get('vcodec') is not None and
+                    fdict.get('vcodec') != 'none'):
+                res += u'%-5s' % fdict['vcodec']
+            elif fdict.get('vbr') is not None:
+                res += u'video'
+            if fdict.get('vbr') is not None:
+                res += u'@%4dk' % fdict['vbr']
+            if fdict.get('acodec') is not None:
+                if res:
+                    res += u', '
+                res += u'%-5s' % fdict['acodec']
+            elif fdict.get('abr') is not None:
+                if res:
+                    res += u', '
+                res += 'audio'
+            if fdict.get('abr') is not None:
+                res += u'@%3dk' % fdict['abr']
+            if fdict.get('filesize') is not None:
+                if res:
+                    res += u', '
+                res += format_bytes(fdict['filesize'])
+            return res
+
+        def line(format, idlen=20):
+            return ((u'%-' + compat_str(idlen + 1) + u's%-10s%-12s%s') % (
                  format['format_id'],
                  format['ext'],
                  self.format_resolution(format),
-                format.get('format_note', ''),
-                )
-            )
+                format_note(format),
+            ))
  
          formats = info_dict.get('formats', [info_dict])
-        formats_s = list(map(line, formats))
+        idlen = max(len(u'format code'),
+                    max(len(f['format_id']) for f in formats))
+        formats_s = [line(f, idlen) for f in formats]
          if len(formats) > 1:
-            formats_s[0] += (' ' if formats[0].get('format_note') else '') + '(worst)'
-            formats_s[-1] += (' ' if formats[-1].get('format_note') else '') + '(best)'
+            formats_s[0] += (' ' if format_note(formats[0]) else '') + '(worst)'
+            formats_s[-1] += (' ' if format_note(formats[-1]) else '') + '(best)'
  
          header_line = line({
              'format_id': u'format code', 'ext': u'extension',
-            '_resolution': u'resolution', 'format_note': u'note'})
+            '_resolution': u'resolution', 'format_note': u'note'}, idlen=idlen)
          self.to_screen(u'[info] Available formats for %s:\n%s\n%s' %
                         (info_dict['id'], header_line, u"\n".join(formats_s)))
+
+    def urlopen(self, req):
+        """ Start an HTTP download """
+        return self._opener.open(req)
+
+    def print_debug_header(self):
+        if not self.params.get('verbose'):
+            return
+        write_string(u'[debug] youtube-dl version ' + __version__ + u'\n')
+        try:
+            sp = subprocess.Popen(
+                ['git', 'rev-parse', '--short', 'HEAD'],
+                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
+                cwd=os.path.dirname(os.path.abspath(__file__)))
+            out, err = sp.communicate()
+            out = out.decode().strip()
+            if re.match('[0-9a-f]+', out):
+                write_string(u'[debug] Git HEAD: ' + out + u'\n')
+        except:
+            try:
+                sys.exc_clear()
+            except:
+                pass
+        write_string(u'[debug] Python version %s - %s' %
+                     (platform.python_version(), platform_name()) + u'\n')
+
+        proxy_map = {}
+        for handler in self._opener.handlers:
+            if hasattr(handler, 'proxies'):
+                proxy_map.update(handler.proxies)
+        write_string(u'[debug] Proxy map: ' + compat_str(proxy_map) + u'\n')
+
+    def _setup_opener(self):
+        timeout_val = self.params.get('socket_timeout')
+        timeout = 600 if timeout_val is None else float(timeout_val)
+
+        opts_cookiefile = self.params.get('cookiefile')
+        opts_proxy = self.params.get('proxy')
+
+        if opts_cookiefile is None:
+            self.cookiejar = compat_cookiejar.CookieJar()
+        else:
+            self.cookiejar = compat_cookiejar.MozillaCookieJar(
+                opts_cookiefile)
+            if os.access(opts_cookiefile, os.R_OK):
+                self.cookiejar.load()
+
+        cookie_processor = compat_urllib_request.HTTPCookieProcessor(
+            self.cookiejar)
+        if opts_proxy is not None:
+            if opts_proxy == '':
+                proxies = {}
+            else:
+                proxies = {'http': opts_proxy, 'https': opts_proxy}
+        else:
+            proxies = compat_urllib_request.getproxies()
+            # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
+            if 'http' in proxies and 'https' not in proxies:
+                proxies['https'] = proxies['http']
+        proxy_handler = compat_urllib_request.ProxyHandler(proxies)
+        https_handler = make_HTTPS_handler(
+            self.params.get('nocheckcertificate', False))
+        opener = compat_urllib_request.build_opener(
+            https_handler, proxy_handler, cookie_processor, YoutubeDLHandler())
+        # Delete the default user-agent header, which would otherwise apply in
+        # cases where our custom HTTP handler doesn't come into play
+        # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
+        opener.addheaders = []
+        self._opener = opener
+
+        # TODO remove these global modifications (installed opener, default socket timeout)
+        compat_urllib_request.install_opener(opener)
+        socket.setdefaulttimeout(timeout)
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py

index 1f1db9f676d55c909514bf3fde589550d6d6ca71..d2446b6706a6eb239cf52a00fb775ef0eb9cac9f 100644 (file)
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -32,50 +32,45 @@ __authors__  = (
      'Ismael Mejía',
      'Steffan \'Ruirize\' James',
      'Andras Elso',
+    'Jelle van der Waa',
+    'Marcin Cieślak',
+    'Anton Larionov',
+    'Takuya Tsuchida',
+    'Sergey M.',
  )
  
  __license__ = 'Public Domain'
  
  import codecs
-import collections
  import getpass
  import optparse
  import os
  import random
  import re
  import shlex
-import socket
  import subprocess
  import sys
-import traceback
-import platform
  
  
  from .utils import (
-    compat_cookiejar,
      compat_print,
-    compat_str,
-    compat_urllib_request,
      DateRange,
      decodeOption,
      determine_ext,
      DownloadError,
      get_cachedir,
-    make_HTTPS_handler,
      MaxDownloadsReached,
-    platform_name,
      preferredencoding,
      SameFileError,
      std_headers,
      write_string,
-    YoutubeDLHandler,
  )
  from .update import update_self
-from .version import __version__
  from .FileDownloader import (
      FileDownloader,
  )
  from .extractor import gen_extractors
+from .version import __version__
  from .YoutubeDL import YoutubeDL
  from .PostProcessor import (
      FFmpegMetadataPP,
@@ -86,11 +81,11 @@ from .PostProcessor import (
  
  
  def parseOpts(overrideArguments=None):
-    def _readOptions(filename_bytes):
+    def _readOptions(filename_bytes, default=[]):
          try:
              optionf = open(filename_bytes)
          except IOError:
-            return [] # silently skip if file is not present
+            return default  # silently skip if file is not present
          try:
              res = []
              for l in optionf:
@@ -196,7 +191,9 @@ def parseOpts(overrideArguments=None):
      general.add_option('--extractor-descriptions',
              action='store_true', dest='list_extractor_descriptions',
              help='Output descriptions of all supported extractors', default=False)
-    general.add_option('--proxy', dest='proxy', default=None, help='Use the specified HTTP/HTTPS proxy', metavar='URL')
+    general.add_option(
+        '--proxy', dest='proxy', default=None, metavar='URL',
+        help='Use the specified HTTP/HTTPS proxy. Pass in an empty string (--proxy "") for direct connection')
      general.add_option('--no-check-certificate', action='store_true', dest='no_check_certificate', default=False, help='Suppress HTTPS certificate validation.')
      general.add_option(
          '--cache-dir', dest='cachedir', default=get_cachedir(), metavar='DIR',
@@ -204,6 +201,9 @@ def parseOpts(overrideArguments=None):
      general.add_option(
          '--no-cache-dir', action='store_const', const=None, dest='cachedir',
          help='Disable filesystem caching')
+    general.add_option(
+        '--socket-timeout', dest='socket_timeout',
+        type=float, default=None, help=optparse.SUPPRESS_HELP)
  
  
      selection.add_option('--playlist-start',
@@ -212,7 +212,9 @@ def parseOpts(overrideArguments=None):
              dest='playlistend', metavar='NUMBER', help='playlist video to end at (default is last)', default=-1)
      selection.add_option('--match-title', dest='matchtitle', metavar='REGEX',help='download only matching titles (regex or caseless sub-string)')
      selection.add_option('--reject-title', dest='rejecttitle', metavar='REGEX',help='skip download for matching titles (regex or caseless sub-string)')
-    selection.add_option('--max-downloads', metavar='NUMBER', dest='max_downloads', help='Abort after downloading NUMBER files', default=None)
+    selection.add_option('--max-downloads', metavar='NUMBER',
+                         dest='max_downloads', type=int, default=None,
+                         help='Abort after downloading NUMBER files')
      selection.add_option('--min-filesize', metavar='SIZE', dest='min_filesize', help="Do not download any videos smaller than SIZE (e.g. 50k or 44.6m)", default=None)
      selection.add_option('--max-filesize', metavar='SIZE', dest='max_filesize', help="Do not download any videos larger than SIZE (e.g. 50k or 44.6m)", default=None)
      selection.add_option('--date', metavar='DATE', dest='date', help='download only videos uploaded in this date', default=None)
@@ -224,7 +226,7 @@ def parseOpts(overrideArguments=None):
                           default=None, type=int)
      selection.add_option('--download-archive', metavar='FILE',
                           dest='download_archive',
-                         help='Download only videos not present in the archive file. Record all downloaded videos in it.')
+                         help='Download only videos not listed in the archive file. Record the IDs of all downloaded videos in it.')
  
  
      authentication.add_option('-u', '--username',
@@ -239,7 +241,7 @@ def parseOpts(overrideArguments=None):
  
      video_format.add_option('-f', '--format',
              action='store', dest='format', metavar='FORMAT', default='best',
-            help='video format code, specifiy the order of preference using slashes: "-f 22/17/18". "-f mp4" and "-f flv" are also supported')
+            help='video format code, specify the order of preference using slashes: "-f 22/17/18". "-f mp4" and "-f flv" are also supported')
      video_format.add_option('--all-formats',
              action='store_const', dest='format', help='download all available video formats', const='all')
      video_format.add_option('--prefer-free-formats',
@@ -304,6 +306,9 @@ def parseOpts(overrideArguments=None):
      verbosity.add_option('--get-format',
              action='store_true', dest='getformat',
              help='simulate, quiet but print output format', default=False)
+    verbosity.add_option('-j', '--dump-json',
+            action='store_true', dest='dumpjson',
+            help='simulate, quiet but print JSON information', default=False)
      verbosity.add_option('--newline',
              action='store_true', dest='progress_with_newline', help='output progress bar as new lines', default=False)
      verbosity.add_option('--no-progress',
@@ -318,7 +323,7 @@ def parseOpts(overrideArguments=None):
              help='print downloaded pages to debug problems(very verbose)')
      verbosity.add_option('--write-pages',
              action='store_true', dest='write_pages', default=False,
-            help='Write downloaded pages to files in the current directory')
+            help='Write downloaded intermediary pages to files in the current directory to debug problems')
      verbosity.add_option('--youtube-print-sig-code',
              action='store_true', dest='youtube_print_sig_code', default=False,
              help=optparse.SUPPRESS_HELP)
@@ -416,6 +421,8 @@ def parseOpts(overrideArguments=None):
          if opts.verbose:
              write_string(u'[debug] Override config: ' + repr(overrideArguments) + '\n')
      else:
+        systemConf = _readOptions('/etc/youtube-dl.conf')
+
          xdg_config_home = os.environ.get('XDG_CONFIG_HOME')
          if xdg_config_home:
              userConfFile = os.path.join(xdg_config_home, 'youtube-dl', 'config')
@@ -425,8 +432,31 @@ def parseOpts(overrideArguments=None):
              userConfFile = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl', 'config')
              if not os.path.isfile(userConfFile):
                  userConfFile = os.path.join(os.path.expanduser('~'), '.config', 'youtube-dl.conf')
-        systemConf = _readOptions('/etc/youtube-dl.conf')
-        userConf = _readOptions(userConfFile)
+        userConf = _readOptions(userConfFile, None)
+
+        if userConf is None:
+            appdata_dir = os.environ.get('appdata')
+            if appdata_dir:
+                userConf = _readOptions(
+                    os.path.join(appdata_dir, 'youtube-dl', 'config'),
+                    default=None)
+                if userConf is None:
+                    userConf = _readOptions(
+                        os.path.join(appdata_dir, 'youtube-dl', 'config.txt'),
+                        default=None)
+
+        if userConf is None:
+            userConf = _readOptions(
+                os.path.join(os.path.expanduser('~'), 'youtube-dl.conf'),
+                default=None)
+        if userConf is None:
+            userConf = _readOptions(
+                os.path.join(os.path.expanduser('~'), 'youtube-dl.conf.txt'),
+                default=None)
+
+        if userConf is None:
+            userConf = []
+
          commandLineConf = sys.argv[1:]
          argv = systemConf + userConf + commandLineConf
          opts, args = parser.parse_args(argv)
@@ -445,19 +475,6 @@ def _real_main(argv=None):
  
      parser, opts, args = parseOpts(argv)
  
-    # Open appropriate CookieJar
-    if opts.cookiefile is None:
-        jar = compat_cookiejar.CookieJar()
-    else:
-        try:
-            jar = compat_cookiejar.MozillaCookieJar(opts.cookiefile)
-            if os.access(opts.cookiefile, os.R_OK):
-                jar.load()
-        except (IOError, OSError) as err:
-            if opts.verbose:
-                traceback.print_exc()
-            write_string(u'ERROR: unable to open cookie file\n')
-            sys.exit(101)
      # Set user agent
      if opts.user_agent is not None:
          std_headers['User-Agent'] = opts.user_agent
@@ -489,8 +506,6 @@ def _real_main(argv=None):
      all_urls = batchurls + args
      all_urls = [url.strip() for url in all_urls]
  
-    opener = _setup_opener(jar=jar, opts=opts)
-
      extractors = gen_extractors()
  
      if opts.list_extractors:
@@ -545,7 +560,7 @@ def _real_main(argv=None):
      if opts.retries is not None:
          try:
              opts.retries = int(opts.retries)
-        except (TypeError, ValueError) as err:
+        except (TypeError, ValueError):
              parser.error(u'invalid retry count specified')
      if opts.buffersize is not None:
          numeric_buffersize = FileDownloader.parse_bytes(opts.buffersize)
@@ -556,13 +571,13 @@ def _real_main(argv=None):
          opts.playliststart = int(opts.playliststart)
          if opts.playliststart <= 0:
              raise ValueError(u'Playlist start must be positive')
-    except (TypeError, ValueError) as err:
+    except (TypeError, ValueError):
          parser.error(u'invalid playlist start number specified')
      try:
          opts.playlistend = int(opts.playlistend)
          if opts.playlistend != -1 and (opts.playlistend <= 0 or opts.playlistend < opts.playliststart):
              raise ValueError(u'Playlist end must be greater than playlist start')
-    except (TypeError, ValueError) as err:
+    except (TypeError, ValueError):
          parser.error(u'invalid playlist end number specified')
      if opts.extractaudio:
          if opts.audioformat not in ['best', 'aac', 'mp3', 'm4a', 'opus', 'vorbis', 'wav']:
@@ -601,13 +616,12 @@ def _real_main(argv=None):
                       u' file! Use "%%(ext)s" instead of %r' %
                       determine_ext(outtmpl, u''))
  
-    # YoutubeDL
-    ydl = YoutubeDL({
+    ydl_opts = {
          'usenetrc': opts.usenetrc,
          'username': opts.username,
          'password': opts.password,
          'videopassword': opts.videopassword,
-        'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
+        'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.dumpjson),
          'forceurl': opts.geturl,
          'forcetitle': opts.gettitle,
          'forceid': opts.getid,
@@ -615,8 +629,9 @@ def _real_main(argv=None):
          'forcedescription': opts.getdescription,
          'forcefilename': opts.getfilename,
          'forceformat': opts.getformat,
+        'forcejson': opts.dumpjson,
          'simulate': opts.simulate,
-        'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat),
+        'skip_download': (opts.skip_download or opts.simulate or opts.geturl or opts.gettitle or opts.getid or opts.getthumbnail or opts.getdescription or opts.getfilename or opts.getformat or opts.dumpjson),
          'format': opts.format,
          'format_limit': opts.format_limit,
          'listformats': opts.listformats,
@@ -665,102 +680,47 @@ def _real_main(argv=None):
          'youtube_print_sig_code': opts.youtube_print_sig_code,
          'age_limit': opts.age_limit,
          'download_archive': opts.download_archive,
-        })
-
-    if opts.verbose:
-        write_string(u'[debug] youtube-dl version ' + __version__ + u'\n')
-        try:
-            sp = subprocess.Popen(
-                ['git', 'rev-parse', '--short', 'HEAD'],
-                stdout=subprocess.PIPE, stderr=subprocess.PIPE,
-                cwd=os.path.dirname(os.path.abspath(__file__)))
-            out, err = sp.communicate()
-            out = out.decode().strip()
-            if re.match('[0-9a-f]+', out):
-                write_string(u'[debug] Git HEAD: ' + out + u'\n')
-        except:
-            try:
-                sys.exc_clear()
-            except:
-                pass
-        write_string(u'[debug] Python version %s - %s' %(platform.python_version(), platform_name()) + u'\n')
-
-        proxy_map = {}
-        for handler in opener.handlers:
-            if hasattr(handler, 'proxies'):
-                proxy_map.update(handler.proxies)
-        write_string(u'[debug] Proxy map: ' + compat_str(proxy_map) + u'\n')
-
-    ydl.add_default_info_extractors()
-
-    # PostProcessors
-    # Add the metadata pp first, the other pps will copy it
-    if opts.addmetadata:
-        ydl.add_post_processor(FFmpegMetadataPP())
-    if opts.extractaudio:
-        ydl.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, nopostoverwrites=opts.nopostoverwrites))
-    if opts.recodevideo:
-        ydl.add_post_processor(FFmpegVideoConvertor(preferedformat=opts.recodevideo))
-    if opts.embedsubtitles:
-        ydl.add_post_processor(FFmpegEmbedSubtitlePP(subtitlesformat=opts.subtitlesformat))
-
-    # Update version
-    if opts.update_self:
-        update_self(ydl.to_screen, opts.verbose)
-
-    # Maybe do nothing
-    if len(all_urls) < 1:
-        if not opts.update_self:
-            parser.error(u'you must provide at least one URL')
-        else:
-            sys.exit()
+        'cookiefile': opts.cookiefile,
+        'nocheckcertificate': opts.no_check_certificate,
+        'proxy': opts.proxy,
+        'socket_timeout': opts.socket_timeout,
+    }
  
-    try:
-        retcode = ydl.download(all_urls)
-    except MaxDownloadsReached:
-        ydl.to_screen(u'--max-download limit reached, aborting.')
-        retcode = 101
+    with YoutubeDL(ydl_opts) as ydl:
+        ydl.print_debug_header()
+        ydl.add_default_info_extractors()
+
+        # PostProcessors
+        # Add the metadata pp first, the other pps will copy it
+        if opts.addmetadata:
+            ydl.add_post_processor(FFmpegMetadataPP())
+        if opts.extractaudio:
+            ydl.add_post_processor(FFmpegExtractAudioPP(preferredcodec=opts.audioformat, preferredquality=opts.audioquality, nopostoverwrites=opts.nopostoverwrites))
+        if opts.recodevideo:
+            ydl.add_post_processor(FFmpegVideoConvertor(preferedformat=opts.recodevideo))
+        if opts.embedsubtitles:
+            ydl.add_post_processor(FFmpegEmbedSubtitlePP(subtitlesformat=opts.subtitlesformat))
+
+        # Update version
+        if opts.update_self:
+            update_self(ydl.to_screen, opts.verbose)
+
+        # Maybe do nothing
+        if len(all_urls) < 1:
+            if not opts.update_self:
+                parser.error(u'you must provide at least one URL')
+            else:
+                sys.exit()
  
-    # Dump cookie jar if requested
-    if opts.cookiefile is not None:
          try:
-            jar.save()
-        except (IOError, OSError):
-            sys.exit(u'ERROR: unable to save cookie jar')
+            retcode = ydl.download(all_urls)
+        except MaxDownloadsReached:
+            ydl.to_screen(u'--max-download limit reached, aborting.')
+            retcode = 101
  
      sys.exit(retcode)
  
  
-def _setup_opener(jar=None, opts=None, timeout=300):
-    if opts is None:
-        FakeOptions = collections.namedtuple(
-            'FakeOptions', ['proxy', 'no_check_certificate'])
-        opts = FakeOptions(proxy=None, no_check_certificate=False)
-
-    cookie_processor = compat_urllib_request.HTTPCookieProcessor(jar)
-    if opts.proxy is not None:
-        if opts.proxy == '':
-            proxies = {}
-        else:
-            proxies = {'http': opts.proxy, 'https': opts.proxy}
-    else:
-        proxies = compat_urllib_request.getproxies()
-        # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
-        if 'http' in proxies and 'https' not in proxies:
-            proxies['https'] = proxies['http']
-    proxy_handler = compat_urllib_request.ProxyHandler(proxies)
-    https_handler = make_HTTPS_handler(opts)
-    opener = compat_urllib_request.build_opener(
-        https_handler, proxy_handler, cookie_processor, YoutubeDLHandler())
-    # Delete the default user-agent header, which would otherwise apply in
-    # cases where our custom HTTP handler doesn't come into play
-    # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
-    opener.addheaders = []
-    compat_urllib_request.install_opener(opener)
-    socket.setdefaulttimeout(timeout)
-    return opener
-
-
  def main(argv=None):
      try:
          _real_main(argv)
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py

index f9caca4ef8c3658e5c25602c98742dd8c3f94fb6..bd996483b4e0d0469ad5a0f04aec4bc078788a13 100644 (file)
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -1,5 +1,6 @@
  from .appletrailers import AppleTrailersIE
  from .addanime import AddAnimeIE
+from .anitube import AnitubeIE
  from .archiveorg import ArchiveOrgIE
  from .ard import ARDIE
  from .arte import (
@@ -10,7 +11,7 @@ from .arte import (
  )
  from .auengine import AUEngineIE
  from .bambuser import BambuserIE, BambuserChannelIE
-from .bandcamp import BandcampIE
+from .bandcamp import BandcampIE, BandcampAlbumIE
  from .bliptv import BlipTVIE, BlipTVUserIE
  from .bloomberg import BloombergIE
  from .breakcom import BreakIE
@@ -19,12 +20,15 @@ from .c56 import C56IE
  from .canalplus import CanalplusIE
  from .canalc2 import Canalc2IE
  from .cinemassacre import CinemassacreIE
+from .clipfish import ClipfishIE
+from .clipsyndicate import ClipsyndicateIE
  from .cnn import CNNIE
  from .collegehumor import CollegeHumorIE
-from .comedycentral import ComedyCentralIE
+from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE
  from .condenast import CondeNastIE
  from .criterion import CriterionIE
  from .cspan import CSpanIE
+from .d8 import D8IE
  from .dailymotion import (
      DailymotionIE,
      DailymotionPlaylistIE,
@@ -57,6 +61,7 @@ from .francetv import (
  )
  from .freesound import FreesoundIE
  from .funnyordie import FunnyOrDieIE
+from .gamekings import GamekingsIE
  from .gamespot import GameSpotIE
  from .gametrailers import GametrailersIE
  from .generic import GenericIE
@@ -67,6 +72,7 @@ from .hotnewhiphop import HotNewHipHopIE
  from .howcast import HowcastIE
  from .hypem import HypemIE
  from .ign import IGNIE, OneUPIE
+from .imdb import ImdbIE
  from .ina import InaIE
  from .infoq import InfoQIE
  from .instagram import InstagramIE
@@ -79,7 +85,7 @@ from .keezmovies import KeezMoviesIE
  from .kickstarter import KickStarterIE
  from .keek import KeekIE
  from .liveleak import LiveLeakIE
-from .livestream import LivestreamIE
+from .livestream import LivestreamIE, LivestreamOriginalIE
  from .metacafe import MetacafeIE
  from .metacritic import MetacriticIE
  from .mit import TechTVMITIE, MITIE
@@ -95,11 +101,13 @@ from .nba import NBAIE
  from .nbc import NBCNewsIE
  from .newgrounds import NewgroundsIE
  from .nhl import NHLIE, NHLVideocenterIE
+from .niconico import NiconicoIE
  from .nowvideo import NowVideoIE
  from .ooyala import OoyalaIE
  from .orf import ORFIE
  from .pbs import PBSIE
  from .photobucket import PhotobucketIE
+from .podomatic import PodomaticIE
  from .pornhub import PornHubIE
  from .pornotube import PornotubeIE
  from .rbmaradio import RBMARadioIE
@@ -113,27 +121,38 @@ from .rutube import RutubeIE
  from .sina import SinaIE
  from .slashdot import SlashdotIE
  from .slideshare import SlideshareIE
+from .smotri import (
+    SmotriIE,
+    SmotriCommunityIE,
+    SmotriUserIE,
+)
  from .sohu import SohuIE
  from .soundcloud import SoundcloudIE, SoundcloudSetIE, SoundcloudUserIE
-from .southparkstudios import SouthParkStudiosIE
+from .southparkstudios import (
+    SouthParkStudiosIE,
+    SouthparkDeIE,
+)
  from .space import SpaceIE
  from .spankwire import SpankwireIE
  from .spiegel import SpiegelIE
  from .stanfordoc import StanfordOpenClassroomIE
  from .statigram import StatigramIE
  from .steam import SteamIE
+from .streamcloud import StreamcloudIE
  from .sztvhu import SztvHuIE
  from .teamcoco import TeamcocoIE
  from .techtalks import TechTalksIE
  from .ted import TEDIE
  from .tf1 import TF1IE
  from .thisav import ThisAVIE
+from .toutv import TouTvIE
  from .traileraddict import TrailerAddictIE
  from .trilulilu import TriluliluIE
  from .tube8 import Tube8IE
  from .tudou import TudouIE
  from .tumblr import TumblrIE
  from .tutv import TutvIE
+from .tvp import TvpIE
  from .unistra import UnistraIE
  from .ustream import UstreamIE, UstreamChannelIE
  from .vbox7 import Vbox7IE
@@ -145,8 +164,13 @@ from .viddler import ViddlerIE
  from .videodetective import VideoDetectiveIE
  from .videofyme import VideofyMeIE
  from .videopremium import VideoPremiumIE
-from .vimeo import VimeoIE, VimeoChannelIE
+from .vimeo import (
+    VimeoIE,
+    VimeoChannelIE,
+    VimeoUserIE,
+)
  from .vine import VineIE
+from .viki import VikiIE
  from .vk import VKIE
  from .wat import WatIE
  from .websurg import WeBSurgIE
@@ -157,7 +181,11 @@ from .xhamster import XHamsterIE
  from .xnxx import XNXXIE
  from .xvideos import XVideosIE
  from .xtube import XTubeIE
-from .yahoo import YahooIE, YahooSearchIE
+from .yahoo import (
+    YahooIE,
+    YahooNewsIE,
+    YahooSearchIE,
+)
  from .youjizz import YouJizzIE
  from .youku import YoukuIE
  from .youporn import YouPornIE
@@ -174,6 +202,7 @@ from .youtube import (
      YoutubeTruncatedURLIE,
      YoutubeWatchLaterIE,
      YoutubeFavouritesIE,
+    YoutubeHistoryIE,
  )
  from .zdf import ZDFIE
  
diff --git a/youtube_dl/extractor/anitube.py b/youtube_dl/extractor/anitube.py

new file mode 100644 (file)

index 0000000..2b019da
--- /dev/null
+++ b/youtube_dl/extractor/anitube.py
@@ -0,0 +1,53 @@
+import re
+
+from .common import InfoExtractor
+
+
+class AnitubeIE(InfoExtractor):
+    IE_NAME = u'anitube.se'
+    _VALID_URL = r'https?://(?:www\.)?anitube\.se/video/(?P<id>\d+)'
+
+    _TEST = {
+        u'url': u'http://www.anitube.se/video/36621',
+        u'md5': u'59d0eeae28ea0bc8c05e7af429998d43',
+        u'file': u'36621.mp4',
+        u'info_dict': {
+            u'id': u'36621',
+            u'ext': u'mp4',
+            u'title': u'Recorder to Randoseru 01',
+        },
+        u'skip': u'Blocked in the US',
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        webpage = self._download_webpage(url, video_id)
+        key = self._html_search_regex(r'http://www\.anitube\.se/embed/([A-Za-z0-9_-]*)',
+                                      webpage, u'key')
+
+        config_xml = self._download_xml('http://www.anitube.se/nuevo/econfig.php?key=%s' % key,
+                                                key)
+
+        video_title = config_xml.find('title').text
+
+        formats = []
+        video_url = config_xml.find('file')
+        if video_url is not None:
+            formats.append({
+                'format_id': 'sd',
+                'url': video_url.text,
+            })
+        video_url = config_xml.find('filehd')
+        if video_url is not None:
+            formats.append({
+                'format_id': 'hd',
+                'url': video_url.text,
+            })
+
+        return {
+            'id': video_id,
+            'title': video_title,
+            'formats': formats
+        }
diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py

index 6d6237f8af79c02048da0e1b1624f33086a120b6..4befff3942cd5f17fddb48bfb3b4c7f7623af1d6 100644 (file)
--- a/youtube_dl/extractor/appletrailers.py
+++ b/youtube_dl/extractor/appletrailers.py
@@ -113,7 +113,7 @@ class AppleTrailersIE(InfoExtractor):
                  })
              formats = sorted(formats, key=lambda f: (f['height'], f['width']))
  
-            info = {
+            playlist.append({
                  '_type': 'video',
                  'id': video_id,
                  'title': title,
@@ -124,12 +124,7 @@ class AppleTrailersIE(InfoExtractor):
                  'upload_date': upload_date,
                  'uploader_id': uploader_id,
                  'user_agent': 'QuickTime compatible (youtube-dl)',
-            }
-            # TODO: Remove when #980 has been merged
-            info['url'] = formats[-1]['url']
-            info['ext'] = formats[-1]['ext']
-
-            playlist.append(info)
+            })
  
          return {
              '_type': 'playlist',
diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py

index 61ce4469a05dd3cdf9bddbecf8c82119c40b5c3f..3ae0aebb1275f0a4b1bed0c1dda3d969c0672a87 100644 (file)
--- a/youtube_dl/extractor/archiveorg.py
+++ b/youtube_dl/extractor/archiveorg.py
@@ -49,7 +49,7 @@ class ArchiveOrgIE(InfoExtractor):
          for f in formats:
              f['ext'] = determine_ext(f['url'])
  
-        info = {
+        return {
              '_type': 'video',
              'id': video_id,
              'title': title,
@@ -57,12 +57,5 @@ class ArchiveOrgIE(InfoExtractor):
              'description': description,
              'uploader': uploader,
              'upload_date': upload_date,
+            'thumbnail': data.get('misc', {}).get('image'),
          }
-        thumbnail = data.get('misc', {}).get('image')
-        if thumbnail:
-            info['thumbnail'] = thumbnail
-
-        # TODO: Remove when #980 has been merged
-        info.update(formats[-1])
-
-        return info
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py

index b35a679e3b036d2c573a4f1fc85d53bd793f745b..8b62ee774cc021d4b77e97aced8034f72d4d64e4 100644 (file)
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -1,7 +1,6 @@
  # encoding: utf-8
  import re
  import json
-import xml.etree.ElementTree
  
  from .common import InfoExtractor
  from ..utils import (
@@ -69,7 +68,7 @@ class ArteTvIE(InfoExtractor):
              lang = mobj.group('lang')
              return self._extract_liveweb(url, name, lang)
  
-        if re.search(self._LIVE_URL, video_id) is not None:
+        if re.search(self._LIVE_URL, url) is not None:
              raise ExtractorError(u'Arte live streams are not yet supported, sorry')
              # self.extractLiveStream(url)
              # return
@@ -78,8 +77,7 @@ class ArteTvIE(InfoExtractor):
          """Extract from videos.arte.tv"""
          ref_xml_url = url.replace('/videos/', '/do_delegate/videos/')
          ref_xml_url = ref_xml_url.replace('.html', ',view,asPlayerXml.xml')
-        ref_xml = self._download_webpage(ref_xml_url, video_id, note=u'Downloading metadata')
-        ref_xml_doc = xml.etree.ElementTree.fromstring(ref_xml)
+        ref_xml_doc = self._download_xml(ref_xml_url, video_id, note=u'Downloading metadata')
          config_node = find_xpath_attr(ref_xml_doc, './/video', 'lang', lang)
          config_xml_url = config_node.attrib['ref']
          config_xml = self._download_webpage(config_xml_url, video_id, note=u'Downloading configuration')
@@ -109,13 +107,12 @@ class ArteTvIE(InfoExtractor):
          """Extract form http://liveweb.arte.tv/"""
          webpage = self._download_webpage(url, name)
          video_id = self._search_regex(r'eventId=(\d+?)("|&)', webpage, u'event id')
-        config_xml = self._download_webpage('http://download.liveweb.arte.tv/o21/liveweb/events/event-%s.xml' % video_id,
+        config_doc = self._download_xml('http://download.liveweb.arte.tv/o21/liveweb/events/event-%s.xml' % video_id,
                                              video_id, u'Downloading information')
-        config_doc = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8'))
          event_doc = config_doc.find('event')
          url_node = event_doc.find('video').find('urlHd')
          if url_node is None:
-            url_node = video_doc.find('urlSd')
+            url_node = event_doc.find('urlSd')
  
          return {'id': video_id,
                  'title': event_doc.find('name%s' % lang.capitalize()).text,
diff --git a/youtube_dl/extractor/auengine.py b/youtube_dl/extractor/auengine.py

index 0febbff4f6c42afd10f8dbc13ea9df883edae4c6..95c038003b431dc48ac3bb89dcc03f8aa39ea07f 100644 (file)
--- a/youtube_dl/extractor/auengine.py
+++ b/youtube_dl/extractor/auengine.py
@@ -1,10 +1,10 @@
-import os.path
  import re
  
  from .common import InfoExtractor
  from ..utils import (
      compat_urllib_parse,
-    compat_urllib_parse_urlparse,
+    determine_ext,
+    ExtractorError,
  )
  
  class AUEngineIE(InfoExtractor):
@@ -25,22 +25,25 @@ class AUEngineIE(InfoExtractor):
          title = self._html_search_regex(r'<title>(?P<title>.+?)</title>',
                  webpage, u'title')
          title = title.strip()
-        links = re.findall(r'[^A-Za-z0-9]?(?:file|url):\s*["\'](http[^\'"&]*)', webpage)
-        links = [compat_urllib_parse.unquote(l) for l in links]
+        links = re.findall(r'\s(?:file|url):\s*["\']([^\'"]+)["\']', webpage)
+        links = map(compat_urllib_parse.unquote, links)
+
+        thumbnail = None
+        video_url = None
          for link in links:
-            root, pathext = os.path.splitext(compat_urllib_parse_urlparse(link).path)
-            if pathext == '.png':
+            if link.endswith('.png'):
                  thumbnail = link
-            elif pathext == '.mp4':
-                url = link
-                ext = pathext
+            elif '/videos/' in link:
+                video_url = link
+        if not video_url:
+            raise ExtractorError(u'Could not find video URL')
+        ext = u'.' + determine_ext(video_url)
          if ext == title[-len(ext):]:
              title = title[:-len(ext)]
-        ext = ext[1:]
-        return [{
+
+        return {
              'id':        video_id,
-            'url':       url,
-            'ext':       ext,
+            'url':       video_url,
              'title':     title,
              'thumbnail': thumbnail,
-        }]
+        }
diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py

index f3b36f4733021e05fb8c3db5bf3d218cb2e59536..b80508efed09a7ccece8e6980706e7083d3b96e9 100644 (file)
--- a/youtube_dl/extractor/bambuser.py
+++ b/youtube_dl/extractor/bambuser.py
@@ -15,7 +15,8 @@ class BambuserIE(InfoExtractor):
  
      _TEST = {
          u'url': u'http://bambuser.com/v/4050584',
-        u'md5': u'fba8f7693e48fd4e8641b3fd5539a641',
+        # MD5 seems to be flaky, see https://travis-ci.org/rg3/youtube-dl/jobs/14051016#L388
+        #u'md5': u'fba8f7693e48fd4e8641b3fd5539a641',
          u'info_dict': {
              u'id': u'4050584',
              u'ext': u'flv',
@@ -24,6 +25,11 @@ class BambuserIE(InfoExtractor):
              u'uploader': u'pixelversity',
              u'uploader_id': u'344706',
          },
+        u'params': {
+            # It doesn't respect the 'Range' header, it would download the whole video
+            # caused the travis builds to fail: https://travis-ci.org/rg3/youtube-dl/jobs/14493845#L59
+            u'skip_download': True,
+        },
      }
  
      def _real_extract(self, url):
diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py

index 129a20f4497b4cc6fc9f031e8e48dd8eb8980f66..3a32c14c598dd2da14841fe68c1cb59582f30799 100644 (file)
--- a/youtube_dl/extractor/bandcamp.py
+++ b/youtube_dl/extractor/bandcamp.py
@@ -3,13 +3,16 @@ import re
  
  from .common import InfoExtractor
  from ..utils import (
+    compat_str,
+    compat_urlparse,
      ExtractorError,
  )
  
  
  class BandcampIE(InfoExtractor):
+    IE_NAME = u'Bandcamp'
      _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
-    _TEST = {
+    _TESTS = [{
          u'url': u'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
          u'file': u'1812978515.mp3',
          u'md5': u'cdeb30cdae1921719a3cbcab696ef53c',
@@ -17,7 +20,7 @@ class BandcampIE(InfoExtractor):
              u"title": u"youtube-dl test song \"'/\\\u00e4\u21ad"
          },
          u'skip': u'There is a limit of 200 free downloads / month for the test song'
-    }
+    }]
  
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
@@ -26,6 +29,23 @@ class BandcampIE(InfoExtractor):
          # We get the link to the free download page
          m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
          if m_download is None:
+            m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage)
+        if m_trackinfo:
+            json_code = m_trackinfo.group(1)
+            data = json.loads(json_code)
+
+            for d in data:
+                formats = [{
+                    'format_id': 'format_id',
+                    'url': format_url,
+                    'ext': format_id.partition('-')[0]
+                } for format_id, format_url in sorted(d['file'].items())]
+                return {
+                    'id': compat_str(d['id']),
+                    'title': d['title'],
+                    'formats': formats,
+                }
+        else:
              raise ExtractorError(u'No free songs found')
  
          download_link = m_download.group(1)
@@ -61,3 +81,49 @@ class BandcampIE(InfoExtractor):
                        }
  
          return [track_info]
+
+
+class BandcampAlbumIE(InfoExtractor):
+    """Extractor for Bandcamp album pages; returns a playlist of track URLs."""
+    IE_NAME = u'Bandcamp:album'
+    _VALID_URL = r'http://.*?\.bandcamp\.com/album/(?P<title>.*)'
+
+    _TEST = {
+        u'url': u'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
+        u'playlist': [
+            {
+                u'file': u'1353101989.mp3',
+                u'md5': u'39bc1eded3476e927c724321ddf116cf',
+                u'info_dict': {
+                    u'title': u'Intro',
+                }
+            },
+            {
+                u'file': u'38097443.mp3',
+                u'md5': u'1a2c32e2691474643e912cc6cd4bffaa',
+                u'info_dict': {
+                    u'title': u'Kero One - Keep It Alive (Blazo remix)',
+                }
+            },
+        ],
+        u'params': {
+            u'playlistend': 2
+        },
+        u'skip': u'Bancamp imposes download limits. See test_playlists:test_bandcamp_album for the playlist test'
+    }
+
+    def _real_extract(self, url):
+        album_slug = re.match(self._VALID_URL, url).group('title')
+        webpage = self._download_webpage(url, album_slug)
+        track_links = re.findall(r'<a href="(.*?)" itemprop="url">', webpage)
+        if not track_links:
+            raise ExtractorError(u'The page doesn\'t contain any track')
+        # Each href is joined against the album URL and delegated to BandcampIE.
+        entries = [
+            self.url_result(compat_urlparse.urljoin(url, link), ie=BandcampIE.ie_key())
+            for link in track_links]
+        return {
+            '_type': 'playlist',
+            'title': self._search_regex(r'album_title : "(.*?)"', webpage, u'title'),
+            'entries': entries,
+        }
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py

index d8c35465a34fa4c4d4ca822d499892504a51ce62..66fe0ac9ade6fad80d77f0429c136c2d022af16d 100644 (file)
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -75,16 +75,22 @@ class BrightcoveIE(InfoExtractor):
          params = {'flashID': object_doc.attrib['id'],
                    'playerID': find_xpath_attr(object_doc, './param', 'name', 'playerID').attrib['value'],
                    }
-        playerKey = find_xpath_attr(object_doc, './param', 'name', 'playerKey')
+        def find_param(name):
+            node = find_xpath_attr(object_doc, './param', 'name', name)
+            if node is not None:
+                return node.attrib['value']
+            return None
+        playerKey = find_param('playerKey')
          # Not all pages define this value
          if playerKey is not None:
-            params['playerKey'] = playerKey.attrib['value']
-        videoPlayer = find_xpath_attr(object_doc, './param', 'name', '@videoPlayer')
+            params['playerKey'] = playerKey
+        # The three fields hold the id of the video
+        videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID')
          if videoPlayer is not None:
-            params['@videoPlayer'] = videoPlayer.attrib['value']
-        linkBase = find_xpath_attr(object_doc, './param', 'name', 'linkBaseURL')
+            params['@videoPlayer'] = videoPlayer
+        linkBase = find_param('linkBaseURL')
          if linkBase is not None:
-            params['linkBaseURL'] = linkBase.attrib['value']
+            params['linkBaseURL'] = linkBase
          data = compat_urllib_parse.urlencode(params)
          return cls._FEDERATED_URL_TEMPLATE % data
  
diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py

index 1db9b24cf204cc26d68b1a1bdaff93577c3ae903..7cdcd8399a8cabcd17ef7af8d89ba9052e9f8901 100644 (file)
--- a/youtube_dl/extractor/canalplus.py
+++ b/youtube_dl/extractor/canalplus.py
@@ -1,10 +1,10 @@
  # encoding: utf-8
  import re
-import xml.etree.ElementTree
  
  from .common import InfoExtractor
  from ..utils import unified_strdate
  
+
  class CanalplusIE(InfoExtractor):
      _VALID_URL = r'https?://(www\.canalplus\.fr/.*?/(?P<path>.*)|player\.canalplus\.fr/#/(?P<id>\d+))'
      _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s'
@@ -25,16 +25,15 @@ class CanalplusIE(InfoExtractor):
  
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = mobj.groupdict().get('id')
          if video_id is None:
              webpage = self._download_webpage(url, mobj.group('path'))
              video_id = self._search_regex(r'videoId = "(\d+)";', webpage, u'video id')
          info_url = self._VIDEO_INFO_TEMPLATE % video_id
-        info_page = self._download_webpage(info_url,video_id, 
+        doc = self._download_xml(info_url,video_id, 
                                             u'Downloading video info')
  
          self.report_extraction(video_id)
-        doc = xml.etree.ElementTree.fromstring(info_page.encode('utf-8'))
          video_info = [video for video in doc if video.find('ID').text == video_id][0]
          infos = video_info.find('INFOS')
          media = video_info.find('MEDIA')
diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py

new file mode 100644 (file)

index 0000000..43efb08
--- /dev/null
+++ b/youtube_dl/extractor/clipfish.py
@@ -0,0 +1,58 @@
+import re
+import time
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class ClipfishIE(InfoExtractor):
+    """Extractor for clipfish.de videos, using the devxml/videoinfo endpoint."""
+    IE_NAME = u'clipfish'
+
+    _VALID_URL = r'^https?://(?:www\.)?clipfish\.de/.*?/video/(?P<id>[0-9]+)/'
+    _TEST = {
+        u'url': u'http://www.clipfish.de/special/game-trailer/video/3966754/fifa-14-e3-2013-trailer/',
+        u'file': u'3966754.mp4',
+        u'md5': u'2521cd644e862936cf2e698206e47385',
+        u'info_dict': {
+            u'title': u'FIFA 14 - E3 2013 Trailer',
+            u'duration': 82,
+        },
+        u'skip': 'Blocked in the US'
+    }
+
+    def _real_extract(self, url):
+        video_id = re.match(self._VALID_URL, url).group('id')
+
+        info_url = ('http://www.clipfish.de/devxml/videoinfo/%s?ts=%d' %
+                    (video_id, int(time.time())))
+        doc = self._download_xml(
+            info_url, video_id, note=u'Downloading info page')
+        title = doc.find('title').text
+        # findtext() gives None for a missing node and '' for an empty one,
+        # so one falsy check covers both ways the URL can be absent.
+        video_url = doc.findtext('filename')
+        if not video_url:
+            xml_bytes = xml.etree.ElementTree.tostring(doc)
+            raise ExtractorError(u'Cannot find video URL in document %r' %
+                                 xml_bytes)
+        thumbnail = doc.findtext('imageurl') or None
+        # Duration is optional: a missing/empty node must not abort extraction.
+        m = re.match(
+            r'^(?P<hours>[0-9]+):(?P<minutes>[0-9]{2}):(?P<seconds>[0-9]{2}):(?P<ms>[0-9]*)$',
+            doc.findtext('duration') or u'')
+        duration = None
+        if m:
+            duration = (
+                int(m.group('hours')) * 60 * 60 +
+                int(m.group('minutes')) * 60 +
+                int(m.group('seconds')))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': video_url,
+            'thumbnail': thumbnail,
+            'duration': duration,
+        }
diff --git a/youtube_dl/extractor/clipsyndicate.py b/youtube_dl/extractor/clipsyndicate.py

new file mode 100644 (file)

index 0000000..d4fc869
--- /dev/null
+++ b/youtube_dl/extractor/clipsyndicate.py
@@ -0,0 +1,52 @@
+import re
+import xml.etree.ElementTree
+
+from .common import InfoExtractor
+from ..utils import (
+    find_xpath_attr,
+)
+
+
+class ClipsyndicateIE(InfoExtractor):
+    """Extractor for clipsyndicate.com /video/play and /video/playlist URLs."""
+    _VALID_URL = r'http://www\.clipsyndicate\.com/video/play(list/\d+)?/(?P<id>\d+)'
+
+    _TEST = {
+        u'url': u'http://www.clipsyndicate.com/video/play/4629301/brick_briscoe',
+        u'md5': u'4d7d549451bad625e0ff3d7bd56d776c',
+        u'info_dict': {
+            u'id': u'4629301',
+            u'ext': u'mp4',
+            u'title': u'Brick Briscoe',
+            u'duration': 612,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = re.match(self._VALID_URL, url).group('id')
+        js_player = self._download_webpage(
+            'http://eplayer.clipsyndicate.com/embed/player.js?va_id=%s' % video_id,
+            video_id, u'Downloading player')
+        # it includes a required token
+        flvars = self._search_regex(r'flvars: "(.*?)"', js_player, u'flvars')
+
+        playlist_page = self._download_webpage(
+            'http://eplayer.clipsyndicate.com/osmf/playlist?%s' % flvars,
+            video_id, u'Downloading video info')
+        # Fix broken xml: escape every '&' before handing it to the parser
+        playlist_page = re.sub('&', '&amp;', playlist_page)
+        pdoc = xml.etree.ElementTree.fromstring(playlist_page.encode('utf-8'))
+
+        track_doc = pdoc.find('trackList/track')
+        def find_param(name):
+            node = find_xpath_attr(track_doc, './/param', 'name', name)
+            return None if node is None else node.attrib['value']
+
+        duration = find_param('duration')  # may be absent; int(None) would raise
+        return {
+            'id': video_id,
+            'title': find_param('title'),
+            'url': track_doc.find('location').text,
+            'thumbnail': find_param('thumbnail'),
+            'duration': None if duration is None else int(duration),
+        }
diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py

index 34adf6dda519a5ed2657fee3687d9e2e0f52ef73..a034bb2fb6288fc62d964021405aa94eff532ff6 100644 (file)
--- a/youtube_dl/extractor/cnn.py
+++ b/youtube_dl/extractor/cnn.py
@@ -1,5 +1,4 @@
  import re
-import xml.etree.ElementTree
  
  from .common import InfoExtractor
  from ..utils import determine_ext
@@ -33,8 +32,7 @@ class CNNIE(InfoExtractor):
          path = mobj.group('path')
          page_title = mobj.group('title')
          info_url = u'http://cnn.com/video/data/3.0/%s/index.xml' % path
-        info_xml = self._download_webpage(info_url, page_title)
-        info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
+        info = self._download_xml(info_url, page_title)
  
          formats = []
          for f in info.findall('files/file'):
diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py

index 8d4c93d6da91f4470c9809bf32dd0fbbe886c92b..b27c1dfc52401f3c148d48d2b2897d2b06db3834 100644 (file)
--- a/youtube_dl/extractor/collegehumor.py
+++ b/youtube_dl/extractor/collegehumor.py
@@ -1,5 +1,4 @@
  import re
-import xml.etree.ElementTree
  
  from .common import InfoExtractor
  from ..utils import (
@@ -46,11 +45,10 @@ class CollegeHumorIE(InfoExtractor):
  
          self.report_extraction(video_id)
          xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
-        metaXml = self._download_webpage(xmlUrl, video_id,
+        mdoc = self._download_xml(xmlUrl, video_id,
                                           u'Downloading info XML',
                                           u'Unable to download video info XML')
  
-        mdoc = xml.etree.ElementTree.fromstring(metaXml)
          try:
              videoNode = mdoc.findall('./video')[0]
              youtubeIdNode = videoNode.find('./youtubeID')
@@ -65,16 +63,13 @@ class CollegeHumorIE(InfoExtractor):
  
          if next_url.endswith(u'manifest.f4m'):
              manifest_url = next_url + '?hdcore=2.10.3'
-            manifestXml = self._download_webpage(manifest_url, video_id,
+            adoc = self._download_xml(manifest_url, video_id,
                                           u'Downloading XML manifest',
                                           u'Unable to download video info XML')
  
-            adoc = xml.etree.ElementTree.fromstring(manifestXml)
              try:
-                media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
-                node_id = media_node.attrib['url']
                  video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
-            except IndexError as err:
+            except IndexError:
                  raise ExtractorError(u'Invalid manifest file')
              url_pr = compat_urllib_parse_urlparse(info['thumbnail'])
              info['url'] = url_pr.scheme + '://' + url_pr.netloc + video_id[:-2].replace('.csmil','').replace(',','')
diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py

index 69b2beecebac319ef92e8043ab75ad71fad46a25..53579aa2703e78150c14dfe5dfd35e6240310952 100644 (file)
--- a/youtube_dl/extractor/comedycentral.py
+++ b/youtube_dl/extractor/comedycentral.py
@@ -1,7 +1,7 @@
  import re
-import xml.etree.ElementTree
  
  from .common import InfoExtractor
+from .mtv import MTVServicesInfoExtractor
  from ..utils import (
      compat_str,
      compat_urllib_parse,
@@ -11,7 +11,31 @@ from ..utils import (
  )
  
  
-class ComedyCentralIE(InfoExtractor):
+class ComedyCentralIE(MTVServicesInfoExtractor):
+    """Extractor for comedycentral.com video-clips, episodes and cc-studios pages."""
+    _VALID_URL = r'http://www\.comedycentral\.com/(video-clips|episodes|cc-studios)/(?P<title>.*)'
+    _FEED_URL = u'http://comedycentral.com/feeds/mrss/'
+
+    _TEST = {
+        u'url': u'http://www.comedycentral.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother',
+        u'md5': u'4167875aae411f903b751a21f357f1ee',
+        u'info_dict': {
+            u'id': u'cef0cbb3-e776-4bc9-b62e-8016deccb354',
+            u'ext': u'mp4',
+            u'title': u'Uncensored - Greg Fitzsimmons - Too Good of a Mother',
+            u'description': u'After a certain point, breastfeeding becomes c**kblocking.',
+        },
+    }
+
+    def _real_extract(self, url):
+        title = re.match(self._VALID_URL, url).group('title')
+        webpage = self._download_webpage(url, title)
+        mgid = self._search_regex(r'data-mgid="(?P<mgid>mgid:.*?)"',
+                                  webpage, u'mgid')
+        return self._get_videos_info(mgid)
+
+
+class ComedyCentralShowsIE(InfoExtractor):
      IE_DESC = u'The Daily Show / Colbert Report'
      # urls can be abbreviations like :thedailyshow or :colbert
      # urls for episodes like:
@@ -127,13 +151,12 @@ class ComedyCentralIE(InfoExtractor):
  
          uri = mMovieParams[0][1]
          indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
-        indexXml = self._download_webpage(indexUrl, epTitle,
+        idoc = self._download_xml(indexUrl, epTitle,
                                            u'Downloading show index',
                                            u'unable to download episode index')
  
          results = []
  
-        idoc = xml.etree.ElementTree.fromstring(indexXml)
          itemEls = idoc.findall('.//item')
          for partNum,itemEl in enumerate(itemEls):
              mediaId = itemEl.findall('./guid')[0].text
@@ -144,10 +167,9 @@ class ComedyCentralIE(InfoExtractor):
  
              configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                          compat_urllib_parse.urlencode({'uri': mediaId}))
-            configXml = self._download_webpage(configUrl, epTitle,
+            cdoc = self._download_xml(configUrl, epTitle,
                                                 u'Downloading configuration for %s' % shortMediaId)
  
-            cdoc = xml.etree.ElementTree.fromstring(configXml)
              turls = []
              for rendition in cdoc.findall('.//rendition'):
                  finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
@@ -169,7 +191,7 @@ class ComedyCentralIE(InfoExtractor):
                  })
  
              effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
-            info = {
+            results.append({
                  'id': shortMediaId,
                  'formats': formats,
                  'uploader': showId,
@@ -177,11 +199,6 @@ class ComedyCentralIE(InfoExtractor):
                  'title': effTitle,
                  'thumbnail': None,
                  'description': compat_str(officialTitle),
-            }
-
-            # TODO: Remove when #980 has been merged
-            info.update(info['formats'][-1])
-
-            results.append(info)
+            })
  
          return results
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index fb2d50a098992f8088c41259b73d653c6f6f173d..1b049082de5bbc9541a6513acc4124a478b87ea0 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -4,11 +4,11 @@ import re
  import socket
  import sys
  import netrc
+import xml.etree.ElementTree
  
  from ..utils import (
      compat_http_client,
      compat_urllib_error,
-    compat_urllib_request,
      compat_str,
  
      clean_html,
@@ -19,6 +19,7 @@ from ..utils import (
      unescapeHTML,
  )
  
+
  class InfoExtractor(object):
      """Information Extractor class.
  
@@ -71,6 +72,11 @@ class InfoExtractor(object):
                                  ("3D" or "DASH video")
                      * width     Width of the video, if known
                      * height    Height of the video, if known
+                    * abr       Average audio bitrate in KBit/s
+                    * acodec    Name of the audio codec in use
+                    * vbr       Average video bitrate in KBit/s
+                    * vcodec    Name of the video codec in use
+                    * filesize  The number of bytes, if known in advance
      webpage_url:    The url to the video webpage, if given to youtube-dl it
                      should allow to get the same result again. (It will be set
                      by YoutubeDL if it's missing)
@@ -152,7 +158,7 @@ class InfoExtractor(object):
          elif note is not False:
              self.to_screen(u'%s: %s' % (video_id, note))
          try:
-            return compat_urllib_request.urlopen(url_or_request)
+            return self._downloader.urlopen(url_or_request)
          except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
              if errnote is None:
                  errnote = u'Unable to download webpage'
@@ -204,6 +210,12 @@ class InfoExtractor(object):
          """ Returns the data of the page as a string """
          return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
  
+    def _download_xml(self, url_or_request, video_id,
+                      note=u'Downloading XML', errnote=u'Unable to download XML'):
+        """Download the page and return it parsed as an xml.etree.ElementTree.Element"""
+        page = self._download_webpage(url_or_request, video_id, note, errnote)
+        return xml.etree.ElementTree.fromstring(page.encode('utf-8'))
+
      def to_screen(self, msg):
          """Print msg to screen, prefixing it with '[ie_name]'"""
          self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
@@ -225,12 +237,14 @@ class InfoExtractor(object):
          self.to_screen(u'Logging in')
  
      #Methods for following #608
-    def url_result(self, url, ie=None):
+    def url_result(self, url, ie=None, video_id=None):
          """Returns a url that points to a page that should be processed"""
          #TODO: ie should be the class used for getting the info
          video_info = {'_type': 'url',
                        'url': url,
                        'ie_key': ie}
+        if video_id is not None:
+            video_info['id'] = video_id
          return video_info
      def playlist_result(self, entries, playlist_id=None, playlist_title=None):
          """Returns a playlist"""
@@ -315,16 +329,22 @@ class InfoExtractor(object):
  
      # Helper functions for extracting OpenGraph info
      @staticmethod
-    def _og_regex(prop):
-        return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop)
+    def _og_regexes(prop):
+        content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
+        property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
+        template = r'<meta[^>]+?%s[^>]+?%s'
+        return [
+            template % (property_re, content_re),
+            template % (content_re, property_re),
+        ]
  
      def _og_search_property(self, prop, html, name=None, **kargs):
          if name is None:
              name = 'OpenGraph %s' % prop
-        escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
-        if not escaped is None:
-            return unescapeHTML(escaped)
-        return None
+        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
+        if escaped is None:
+            return None
+        return unescapeHTML(escaped)
  
      def _og_search_thumbnail(self, html, **kargs):
          return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
@@ -336,10 +356,22 @@ class InfoExtractor(object):
          return self._og_search_property('title', html, **kargs)
  
      def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
-        regexes = [self._og_regex('video')]
-        if secure: regexes.insert(0, self._og_regex('video:secure_url'))
+        regexes = self._og_regexes('video')
+        if secure: regexes = self._og_regexes('video:secure_url') + regexes
          return self._html_search_regex(regexes, html, name, **kargs)
  
+    def _html_search_meta(self, name, html, display_name=None):
+        """Non-fatal search for the content of a <meta> tag keyed by itemprop/name/property."""
+        label = name if display_name is None else display_name
+        return self._html_search_regex(
+            r'''(?ix)<meta
+                    (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
+                    [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
+            html, label, fatal=False)
+
+    def _dc_search_uploader(self, html):  # uploader from the "dc.creator" meta tag
+        return self._html_search_meta('dc.creator', html, display_name='uploader')
+
      def _rta_search(self, html):
          # See http://www.rtalabel.org/index.php?content=howtofaq#single
          if re.search(r'(?ix)<meta\s+name="rating"\s+'
@@ -348,6 +380,23 @@ class InfoExtractor(object):
              return 18
          return 0
  
+    def _media_rating_search(self, html):
+        """Translate the page's "rating" meta value into a number, or None if unknown."""
+        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
+        RATING_TABLE = {
+            'safe for kids': 0,
+            'general': 8,
+            '14 years': 14,
+            'mature': 17,
+            'restricted': 19,
+        }
+
+        rating = self._html_search_meta('rating', html)
+        if rating:
+            return RATING_TABLE.get(rating.lower())
+        return None
+
+
  
  class SearchInfoExtractor(InfoExtractor):
      """
diff --git a/youtube_dl/extractor/d8.py b/youtube_dl/extractor/d8.py

new file mode 100644 (file)

index 0000000..a56842b
--- /dev/null
+++ b/youtube_dl/extractor/d8.py
@@ -0,0 +1,22 @@
+# encoding: utf-8
+from .canalplus import CanalplusIE
+
+
+class D8IE(CanalplusIE):  # CanalplusIE with d8.tv URL pattern and info template
+    _VALID_URL = r'https?://www\.d8\.tv/.*?/(?P<path>.*)'
+    _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/d8/%s'
+    IE_NAME = u'd8.tv'
+
+    _TEST = {
+        u'url': u'http://www.d8.tv/d8-docs-mags/pid6589-d8-campagne-intime.html',
+        u'file': u'966289.flv',
+        u'info_dict': {
+            u'title': u'Campagne intime - Documentaire exceptionnel',
+            u'description': u'md5:d2643b799fb190846ae09c61e59a859f',
+            u'upload_date': u'20131108',
+        },
+        u'params': {
+            # rtmp stream: the test only checks metadata and skips the download
+            u'skip_download': True,
+        },
+    }
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py

index e87690f9d288103ea222e1c216786b42e89364de..71f5e03eea393b7733bf3bfeb4f2eeea5b21eb85 100644 (file)
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -186,7 +186,7 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
              webpage = self._download_webpage(request,
                                               id, u'Downloading page %s' % pagenum)
  
-            playlist_el = get_element_by_attribute(u'class', u'video_list', webpage)
+            playlist_el = get_element_by_attribute(u'class', u'row video_list', webpage)
              video_ids.extend(re.findall(r'data-id="(.+?)"', playlist_el))
  
              if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:
diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py

index a804e83bdc637fa79fa85c6e65f1eecaafe9e3a9..d418ce4a8a29c122e811c96aac76d388c790b560 100644 (file)
--- a/youtube_dl/extractor/daum.py
+++ b/youtube_dl/extractor/daum.py
@@ -1,6 +1,5 @@
  # encoding: utf-8
  import re
-import xml.etree.ElementTree
  
  from .common import InfoExtractor
  from ..utils import (
@@ -29,17 +28,16 @@ class DaumIE(InfoExtractor):
          video_id = mobj.group(1)
          canonical_url = 'http://tvpot.daum.net/v/%s' % video_id
          webpage = self._download_webpage(canonical_url, video_id)
-        full_id = self._search_regex(r'<link rel="video_src" href=".+?vid=(.+?)"',
+        full_id = self._search_regex(
+            r'<iframe src="http://videofarm.daum.net/controller/video/viewer/Video.html\?.*?vid=(.+?)[&"]',
              webpage, u'full id')
          query = compat_urllib_parse.urlencode({'vid': full_id})
-        info_xml = self._download_webpage(
+        info = self._download_xml(
              'http://tvpot.daum.net/clip/ClipInfoXml.do?' + query, video_id,
              u'Downloading video info')
-        urls_xml = self._download_webpage(
+        urls = self._download_xml(
              'http://videofarm.daum.net/controller/api/open/v1_2/MovieData.apixml?' + query,
              video_id, u'Downloading video formats info')
-        info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
-        urls = xml.etree.ElementTree.fromstring(urls_xml.encode('utf-8'))
  
          self.to_screen(u'%s: Getting video urls' % video_id)
          formats = []
@@ -49,10 +47,9 @@ class DaumIE(InfoExtractor):
                  'vid': full_id,
                  'profile': profile,
              })
-            url_xml = self._download_webpage(
+            url_doc = self._download_xml(
                  'http://videofarm.daum.net/controller/api/open/v1_2/MovieLocation.apixml?' + format_query,
                  video_id, note=False)
-            url_doc = xml.etree.ElementTree.fromstring(url_xml.encode('utf-8'))
              format_url = url_doc.find('result/url').text
              formats.append({
                  'url': format_url,
@@ -60,7 +57,7 @@ class DaumIE(InfoExtractor):
                  'format_id': profile,
              })
  
-        info = {
+        return {
              'id': video_id,
              'title': info.find('TITLE').text,
              'formats': formats,
@@ -69,6 +66,3 @@ class DaumIE(InfoExtractor):
              'duration': int(info.find('DURATION').text),
              'upload_date': info.find('REGDTTM').text[:8],
          }
-        # TODO: Remove when #980 has been merged
-        info.update(formats[-1])
-        return info
diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py

index 765cb1f377df132ee91deac5872877777187cd6a..24ce794255211112eafadaf2b5a629716b90aa5e 100644 (file)
--- a/youtube_dl/extractor/dreisat.py
+++ b/youtube_dl/extractor/dreisat.py
@@ -1,7 +1,6 @@
  # coding: utf-8
  
  import re
-import xml.etree.ElementTree
  
  from .common import InfoExtractor
  from ..utils import (
@@ -30,8 +29,7 @@ class DreiSatIE(InfoExtractor):
          mobj = re.match(self._VALID_URL, url)
          video_id = mobj.group('id')
          details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id
-        details_xml = self._download_webpage(details_url, video_id, note=u'Downloading video details')
-        details_doc = xml.etree.ElementTree.fromstring(details_xml.encode('utf-8'))
+        details_doc = self._download_xml(details_url, video_id, note=u'Downloading video details')
  
          thumbnail_els = details_doc.findall('.//teaserimage')
          thumbnails = [{
@@ -67,7 +65,7 @@ class DreiSatIE(InfoExtractor):
              return (qidx, prefer_http, format['video_bitrate'])
          formats.sort(key=_sortkey)
  
-        info = {
+        return {
              '_type': 'video',
              'id': video_id,
              'title': video_title,
@@ -78,8 +76,3 @@ class DreiSatIE(InfoExtractor):
              'uploader': video_uploader,
              'upload_date': upload_date,
          }
-
-        # TODO: Remove when #980 has been merged
-        info.update(formats[-1])
-
-        return info
diff --git a/youtube_dl/extractor/ebaumsworld.py b/youtube_dl/extractor/ebaumsworld.py

index f02c6998b7c8e316b698f111532d87634323721f..877113d63a7261a284a628a06908c466d446613d 100644 (file)
--- a/youtube_dl/extractor/ebaumsworld.py
+++ b/youtube_dl/extractor/ebaumsworld.py
@@ -1,5 +1,4 @@
  import re
-import xml.etree.ElementTree
  
  from .common import InfoExtractor
  from ..utils import determine_ext
@@ -21,9 +20,8 @@ class EbaumsWorldIE(InfoExtractor):
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
          video_id = mobj.group('id')
-        config_xml = self._download_webpage(
+        config = self._download_xml(
              'http://www.ebaumsworld.com/video/player/%s' % video_id, video_id)
-        config = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8'))
          video_url = config.find('file').text
  
          return {
diff --git a/youtube_dl/extractor/eighttracks.py b/youtube_dl/extractor/eighttracks.py

index 2cfbcd363c0db4f2505d8da7120d7c3161a7b0a9..f21ef88530d2f8913b4b35d9c03fc4fc14de7ddc 100644 (file)
--- a/youtube_dl/extractor/eighttracks.py
+++ b/youtube_dl/extractor/eighttracks.py
@@ -1,4 +1,3 @@
-import itertools
  import json
  import random
  import re
diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py

index 3aa2da52c0117bc9926df9c250eeb70da6cc2299..b1242f6bc457a41a9c8413eb851671acd05cc8c0 100644 (file)
--- a/youtube_dl/extractor/escapist.py
+++ b/youtube_dl/extractor/escapist.py
@@ -11,11 +11,11 @@ from ..utils import (
  
  
  class EscapistIE(InfoExtractor):
-    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
+    _VALID_URL = r'^https?://?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
      _TEST = {
          u'url': u'http://www.escapistmagazine.com/videos/view/the-escapist-presents/6618-Breaking-Down-Baldurs-Gate',
          u'file': u'6618-Breaking-Down-Baldurs-Gate.mp4',
-        u'md5': u'c6793dbda81388f4264c1ba18684a74d',
+        u'md5': u'ab3a706c681efca53f0a35f1415cf0d1',
          u'info_dict': {
              u"description": u"Baldur's Gate: Original, Modded or Enhanced Edition? I'll break down what you can expect from the new Baldur's Gate: Enhanced Edition.", 
              u"uploader": u"the-escapist-presents", 
@@ -25,50 +25,60 @@ class EscapistIE(InfoExtractor):
  
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
          showName = mobj.group('showname')
          videoId = mobj.group('episode')
  
          self.report_extraction(videoId)
          webpage = self._download_webpage(url, videoId)
  
-        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
+        videoDesc = self._html_search_regex(
+            r'<meta name="description" content="([^"]*)"',
              webpage, u'description', fatal=False)
  
-        playerUrl = self._og_search_video_url(webpage, name='player url')
+        playerUrl = self._og_search_video_url(webpage, name=u'player URL')
  
-        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
-            webpage, u'player url').split(' : ')[-1]
+        title = self._html_search_regex(
+            r'<meta name="title" content="([^"]*)"',
+            webpage, u'title').split(' : ')[-1]
  
-        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
+        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config URL')
          configUrl = compat_urllib_parse.unquote(configUrl)
  
-        configJSON = self._download_webpage(configUrl, videoId,
-                                            u'Downloading configuration',
-                                            u'unable to download configuration')
-
-        # Technically, it's JavaScript, not JSON
-        configJSON = configJSON.replace("'", '"')
-
+        formats = []
+
+        def _add_format(name, cfgurl):
+            configJSON = self._download_webpage(
+                cfgurl, videoId,
+                u'Downloading ' + name + ' configuration',
+                u'Unable to download ' + name + ' configuration')
+
+            # Technically, it's JavaScript, not JSON
+            configJSON = configJSON.replace("'", '"')
+
+            try:
+                config = json.loads(configJSON)
+            except (ValueError,) as err:
+                raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
+            playlist = config['playlist']
+            formats.append({
+                'url': playlist[1]['url'],
+                'format_id': name,
+            })
+
+        _add_format(u'normal', configUrl)
+        hq_url = (configUrl +
+                  ('&hq=1' if '?' in configUrl else configUrl + '?hq=1'))
          try:
-            config = json.loads(configJSON)
-        except (ValueError,) as err:
-            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
+            _add_format(u'hq', hq_url)
+        except ExtractorError:
+            pass  # That's fine, we'll just use normal quality
  
-        playlist = config['playlist']
-        videoUrl = playlist[1]['url']
-
-        info = {
+        return {
              'id': videoId,
-            'url': videoUrl,
+            'formats': formats,
              'uploader': showName,
-            'upload_date': None,
              'title': title,
-            'ext': 'mp4',
              'thumbnail': self._og_search_thumbnail(webpage),
              'description': videoDesc,
              'player_url': playerUrl,
          }
-
-        return [info]
diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py

index f8bdfc2d33c9f00b9f902a4303eb7024f4646312..3b210710e3695ec3aa940b335d9868a281d7740a 100644 (file)
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@@ -1,5 +1,4 @@
  import json
-import netrc
  import re
  import socket
  
diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py

index 89ed08db4cbb99f9381013813fa03a19474c8e24..d0dfde694b4d93f7249f2dd3a326ecb0bdca98dd 100644 (file)
--- a/youtube_dl/extractor/faz.py
+++ b/youtube_dl/extractor/faz.py
@@ -1,6 +1,5 @@
  # encoding: utf-8
  import re
-import xml.etree.ElementTree
  
  from .common import InfoExtractor
  from ..utils import (
@@ -28,9 +27,8 @@ class FazIE(InfoExtractor):
          webpage = self._download_webpage(url, video_id)
          config_xml_url = self._search_regex(r'writeFLV\(\'(.+?)\',', webpage,
              u'config xml url')
-        config_xml = self._download_webpage(config_xml_url, video_id,
+        config = self._download_xml(config_xml_url, video_id,
              u'Downloading config xml')
-        config = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8'))
  
          encodings = config.find('ENCODINGS')
          formats = []
@@ -46,13 +44,10 @@ class FazIE(InfoExtractor):
              })
  
          descr = self._html_search_regex(r'<p class="Content Copy">(.*?)</p>', webpage, u'description')
-        info = {
+        return {
              'id': video_id,
              'title': self._og_search_title(webpage),
              'formats': formats,
              'description': descr,
              'thumbnail': config.find('STILL/STILL_BIG').text,
          }
-        # TODO: Remove when #980 has been merged
-        info.update(formats[-1])
-        return info
diff --git a/youtube_dl/extractor/fktv.py b/youtube_dl/extractor/fktv.py

index 9c89362efafefbb22c4dd5e4ef73950446fe9246..dba1a8dc262979b5afce987211bab2f14e502dba 100644 (file)
--- a/youtube_dl/extractor/fktv.py
+++ b/youtube_dl/extractor/fktv.py
@@ -39,7 +39,6 @@ class FKTVIE(InfoExtractor):
          for i, _ in enumerate(files, 1):
              video_id = '%04d%d' % (episode, i)
              video_url = 'http://dl%d.fernsehkritik.tv/fernsehkritik%d%s.flv' % (server, episode, '' if i == 1 else '-%d' % i)
-            video_title = 'Fernsehkritik %d.%d' % (episode, i)
              videos.append({
                  'id': video_id,
                  'url': video_url,
diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py

index 086cafca027e3b99967f50f3220d1632f1cc8033..6e1971043b3853b9fe54e682473a61621c9989e2 100644 (file)
--- a/youtube_dl/extractor/francetv.py
+++ b/youtube_dl/extractor/francetv.py
@@ -1,6 +1,5 @@
  # encoding: utf-8
  import re
-import xml.etree.ElementTree
  import json
  
  from .common import InfoExtractor
@@ -11,11 +10,10 @@ from ..utils import (
  
  class FranceTVBaseInfoExtractor(InfoExtractor):
      def _extract_video(self, video_id):
-        xml_desc = self._download_webpage(
+        info = self._download_xml(
              'http://www.francetvinfo.fr/appftv/webservices/video/'
              'getInfosOeuvre.php?id-diffusion='
              + video_id, video_id, 'Downloading XML config')
-        info = xml.etree.ElementTree.fromstring(xml_desc.encode('utf-8'))
  
          manifest_url = info.find('videos/video/url').text
          video_url = manifest_url.replace('manifest.f4m', 'index_2_av.m3u8')
diff --git a/youtube_dl/extractor/gamekings.py b/youtube_dl/extractor/gamekings.py

new file mode 100644 (file)

index 0000000..c91669b
--- /dev/null
+++ b/youtube_dl/extractor/gamekings.py
@@ -0,0 +1,38 @@
+import re
+
+from .common import InfoExtractor
+
+
+class GamekingsIE(InfoExtractor):
+    _VALID_URL = r'http?://www\.gamekings\.tv/videos/(?P<name>[0-9a-z\-]+)'
+    _TEST = {
+        u"url": u"http://www.gamekings.tv/videos/phoenix-wright-ace-attorney-dual-destinies-review/",
+        u'file': u'20130811.mp4',
+        # MD5 is flaky, seems to change regularly
+        #u'md5': u'2f32b1f7b80fdc5cb616efb4f387f8a3',
+        u'info_dict': {
+            u"title": u"Phoenix Wright: Ace Attorney \u2013 Dual Destinies Review",
+            u"description": u"Melle en Steven hebben voor de review een week in de rechtbank doorbracht met Phoenix Wright: Ace Attorney - Dual Destinies.",
+        }
+    }
+
+    def _real_extract(self, url):
+
+        mobj = re.match(self._VALID_URL, url)
+        name = mobj.group('name')
+        webpage = self._download_webpage(url, name)
+        video_url = self._og_search_video_url(webpage)
+
+        video = re.search(r'[0-9]+', video_url)
+        video_id = video.group(0)
+
+        # Todo: add medium format
+        video_url = video_url.replace(video_id, 'large/' + video_id)
+
+        return {
+            'id': video_id,
+            'ext': 'mp4',
+            'url': video_url,
+            'title': self._og_search_title(webpage),
+            'description': self._og_search_description(webpage),
+        }
diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py

index 098768361ede01d8acc01dc773a31b5b8fc67241..26b7d2ae531f785bc3177af4029652c531d840da 100644 (file)
--- a/youtube_dl/extractor/gamespot.py
+++ b/youtube_dl/extractor/gamespot.py
@@ -24,7 +24,7 @@ class GameSpotIE(InfoExtractor):
  
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
-        page_id = video_id = mobj.group('page_id')
+        page_id = mobj.group('page_id')
          webpage = self._download_webpage(url, page_id)
          data_video_json = self._search_regex(r'data-video=\'(.*?)\'', webpage, u'data video')
          data_video = json.loads(unescapeHTML(data_video_json))
@@ -47,13 +47,10 @@ class GameSpotIE(InfoExtractor):
                  'format_id': q,
              })
  
-        info = {
+        return {
              'id': data_video['guid'],
              'title': compat_urllib_parse.unquote(data_video['title']),
              'formats': formats,
              'description': get_meta_content('description', webpage),
              'thumbnail': self._og_search_thumbnail(webpage),
          }
-        # TODO: Remove when #980 has been merged
-        info.update(formats[-1])
-        return info
diff --git a/youtube_dl/extractor/gametrailers.py b/youtube_dl/extractor/gametrailers.py

index 3cc02d97e04aace34e0eb03cccab254f4927f77d..3a8bef250fa8eddd89af54291228ec3909c1453c 100644 (file)
--- a/youtube_dl/extractor/gametrailers.py
+++ b/youtube_dl/extractor/gametrailers.py
@@ -1,13 +1,11 @@
  import re
  
-from .mtv import MTVIE, _media_xml_tag
+from .mtv import MTVServicesInfoExtractor
  
-class GametrailersIE(MTVIE):
-    """
-    Gametrailers use the same videos system as MTVIE, it just changes the feed
-    url, where the uri is and the method to get the thumbnails.
-    """
+
+class GametrailersIE(MTVServicesInfoExtractor):
      _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
+
      _TEST = {
          u'url': u'http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer',
          u'file': u'70e9a5d7-cf25-4a10-9104-6f3e7342ae0d.mp4',
@@ -17,15 +15,9 @@ class GametrailersIE(MTVIE):
              u'description': u'Faith is back!  Check out the World Premiere trailer for Mirror\'s Edge 2 straight from the EA Press Conference at E3 2013!',
          },
      }
-    # Overwrite MTVIE properties we don't want
-    _TESTS = []
  
      _FEED_URL = 'http://www.gametrailers.com/feeds/mrss'
  
-    def _get_thumbnail_url(self, uri, itemdoc):
-        search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail'))
-        return itemdoc.find(search_path).attrib['url']
-
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
          video_id = mobj.group('id')
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py

index c7552fddb587a60454bec6faa174c36bd4aa9a4a..10ae06263ef1349ff6526575feb61cb71591f3cc 100644 (file)
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -162,6 +162,16 @@ class GenericIE(InfoExtractor):
              raise ExtractorError(u'Failed to download URL: %s' % url)
  
          self.report_extraction(video_id)
+
+        # it's tempting to parse this further, but you would
+        # have to take into account all the variations like
+        #   Video Title - Site Name
+        #   Site Name | Video Title
+        #   Video Title - Tagline | Site Name
+        # and so on and so forth; it's just not practical
+        video_title = self._html_search_regex(r'<title>(.*)</title>',
+            webpage, u'video title', default=u'video', flags=re.DOTALL)
+
          # Look for BrightCove:
          bc_url = BrightcoveIE._extract_brightcove_url(webpage)
          if bc_url is not None:
@@ -177,17 +187,29 @@ class GenericIE(InfoExtractor):
              return self.url_result(surl, 'Vimeo')
  
          # Look for embedded YouTube player
-        mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?youtube.com/embed/.+?)\1', webpage)
-        if mobj:
-            surl = unescapeHTML(mobj.group(u'url'))
-            return self.url_result(surl, 'Youtube')
+        matches = re.findall(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?youtube.com/embed/.+?)\1', webpage)
+        if matches:
+            urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Youtube')
+                     for tuppl in matches]
+            return self.playlist_result(
+                urlrs, playlist_id=video_id, playlist_title=video_title)
+
+        # Look for embedded Dailymotion player
+        matches = re.findall(
+            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion.com/embed/video/.+?)\1', webpage)
+        if matches:
+            urlrs = [self.url_result(unescapeHTML(tuppl[1]), 'Dailymotion')
+                     for tuppl in matches]
+            return self.playlist_result(
+                urlrs, playlist_id=video_id, playlist_title=video_title)
  
          # Look for Bandcamp pages with custom domain
          mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
          if mobj is not None:
              burl = unescapeHTML(mobj.group(1))
-            return self.url_result(burl, 'Bandcamp')
+            # Don't set the extractor because it can be a track url or an album
+            return self.url_result(burl)
  
          # Start with something easy: JW Player in SWFObject
          mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
@@ -196,7 +218,7 @@ class GenericIE(InfoExtractor):
              mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
          if mobj is None:
              # Broaden the search a little bit: JWPlayer JS loader
-            mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http[^\'"&]*)', webpage)
+            mobj = re.search(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http[^\'"]*)', webpage)
          if mobj is None:
              # Try to find twitter cards info
              mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
@@ -223,27 +245,16 @@ class GenericIE(InfoExtractor):
          video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
  
          # here's a fun little line of code for you:
-        video_extension = os.path.splitext(video_id)[1][1:]
          video_id = os.path.splitext(video_id)[0]
  
-        # it's tempting to parse this further, but you would
-        # have to take into account all the variations like
-        #   Video Title - Site Name
-        #   Site Name | Video Title
-        #   Video Title - Tagline | Site Name
-        # and so on and so forth; it's just not practical
-        video_title = self._html_search_regex(r'<title>(.*)</title>',
-            webpage, u'video title', default=u'video', flags=re.DOTALL)
-
          # video uploader is domain name
          video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
              url, u'video uploader')
  
-        return [{
+        return {
              'id':       video_id,
              'url':      video_url,
              'uploader': video_uploader,
              'upload_date':  None,
              'title':    video_title,
-            'ext':      video_extension,
-        }]
+        }
diff --git a/youtube_dl/extractor/howcast.py b/youtube_dl/extractor/howcast.py

index 46954337f25e1cbd7bae89e7da76d4e93ecc8c9e..bafc5826f680353af40b820609a543192ac73d17 100644 (file)
--- a/youtube_dl/extractor/howcast.py
+++ b/youtube_dl/extractor/howcast.py
@@ -8,7 +8,7 @@ class HowcastIE(InfoExtractor):
      _TEST = {
          u'url': u'http://www.howcast.com/videos/390161-How-to-Tie-a-Square-Knot-Properly',
          u'file': u'390161.mp4',
-        u'md5': u'1d7ba54e2c9d7dc6935ef39e00529138',
+        u'md5': u'8b743df908c42f60cf6496586c7f12c3',
          u'info_dict': {
              u"description": u"The square knot, also known as the reef knot, is one of the oldest, most basic knots to tie, and can be used in many different ways. Here's the proper way to tie a square knot.", 
              u"title": u"How to Tie a Square Knot Properly"
diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py

new file mode 100644 (file)

index 0000000..d8e9712
--- /dev/null
+++ b/youtube_dl/extractor/imdb.py
@@ -0,0 +1,59 @@
+import re
+import json
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urlparse,
+    get_element_by_attribute,
+)
+
+
+class ImdbIE(InfoExtractor):
+    IE_NAME = u'imdb'
+    IE_DESC = u'Internet Movie Database trailers'
+    _VALID_URL = r'http://www\.imdb\.com/video/imdb/vi(?P<id>\d+)'
+
+    _TEST = {
+        u'url': u'http://www.imdb.com/video/imdb/vi2524815897',
+        u'md5': u'9f34fa777ade3a6e57a054fdbcb3a068',
+        u'info_dict': {
+            u'id': u'2524815897',
+            u'ext': u'mp4',
+            u'title': u'Ice Age: Continental Drift Trailer (No. 2) - IMDb',
+            u'description': u'md5:9061c2219254e5d14e03c25c98e96a81',
+            u'duration': 151,
+        }
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        webpage = self._download_webpage(url,video_id)
+        descr = get_element_by_attribute('itemprop', 'description', webpage)
+        available_formats = re.findall(
+            r'case \'(?P<f_id>.*?)\' :$\s+url = \'(?P<path>.*?)\'', webpage,
+            flags=re.MULTILINE)
+        formats = []
+        for f_id, f_path in available_formats:
+            format_page = self._download_webpage(
+                compat_urlparse.urljoin(url, f_path),
+                u'Downloading info for %s format' % f_id)
+            json_data = self._search_regex(
+                r'<script[^>]+class="imdb-player-data"[^>]*?>(.*?)</script>',
+                format_page, u'json data', flags=re.DOTALL)
+            info = json.loads(json_data)
+            format_info = info['videoPlayerObject']['video']
+            formats.append({
+                'format_id': f_id,
+                'url': format_info['url'],
+                'height': int(info['titleObject']['encoding']['selected'][:-1]),
+            })
+
+        return {
+            'id': video_id,
+            'title': self._og_search_title(webpage),
+            'formats': formats,
+            'description': descr,
+            'thumbnail': format_info['slate'],
+            'duration': int(info['titleObject']['title']['duration_seconds']),
+        }
diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py

index be8e05f539d7f64c301f7a63a488aedbf9d129cd..16a6f73c87e27d47d401c444aa02b80d5a6313b3 100644 (file)
--- a/youtube_dl/extractor/internetvideoarchive.py
+++ b/youtube_dl/extractor/internetvideoarchive.py
@@ -1,5 +1,4 @@
  import re
-import xml.etree.ElementTree
  
  from .common import InfoExtractor
  from ..utils import (
@@ -43,9 +42,8 @@ class InternetVideoArchiveIE(InfoExtractor):
          video_id = query_dic['publishedid'][0]
          url = self._build_url(query)
  
-        flashconfiguration_xml = self._download_webpage(url, video_id,
+        flashconfiguration = self._download_xml(url, video_id,
              u'Downloading flash configuration')
-        flashconfiguration = xml.etree.ElementTree.fromstring(flashconfiguration_xml.encode('utf-8'))
          file_url = flashconfiguration.find('file').text
          file_url = file_url.replace('/playlist.aspx', '/mrssplaylist.aspx')
          # Replace some of the parameters in the query to get the best quality
@@ -53,9 +51,8 @@ class InternetVideoArchiveIE(InfoExtractor):
          file_url = re.sub(r'(?<=\?)(.+)$',
              lambda m: self._clean_query(m.group()),
              file_url)
-        info_xml = self._download_webpage(file_url, video_id,
+        info = self._download_xml(file_url, video_id,
              u'Downloading video info')
-        info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
          item = info.find('channel/item')
  
          def _bp(p):
diff --git a/youtube_dl/extractor/jeuxvideo.py b/youtube_dl/extractor/jeuxvideo.py

index 6bb54b932298395b8f07554b12ad6091cca140d3..caf9d8c85f447cf1a8bee6b53b8e3b72ee2ad6b7 100644 (file)
--- a/youtube_dl/extractor/jeuxvideo.py
+++ b/youtube_dl/extractor/jeuxvideo.py
@@ -2,7 +2,6 @@
  
  import json
  import re
-import xml.etree.ElementTree
  
  from .common import InfoExtractor
  
@@ -22,7 +21,7 @@ class JeuxVideoIE(InfoExtractor):
  
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
-        title = re.match(self._VALID_URL, url).group(1)
+        title = mobj.group(1)
          webpage = self._download_webpage(url, title)
          xml_link = self._html_search_regex(
              r'<param name="flashvars" value="config=(.*?)" />',
@@ -32,12 +31,9 @@ class JeuxVideoIE(InfoExtractor):
              r'http://www\.jeuxvideo\.com/config/\w+/\d+/(.*?)/\d+_player\.xml',
              xml_link, u'video ID')
  
-        xml_config = self._download_webpage(
+        config = self._download_xml(
              xml_link, title, u'Downloading XML config')
-        config = xml.etree.ElementTree.fromstring(xml_config.encode('utf-8'))
-        info_json = self._search_regex(
-            r'(?sm)<format\.json>(.*?)</format\.json>',
-            xml_config, u'JSON information')
+        info_json = config.find('format.json').text
          info = json.loads(info_json)['versions'][0]
          
          video_url = 'http://video720.jeuxvideo.com/' + info['file']
diff --git a/youtube_dl/extractor/justintv.py b/youtube_dl/extractor/justintv.py

index f6001799231f6361608643e0b2a27667913c882e..e9bde0c186a76e0546f97cdb08bf69e2b80b3e93 100644 (file)
--- a/youtube_dl/extractor/justintv.py
+++ b/youtube_dl/extractor/justintv.py
@@ -1,7 +1,6 @@
  import json
  import os
  import re
-import xml.etree.ElementTree
  
  from .common import InfoExtractor
  from ..utils import (
@@ -94,10 +93,9 @@ class JustinTVIE(InfoExtractor):
              archive_id = m.group(1)
  
              api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
-            chapter_info_xml = self._download_webpage(api, chapter_id,
+            doc = self._download_xml(api, chapter_id,
                                               note=u'Downloading chapter information',
                                               errnote=u'Chapter information download failed')
-            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
              for a in doc.findall('.//archive'):
                  if archive_id == a.find('./id').text:
                      break
diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py

index 4531fd6ab23a958d3d4dc4e38d52e7f330d85196..9bc35b115033ce641e4435ebb807c6e1c93c975e 100644 (file)
--- a/youtube_dl/extractor/livestream.py
+++ b/youtube_dl/extractor/livestream.py
@@ -5,12 +5,12 @@ from .common import InfoExtractor
  from ..utils import (
      compat_urllib_parse_urlparse,
      compat_urlparse,
-    get_meta_content,
-    ExtractorError,
+    xpath_with_ns,
  )
  
  
  class LivestreamIE(InfoExtractor):
+    IE_NAME = u'livestream'
      _VALID_URL = r'http://new.livestream.com/.*?/(?P<event_name>.*?)(/videos/(?P<id>\d+))?/?$'
      _TEST = {
          u'url': u'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370',
@@ -54,3 +54,43 @@ class LivestreamIE(InfoExtractor):
              info = json.loads(self._download_webpage(api_url, video_id,
                                                       u'Downloading video info'))
              return self._extract_video_info(info)
+
+
+class LivestreamOriginalIE(InfoExtractor):
+    """Extractor for the original version of Livestream, which uses a different system."""
+    IE_NAME = u'livestream:original'
+    _VALID_URL = r'https?://www\.livestream\.com/(?P<user>[^/]+)/video\?.*?clipId=(?P<id>.*?)(&|$)'
+    _TEST = {
+        u'url': u'http://www.livestream.com/dealbook/video?clipId=pla_8aa4a3f1-ba15-46a4-893b-902210e138fb',
+        u'info_dict': {
+            u'id': u'pla_8aa4a3f1-ba15-46a4-893b-902210e138fb',
+            u'ext': u'flv',
+            u'title': u'Spark 1 (BitCoin) with Cameron Winklevoss & Tyler Winklevoss of Winklevoss Capital',
+        },
+        u'params': {
+            # rtmp
+            u'skip_download': True,
+        },
+    }
+
+    def _real_extract(self, url):
+        """Return the RTMP info dict for the clip identified by url."""
+        match = re.match(self._VALID_URL, url)
+        user, video_id = match.group('user'), match.group('id')
+        api_url = 'http://x{0}x.api.channel.livestream.com/2.0/clipdetails?extendedInfo=true&id={1}'.format(user, video_id)
+
+        # The clip details API answers with XML; the clip is the <item> inside <channel>
+        item = self._download_xml(api_url, video_id).find('channel').find('item')
+        media_ns = {'media': 'http://search.yahoo.com/mrss'}
+        thumbnail_url = item.find(xpath_with_ns('media:thumbnail', media_ns)).attrib['url']
+        # Remove the extension and number from the path (like 1.jpg)
+        path = self._search_regex(r'(user-files/.+)_.*?\.jpg$', thumbnail_url, u'path')
+
+        return {
+            'id': video_id,
+            'title': item.find('title').text,
+            'url': 'rtmp://extondemand.livestream.com/ondemand',
+            'play_path': 'mp4:trans/dv15/mogulus-{0}.mp4'.format(path),
+            'ext': 'flv',
+            'thumbnail': thumbnail_url,
+        }
diff --git a/youtube_dl/extractor/metacritic.py b/youtube_dl/extractor/metacritic.py

index 449138b569f80c97154ea79ac874617efc484a3f..6b95b4998852ac61d1061e0dcf6c3f442772fee2 100644 (file)
--- a/youtube_dl/extractor/metacritic.py
+++ b/youtube_dl/extractor/metacritic.py
@@ -43,13 +43,10 @@ class MetacriticIE(InfoExtractor):
          description = self._html_search_regex(r'<b>Description:</b>(.*?)</p>',
              webpage, u'description', flags=re.DOTALL)
  
-        info = {
+        return {
              'id': video_id,
              'title': clip.find('title').text,
              'formats': formats,
              'description': description,
              'duration': int(clip.find('duration').text),
          }
-        # TODO: Remove when #980 has been merged
-        info.update(formats[-1])
-        return info
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py

index a200dcd74a5a7af220cedea02a60c01cfd643e79..e2baf44d7e15032022e6b304ace2bf8ef11a09b2 100644 (file)
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@@ -60,7 +60,7 @@ class MixcloudIE(InfoExtractor):
              'title': info['name'],
              'url': final_song_url,
              'ext': 'mp3',
-            'description': info['description'],
+            'description': info.get('description'),
              'thumbnail': info['pictures'].get('extra_large'),
              'uploader': info['user']['name'],
              'uploader_id': info['user']['username'],
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py

index 24a79ae130e7ca52983ae9846532f643eda60085..6b3feb560768f96c4d5b3bb3adc0989ecf1c1d4f 100644 (file)
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -10,35 +10,8 @@ from ..utils import (
  def _media_xml_tag(tag):
      return '{http://search.yahoo.com/mrss/}%s' % tag
  
-class MTVIE(InfoExtractor):
-    _VALID_URL = r'^https?://(?:www\.)?mtv\.com/videos/.+?/(?P<videoid>[0-9]+)/[^/]+$'
-
-    _FEED_URL = 'http://www.mtv.com/player/embed/AS3/rss/'
-
-    _TESTS = [
-        {
-            u'url': u'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml',
-            u'file': u'853555.mp4',
-            u'md5': u'850f3f143316b1e71fa56a4edfd6e0f8',
-            u'info_dict': {
-                u'title': u'Taylor Swift - "Ours (VH1 Storytellers)"',
-                u'description': u'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.',
-            },
-        },
-        {
-            u'add_ie': ['Vevo'],
-            u'url': u'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml',
-            u'file': u'USCJY1331283.mp4',
-            u'md5': u'73b4e7fcadd88929292fe52c3ced8caf',
-            u'info_dict': {
-                u'title': u'Everything Has Changed',
-                u'upload_date': u'20130606',
-                u'uploader': u'Taylor Swift',
-            },
-            u'skip': u'VEVO is only available in some countries',
-        },
-    ]
  
+class MTVServicesInfoExtractor(InfoExtractor):
      @staticmethod
      def _id_from_uri(uri):
          return uri.split(':')[-1]
@@ -48,18 +21,22 @@ class MTVIE(InfoExtractor):
      def _transform_rtmp_url(rtmp_video_url):
          m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp\..+?/.*)$', rtmp_video_url)
          if not m:
-            raise ExtractorError(u'Cannot transform RTMP url')
+            return rtmp_video_url
          base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
          return base + m.group('finalid')
  
      def _get_thumbnail_url(self, uri, itemdoc):
-        return 'http://mtv.mtvnimages.com/uri/' + uri
+        search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail'))
+        thumb_node = itemdoc.find(search_path)
+        if thumb_node is None:
+            return None
+        else:
+            return thumb_node.attrib['url']
  
      def _extract_video_formats(self, metadataXml):
          if '/error_country_block.swf' in metadataXml:
              raise ExtractorError(u'This video is not available from your country.', expected=True)
          mdoc = xml.etree.ElementTree.fromstring(metadataXml.encode('utf-8'))
-        renditions = mdoc.findall('.//rendition')
  
          formats = []
          for rendition in mdoc.findall('.//rendition'):
@@ -94,7 +71,7 @@ class MTVIE(InfoExtractor):
          else:
              description = None
  
-        info = {
+        return {
              'title': itemdoc.find('title').text,
              'formats': self._extract_video_formats(mediagen_page),
              'id': video_id,
@@ -102,19 +79,46 @@ class MTVIE(InfoExtractor):
              'description': description,
          }
  
-        # TODO: Remove when #980 has been merged
-        info.update(info['formats'][-1])
-
-        return info
-
      def _get_videos_info(self, uri):
          video_id = self._id_from_uri(uri)
          data = compat_urllib_parse.urlencode({'uri': uri})
-        infoXml = self._download_webpage(self._FEED_URL +'?' + data, video_id,
+        idoc = self._download_xml(self._FEED_URL +'?' + data, video_id,
                                           u'Downloading info')
-        idoc = xml.etree.ElementTree.fromstring(infoXml.encode('utf-8'))
          return [self._get_video_info(item) for item in idoc.findall('.//item')]
  
+
+class MTVIE(MTVServicesInfoExtractor):
+    _VALID_URL = r'^https?://(?:www\.)?mtv\.com/videos/.+?/(?P<videoid>[0-9]+)/[^/]+$'
+
+    _FEED_URL = 'http://www.mtv.com/player/embed/AS3/rss/'
+
+    _TESTS = [
+        {
+            u'url': u'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml',
+            u'file': u'853555.mp4',
+            u'md5': u'850f3f143316b1e71fa56a4edfd6e0f8',
+            u'info_dict': {
+                u'title': u'Taylor Swift - "Ours (VH1 Storytellers)"',
+                u'description': u'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.',
+            },
+        },
+        {
+            u'add_ie': ['Vevo'],
+            u'url': u'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml',
+            u'file': u'USCJY1331283.mp4',
+            u'md5': u'73b4e7fcadd88929292fe52c3ced8caf',
+            u'info_dict': {
+                u'title': u'Everything Has Changed',
+                u'upload_date': u'20130606',
+                u'uploader': u'Taylor Swift',
+            },
+            u'skip': u'VEVO is only available in some countries',
+        },
+    ]
+
+    def _get_thumbnail_url(self, uri, itemdoc):
+        return 'http://mtv.mtvnimages.com/uri/' + uri
+
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
          video_id = mobj.group('videoid')
diff --git a/youtube_dl/extractor/myspass.py b/youtube_dl/extractor/myspass.py

index 107665d15f1b939155159121c1ff00da58a33243..0067bf134fb416596c5db6948060ede7881421fa 100644 (file)
--- a/youtube_dl/extractor/myspass.py
+++ b/youtube_dl/extractor/myspass.py
@@ -1,5 +1,4 @@
  import os.path
-import xml.etree.ElementTree
  
  from .common import InfoExtractor
  from ..utils import (
@@ -33,8 +32,7 @@ class MySpassIE(InfoExtractor):
  
          # get metadata
          metadata_url = META_DATA_URL_TEMPLATE % video_id
-        metadata_text = self._download_webpage(metadata_url, video_id)
-        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
+        metadata = self._download_xml(metadata_url, video_id)
  
          # extract values from metadata
          url_flv_el = metadata.find('url_flv')
diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py

index 9df236d69850bdc8e97fcb595e81b675c970ad1b..c012ec0cfacb2afea6b395c5c87509f53ed58614 100644 (file)
--- a/youtube_dl/extractor/naver.py
+++ b/youtube_dl/extractor/naver.py
@@ -1,6 +1,5 @@
  # encoding: utf-8
  import re
-import xml.etree.ElementTree
  
  from .common import InfoExtractor
  from ..utils import (
@@ -38,14 +37,12 @@ class NaverIE(InfoExtractor):
              'protocol': 'p2p',
              'inKey': key,
          })
-        info_xml = self._download_webpage(
+        info = self._download_xml(
              'http://serviceapi.rmcnmv.naver.com/flash/videoInfo.nhn?' + query,
              video_id, u'Downloading video info')
-        urls_xml = self._download_webpage(
+        urls = self._download_xml(
              'http://serviceapi.rmcnmv.naver.com/flash/playableEncodingOption.nhn?' + query_urls,
              video_id, u'Downloading video formats info')
-        info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8'))
-        urls = xml.etree.ElementTree.fromstring(urls_xml.encode('utf-8'))
  
          formats = []
          for format_el in urls.findall('EncodingOptions/EncodingOption'):
@@ -59,7 +56,7 @@ class NaverIE(InfoExtractor):
                  'height': int(format_el.find('height').text),
              })
  
-        info = {
+        return {
              'id': video_id,
              'title': info.find('Subject').text,
              'formats': formats,
@@ -68,6 +65,3 @@ class NaverIE(InfoExtractor):
              'upload_date': info.find('WriteDate').text.replace('.', ''),
              'view_count': int(info.find('PlayCount').text),
          }
-        # TODO: Remove when #980 has been merged
-        info.update(formats[-1])
-        return info
diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py

index 3bc9dae6de4db9a71d8d2261030f604f6abc14ce..e8bbfff7bd59eeaca0e04c83ee1599baff8af088 100644 (file)
--- a/youtube_dl/extractor/nbc.py
+++ b/youtube_dl/extractor/nbc.py
@@ -1,5 +1,4 @@
  import re
-import xml.etree.ElementTree
  
  from .common import InfoExtractor
  from ..utils import find_xpath_attr, compat_str
@@ -21,8 +20,8 @@ class NBCNewsIE(InfoExtractor):
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
          video_id = mobj.group('id')
-        info_xml = self._download_webpage('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id)
-        info = xml.etree.ElementTree.fromstring(info_xml.encode('utf-8')).find('video')
+        all_info = self._download_xml('http://www.nbcnews.com/id/%s/displaymode/1219' % video_id, video_id)
+        info = all_info.find('video')
  
          return {'id': video_id,
                  'title': info.find('headline').text,
diff --git a/youtube_dl/extractor/nhl.py b/youtube_dl/extractor/nhl.py

index 224f56ac84b77647c2ff5468b41d229786da632e..2edd806a3f6aa12792f3c8d8065a57fd2e2e70a1 100644 (file)
--- a/youtube_dl/extractor/nhl.py
+++ b/youtube_dl/extractor/nhl.py
@@ -1,6 +1,5 @@
  import re
  import json
-import xml.etree.ElementTree
  
  from .common import InfoExtractor
  from ..utils import (
@@ -26,9 +25,8 @@ class NHLBaseInfoExtractor(InfoExtractor):
              'path': initial_video_url.replace('.mp4', '_sd.mp4'),
          })
          path_url = 'http://video.nhl.com/videocenter/servlets/encryptvideopath?' + data
-        path_response = self._download_webpage(path_url, video_id,
+        path_doc = self._download_xml(path_url, video_id,
              u'Downloading final video url')
-        path_doc = xml.etree.ElementTree.fromstring(path_response)
          video_url = path_doc.find('path').text
  
          join = compat_urlparse.urljoin
@@ -72,7 +70,7 @@ class NHLIE(NHLBaseInfoExtractor):
  
  class NHLVideocenterIE(NHLBaseInfoExtractor):
      IE_NAME = u'nhl.com:videocenter'
-    IE_DESC = u'Download the first 12 videos from a videocenter category'
+    IE_DESC = u'NHL videocenter category'
      _VALID_URL = r'https?://video\.(?P<team>[^.]*)\.nhl\.com/videocenter/(console\?.*?catid=(?P<catid>[^&]+))?'
  
      @classmethod
diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py

new file mode 100644 (file)

index 0000000..4677431
--- /dev/null
+++ b/youtube_dl/extractor/niconico.py
@@ -0,0 +1,127 @@
+# encoding: utf-8
+
+import re
+import socket
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_http_client,
+    compat_urllib_error,
+    compat_urllib_parse,
+    compat_urllib_request,
+    compat_urlparse,
+    compat_str,
+
+    ExtractorError,
+    unified_strdate,
+)
+
+
+class NiconicoIE(InfoExtractor):
+    """Extractor for nicovideo.jp watch pages (a login is attempted during initialization)."""
+    IE_NAME = u'niconico'
+    IE_DESC = u'ニコニコ動画'
+
+    _TEST = {
+        u'url': u'http://www.nicovideo.jp/watch/sm22312215',
+        u'file': u'sm22312215.mp4',
+        u'md5': u'd1a75c0823e2f629128c43e1212760f9',
+        u'info_dict': {
+            u'title': u'Big Buck Bunny',
+            u'uploader': u'takuya0301',
+            u'uploader_id': u'2698420',
+            u'upload_date': u'20131123',
+            u'description': u'(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org',
+        },
+        u'params': {
+            u'username': u'ydl.niconico@gmail.com',
+            u'password': u'youtube-dl',
+        },
+    }
+
+    _VALID_URL = r'^https?://(?:www\.|secure\.)?nicovideo\.jp/watch/([a-z][a-z][0-9]+)(?:.*)$'
+    _NETRC_MACHINE = 'niconico'
+    # If True it will raise an error if no login info is provided
+    _LOGIN_REQUIRED = True
+
+    def _real_initialize(self):
+        """Attempt the login; the boolean result of _login() is not checked here."""
+        self._login()
+
+    def _login(self):
+        """Log in with the configured credentials; return True on success, else False."""
+        (username, password) = self._get_login_info()
+        # No credentials: an error when login is required, otherwise nothing to do
+        if username is None:
+            if self._LOGIN_REQUIRED:
+                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
+            return False
+
+        login_form_strs = {
+            u'mail': username,
+            u'password': password,
+        }
+        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
+        # chokes on unicode
+        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
+        login_data = compat_urllib_parse.urlencode(login_form).encode('utf-8')
+        request = compat_urllib_request.Request(u'https://secure.nicovideo.jp/secure/login', login_data)
+        login_results = self._download_webpage(
+            request, u'', note=u'Logging in', errnote=u'Unable to log in')
+        if re.search(r'(?i)<h1 class="mb8p4">Log in error</h1>', login_results) is not None:
+            self._downloader.report_warning(u'unable to log in: bad username or password')
+            return False
+        return True
+
+    def _real_extract(self, url):
+        """Gather the media URL and metadata for a watch page and return the info dict."""
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group(1)
+
+        # Get video webpage. We are not actually interested in it, but need
+        # the cookies in order to be able to download the info webpage
+        self._download_webpage('http://www.nicovideo.jp/watch/' + video_id, video_id)
+
+        video_info = self._download_xml(
+            'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id,
+            note=u'Downloading video info page')
+
+        # Get flv info (a query-string style response carrying the real media URL)
+        flv_info_webpage = self._download_webpage(
+            u'http://flapi.nicovideo.jp/api/getflv?v=' + video_id,
+            video_id, u'Downloading flv info')
+        video_real_url = compat_urlparse.parse_qs(flv_info_webpage)['url'][0]
+
+        video_title = video_info.find('.//title').text
+        video_extension = video_info.find('.//movie_type').text
+        video_format = video_extension.upper()
+        video_thumbnail = video_info.find('.//thumbnail_url').text
+        video_description = video_info.find('.//description').text
+        video_uploader_id = video_info.find('.//user_id').text
+        video_upload_date = unified_strdate(video_info.find('.//first_retrieve').text.split('+')[0])
+        video_view_count = int(video_info.find('.//view_counter').text)  # a number, not the raw XML text
+        video_webpage_url = video_info.find('.//watch_url').text
+
+        video_uploader = video_uploader_id  # falls back to the id if the nickname lookup fails
+        url = 'http://seiga.nicovideo.jp/api/user/info?id=' + video_uploader_id
+        try:
+            user_info = self._download_xml(
+                url, video_id, note=u'Downloading user information')
+            video_uploader = user_info.find('.//nickname').text
+        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+            self._downloader.report_warning(u'Unable to download user info webpage: %s' % compat_str(err))
+
+        return {
+            'id':          video_id,
+            'url':         video_real_url,
+            'title':       video_title,
+            'ext':         video_extension,
+            'format':      video_format,
+            'thumbnail':   video_thumbnail,
+            'description': video_description,
+            'uploader':    video_uploader,
+            'upload_date': video_upload_date,
+            'uploader_id': video_uploader_id,
+            'view_count':  video_view_count,
+            'webpage_url': video_webpage_url,
+        }
diff --git a/youtube_dl/extractor/podomatic.py b/youtube_dl/extractor/podomatic.py

new file mode 100644 (file)

index 0000000..5820097
--- /dev/null
+++ b/youtube_dl/extractor/podomatic.py
@@ -0,0 +1,49 @@
+import json
+import re
+
+from .common import InfoExtractor
+
+
+class PodomaticIE(InfoExtractor):
+    """Extractor for podomatic.com entry pages, driven by the embed_params JSON."""
+    IE_NAME = 'podomatic'
+    _VALID_URL = r'^(?P<proto>https?)://(?P<channel>[^.]+)\.podomatic\.com/entry/(?P<id>[^?]+)'
+
+    _TEST = {
+        u"url": u"http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00",
+        u"file": u"2009-01-02T16_03_35-08_00.mp3",
+        u"md5": u"84bb855fcf3429e6bf72460e1eed782d",
+        u"info_dict": {
+            u"uploader": u"Science Teaching Tips",
+            u"uploader_id": u"scienceteachingtips",
+            u"title": u"64.  When the Moon Hits Your Eye",
+            u"duration": 446,
+        }
+    }
+
+    def _real_extract(self, url):
+        """Fetch the entry's embed parameters and map them onto the info dict."""
+        match = re.match(self._VALID_URL, url)
+        video_id = match.group('id')
+        channel = match.group('channel')
+
+        json_url = '%s://%s.podomatic.com/entry/embed_params/%s?permalink=true&rtmp=0' % (
+            match.group('proto'), channel, video_id)
+        info = json.loads(self._download_webpage(
+            json_url, video_id, note=u'Downloading video info'))
+
+        video_url = info['downloadLink']
+        uploader = info['podcast']
+        title = info['title']
+        thumbnail = info['imageLocation']
+        duration = int(info['length'] / 1000.0)  # 'length' looks like milliseconds - TODO confirm
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'uploader': uploader,
+            'uploader_id': channel,
+            'thumbnail': thumbnail,
+            'duration': duration,
+        }
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py

index 75cf4bb9f6789157531377ed38ed100033e2393d..8b3471919565d4c7044d51eb24e8ef01cc8e77fc 100644 (file)
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -6,7 +6,6 @@ from ..utils import (
      compat_urllib_parse_urlparse,
      compat_urllib_request,
      compat_urllib_parse,
-    unescapeHTML,
  )
  from ..aes import (
      aes_decrypt_text
diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py

index 994778e16758bc292a01e99e5292caee30a6d5c2..c2254ae8abdca2ab9dde2388fb2182b056ffd0e2 100644 (file)
--- a/youtube_dl/extractor/redtube.py
+++ b/youtube_dl/extractor/redtube.py
@@ -8,7 +8,9 @@ class RedTubeIE(InfoExtractor):
      _TEST = {
          u'url': u'http://www.redtube.com/66418',
          u'file': u'66418.mp4',
-        u'md5': u'7b8c22b5e7098a3e1c09709df1126d2d',
+        # md5 varies from time to time, as in
+        # https://travis-ci.org/rg3/youtube-dl/jobs/14052463#L295
+        #u'md5': u'7b8c22b5e7098a3e1c09709df1126d2d',
          u'info_dict': {
              u"title": u"Sucked on a toilet",
              u"age_limit": 18,
@@ -28,7 +30,7 @@ class RedTubeIE(InfoExtractor):
              r'<source src="(.+?)" type="video/mp4">', webpage, u'video URL')
  
          video_title = self._html_search_regex(
-            r'<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
+            r'<h1 class="videoTitle[^"]*">(.+?)</h1>',
              webpage, u'title')
  
          # No self-labeling, but they describe themselves as
diff --git a/youtube_dl/extractor/rtlnow.py b/youtube_dl/extractor/rtlnow.py

index 9ac7c3be8c8f1b97f46c944f08124eafbe8f1a5a..2f238de35832d61222331cf423e2691d8de52721 100644 (file)
--- a/youtube_dl/extractor/rtlnow.py
+++ b/youtube_dl/extractor/rtlnow.py
@@ -62,18 +62,6 @@ class RTLnowIE(InfoExtractor):
              u'skip_download': True,
          },
      },
-    {
-        u'url': u'http://www.rtlnitronow.de/recht-ordnung/stadtpolizei-frankfurt-gerichtsvollzieher-leipzig.php?film_id=129679&player=1&season=1',
-        u'file': u'129679.flv',
-        u'info_dict': {
-            u'upload_date': u'20131016', 
-            u'title': u'Recht & Ordnung - Stadtpolizei Frankfurt/ Gerichtsvollzieher...',
-            u'description': u'Stadtpolizei Frankfurt/ Gerichtsvollzieher Leipzig',
-        },
-        u'params': {
-            u'skip_download': True,
-        },
-    },
      {
          u'url': u'http://www.n-tvnow.de/top-gear/episode-1-2013-01-01-00-00-00.php?film_id=124903&player=1&season=10',
          u'file': u'124903.flv',
diff --git a/youtube_dl/extractor/sina.py b/youtube_dl/extractor/sina.py

index 14b1c656c538637b6922ff68a0785c448825e553..74a87fe56c58f929665f9ff811e187ca25258d20 100644 (file)
--- a/youtube_dl/extractor/sina.py
+++ b/youtube_dl/extractor/sina.py
@@ -1,7 +1,6 @@
  # coding: utf-8
  
  import re
-import xml.etree.ElementTree
  
  from .common import InfoExtractor
  from ..utils import (
@@ -35,12 +34,11 @@ class SinaIE(InfoExtractor):
  
      def _extract_video(self, video_id):
          data = compat_urllib_parse.urlencode({'vid': video_id})
-        url_page = self._download_webpage('http://v.iask.com/v_play.php?%s' % data,
+        url_doc = self._download_xml('http://v.iask.com/v_play.php?%s' % data,
              video_id, u'Downloading video url')
          image_page = self._download_webpage(
              'http://interface.video.sina.com.cn/interface/common/getVideoImage.php?%s' % data,
              video_id, u'Downloading thumbnail info')
-        url_doc = xml.etree.ElementTree.fromstring(url_page.encode('utf-8'))
  
          return {'id': video_id,
                  'url': url_doc.find('./durl/url').text,
diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py

new file mode 100644 (file)

index 0000000..f035a32
--- /dev/null
+++ b/youtube_dl/extractor/smotri.py
@@ -0,0 +1,252 @@
+# encoding: utf-8
+
+import re
+import json
+import hashlib
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    ExtractorError
+)
+
+
+class SmotriIE(InfoExtractor):
+    IE_DESC = u'Smotri.com'
+    IE_NAME = u'smotri'
+    _VALID_URL = r'^https?://(?:www\.)?(?P<url>smotri\.com/video/view/\?id=(?P<videoid>v(?P<realvideoid>[0-9]+)[a-z0-9]{4}))'
+
+    _TESTS = [
+        # real video id 2610366
+        {
+            u'url': u'http://smotri.com/video/view/?id=v261036632ab',
+            u'file': u'v261036632ab.mp4',
+            u'md5': u'2a7b08249e6f5636557579c368040eb9',
+            u'info_dict': {
+                u'title': u'катастрофа с камер видеонаблюдения',
+                u'uploader': u'rbc2008',
+                u'uploader_id': u'rbc08',
+                u'upload_date': u'20131118',
+                u'description': u'катастрофа с камер видеонаблюдения, видео катастрофа с камер видеонаблюдения',
+                u'thumbnail': u'http://frame6.loadup.ru/8b/a9/2610366.3.3.jpg',
+            },
+        },
+        # real video id 57591
+        {
+            u'url': u'http://smotri.com/video/view/?id=v57591cb20',
+            u'file': u'v57591cb20.flv',
+            u'md5': u'830266dfc21f077eac5afd1883091bcd',
+            u'info_dict': {
+                u'title': u'test',
+                u'uploader': u'Support Photofile@photofile',
+                u'uploader_id': u'support-photofile',
+                u'upload_date': u'20070704',
+                u'description': u'test, видео test',
+                u'thumbnail': u'http://frame4.loadup.ru/03/ed/57591.2.3.jpg',
+            },
+        },
+        # video-password
+        {
+            u'url': u'http://smotri.com/video/view/?id=v1390466a13c',
+            u'file': u'v1390466a13c.mp4',
+            u'md5': u'f6331cef33cad65a0815ee482a54440b',
+            u'info_dict': {
+                u'title': u'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1',
+                u'uploader': u'timoxa40',
+                u'uploader_id': u'timoxa40',
+                u'upload_date': u'20100404',
+                u'thumbnail': u'http://frame7.loadup.ru/af/3f/1390466.3.3.jpg',
+                u'description': u'TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1, видео TOCCA_A_NOI_-_LE_COSE_NON_VANNO_CAMBIAMOLE_ORA-1',
+            },
+            u'params': {
+                u'videopassword': u'qwerty',
+            },
+        },
+        # age limit + video-password
+        {
+            u'url': u'http://smotri.com/video/view/?id=v15408898bcf',
+            u'file': u'v15408898bcf.flv',
+            u'md5': u'91e909c9f0521adf5ee86fbe073aad70',
+            u'info_dict': {
+                u'title': u'этот ролик не покажут по ТВ',
+                u'uploader': u'zzxxx',
+                u'uploader_id': u'ueggb',
+                u'upload_date': u'20101001',
+                u'thumbnail': u'http://frame3.loadup.ru/75/75/1540889.1.3.jpg',
+                u'age_limit': 18,
+                u'description': u'этот ролик не покажут по ТВ, видео этот ролик не покажут по ТВ',
+            },
+            u'params': {
+                u'videopassword': u'333'
+            }
+        }
+    ]
+    
+    _SUCCESS = 0
+    _PASSWORD_NOT_VERIFIED = 1
+    _PASSWORD_DETECTED = 2
+    _VIDEO_NOT_FOUND = 3
+
+    def _search_meta(self, name, html, display_name=None):
+        """Return the content of the <meta itemprop=name> tag in html (non-fatal lookup)."""
+        if display_name is None:
+            display_name = name
+        return self._html_search_regex(
+            r'<meta itemprop="%s" content="([^"]+)" />' % re.escape(name),
+            html, display_name, fatal=False)
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('videoid')
+        real_video_id = mobj.group('realvideoid')
+
+        # Download video JSON data
+        video_json_url = 'http://smotri.com/vt.php?id=%s' % real_video_id
+        video_json_page = self._download_webpage(video_json_url, video_id, u'Downloading video JSON')
+        video_json = json.loads(video_json_page)
+        
+        status = video_json['status']
+        if status == self._VIDEO_NOT_FOUND:
+            raise ExtractorError(u'Video %s does not exist' % video_id, expected=True)
+        elif status == self._PASSWORD_DETECTED:  # The video is protected by a password, retry with
+                                                # video-password set
+            video_password = self._downloader.params.get('videopassword', None)
+            if not video_password:
+                raise ExtractorError(u'This video is protected by a password, use the --video-password option', expected=True)
+            video_json_url += '&md5pass=%s' % hashlib.md5(video_password.encode('utf-8')).hexdigest()
+            video_json_page = self._download_webpage(video_json_url, video_id, u'Downloading video JSON (video-password set)')
+            video_json = json.loads(video_json_page)
+            status = video_json['status']
+            if status == self._PASSWORD_NOT_VERIFIED:
+                raise ExtractorError(u'Video password is invalid', expected=True)
+        
+        if status != self._SUCCESS:
+            raise ExtractorError(u'Unexpected status value %s' % status)
+        
+        # Extract the URL of the video
+        video_url = video_json['file_data']
+        
+        # Video JSON does not provide enough meta data
+        # We will extract some from the video web page instead
+        video_page_url = 'http://' + mobj.group('url')
+        video_page = self._download_webpage(video_page_url, video_id, u'Downloading video page')
+        
+        # Adult content
+        if re.search(u'EroConfirmText">', video_page) is not None:
+            self.report_age_confirmation()
+            confirm_string = self._html_search_regex(
+                r'<a href="/video/view/\?id=%s&confirm=([^"]+)" title="[^"]+">' % video_id,
+                video_page, u'confirm string')
+            confirm_url = video_page_url + '&confirm=%s' % confirm_string
+            video_page = self._download_webpage(confirm_url, video_id, u'Downloading video page (age confirmed)')
+            adult_content = True
+        else:
+            adult_content = False
+        
+        # Extract the rest of meta data
+        video_title = self._search_meta(u'name', video_page, u'title')
+        if not video_title:
+            video_title = video_url.rsplit('/', 1)[-1]
+
+        video_description = self._search_meta(u'description', video_page)
+        END_TEXT = u' на сайте Smotri.com'
+        if video_description.endswith(END_TEXT):
+            video_description = video_description[:-len(END_TEXT)]
+        START_TEXT = u'Смотреть онлайн ролик '
+        if video_description.startswith(START_TEXT):
+            video_description = video_description[len(START_TEXT):]
+        video_thumbnail = self._search_meta(u'thumbnail', video_page)
+
+        upload_date_str = self._search_meta(u'uploadDate', video_page, u'upload date')
+        upload_date_m = re.search(r'(?P<year>\d{4})\.(?P<month>\d{2})\.(?P<day>\d{2})T', upload_date_str)
+        video_upload_date = (
+            (
+                upload_date_m.group('year') +
+                upload_date_m.group('month') +
+                upload_date_m.group('day')
+            )
+            if upload_date_m else None
+        )
+        
+        duration_str = self._search_meta(u'duration', video_page)
+        duration_m = re.search(r'T(?P<hours>[0-9]{2})H(?P<minutes>[0-9]{2})M(?P<seconds>[0-9]{2})S', duration_str)
+        video_duration = (
+            (
+                (int(duration_m.group('hours')) * 60 * 60) +
+                (int(duration_m.group('minutes')) * 60) +
+                int(duration_m.group('seconds'))
+            )
+            if duration_m else None
+        )
+        
+        video_uploader = self._html_search_regex(
+            u'<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info[^"]+">(.*?)</a>',
+            video_page, u'uploader', fatal=False, flags=re.MULTILINE|re.DOTALL)
+        
+        video_uploader_id = self._html_search_regex(
+            u'<div class="DescrUser"><div>Автор.*?onmouseover="popup_user_info\\(.*?\'([^\']+)\'\\);">',
+            video_page, u'uploader id', fatal=False, flags=re.MULTILINE|re.DOTALL)
+        
+        video_view_count = self._html_search_regex(
+            u'Общее количество просмотров.*?<span class="Number">(\\d+)</span>',
+            video_page, u'view count', fatal=False, flags=re.MULTILINE|re.DOTALL)
+                
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': video_title,
+            'thumbnail': video_thumbnail,
+            'description': video_description,
+            'uploader': video_uploader,
+            'upload_date': video_upload_date,
+            'uploader_id': video_uploader_id,
+            'video_duration': video_duration,
+            'view_count': video_view_count,
+            'age_limit': 18 if adult_content else 0,
+            'video_page_url': video_page_url
+        }
+
+
+class SmotriCommunityIE(InfoExtractor):
+    IE_DESC = u'Smotri.com community videos'
+    IE_NAME = u'smotri:community'
+    _VALID_URL = r'^https?://(?:www\.)?smotri\.com/community/video/(?P<communityid>[0-9A-Za-z_\'-]+)'
+    
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        community_id = mobj.group('communityid')
+
+        url = 'http://smotri.com/export/rss/video/by/community/-/%s/video.xml' % community_id
+        rss = self._download_xml(url, community_id, u'Downloading community RSS')
+
+        entries = [self.url_result(video_url.text, 'Smotri')
+                   for video_url in rss.findall('./channel/item/link')]
+
+        description_text = rss.find('./channel/description').text
+        community_title = self._html_search_regex(
+            u'^Видео сообщества "([^"]+)"$', description_text, u'community title')
+
+        return self.playlist_result(entries, community_id, community_title)
+
+
+class SmotriUserIE(InfoExtractor):
+    IE_DESC = u'Smotri.com user videos'
+    IE_NAME = u'smotri:user'
+    _VALID_URL = r'^https?://(?:www\.)?smotri\.com/user/(?P<userid>[0-9A-Za-z_\'-]+)'
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        user_id = mobj.group('userid')
+
+        url = 'http://smotri.com/export/rss/user/video/-/%s/video.xml' % user_id
+        rss = self._download_xml(url, user_id, u'Downloading user RSS')
+
+        entries = [self.url_result(video_url.text, 'Smotri')
+                   for video_url in rss.findall('./channel/item/link')]
+
+        description_text = rss.find('./channel/description').text
+        user_nickname = self._html_search_regex(
+            u'^Видео режиссера (.*)$', description_text,
+            u'user nickname')
+
+        return self.playlist_result(entries, user_id, user_nickname)
diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py

index 4717fbb77e0ec21a26147b25d9cc3be2f83d9f94..3a19ab17222831d87ffde4992e5712b01359e6eb 100644 (file)
--- a/youtube_dl/extractor/soundcloud.py
+++ b/youtube_dl/extractor/soundcloud.py
@@ -59,6 +59,7 @@ class SoundcloudIE(InfoExtractor):
      ]
  
      _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28'
+    _IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf'
  
      @classmethod
      def suitable(cls, url):
@@ -75,36 +76,79 @@ class SoundcloudIE(InfoExtractor):
      def _extract_info_dict(self, info, full_title=None, quiet=False):
          track_id = compat_str(info['id'])
          name = full_title or track_id
-        if quiet == False:
+        if quiet:
              self.report_extraction(name)
  
          thumbnail = info['artwork_url']
          if thumbnail is not None:
              thumbnail = thumbnail.replace('-large', '-t500x500')
+        ext = info.get('original_format', u'mp3')
          result = {
-            'id':       track_id,
-            'url':      info['stream_url'] + '?client_id=' + self._CLIENT_ID,
+            'id': track_id,
              'uploader': info['user']['username'],
              'upload_date': unified_strdate(info['created_at']),
-            'title':    info['title'],
-            'ext':      u'mp3',
+            'title': info['title'],
              'description': info['description'],
              'thumbnail': thumbnail,
          }
          if info.get('downloadable', False):
-            result['url'] = 'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format(track_id, self._CLIENT_ID)
-        if not info.get('streamable', False):
-            # We have to get the rtmp url
+            # We can build a direct link to the song
+            format_url = (
+                u'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format(
+                    track_id, self._CLIENT_ID))
+            result['formats'] = [{
+                'format_id': 'download',
+                'ext': ext,
+                'url': format_url,
+                'vcodec': 'none',
+            }]
+        else:
+            # We have to retrieve the url
              stream_json = self._download_webpage(
-                'http://api.soundcloud.com/i1/tracks/{0}/streams?client_id={1}'.format(track_id, self._CLIENT_ID),
+                'http://api.soundcloud.com/i1/tracks/{0}/streams?client_id={1}'.format(track_id, self._IPHONE_CLIENT_ID),
                  track_id, u'Downloading track url')
-            rtmp_url = json.loads(stream_json)['rtmp_mp3_128_url']
-            # The url doesn't have an rtmp app, we have to extract the playpath
-            url, path = rtmp_url.split('mp3:', 1)
-            result.update({
-                'url': url,
-                'play_path': 'mp3:' + path,
-            })
+
+            formats = []
+            format_dict = json.loads(stream_json)
+            for key, stream_url in format_dict.items():
+                if key.startswith(u'http'):
+                    formats.append({
+                        'format_id': key,
+                        'ext': ext,
+                        'url': stream_url,
+                        'vcodec': 'none',
+                    })
+                elif key.startswith(u'rtmp'):
+                    # The url doesn't have an rtmp app, we have to extract the playpath
+                    url, path = stream_url.split('mp3:', 1)
+                    formats.append({
+                        'format_id': key,
+                        'url': url,
+                        'play_path': 'mp3:' + path,
+                        'ext': ext,
+                        'vcodec': 'none',
+                    })
+
+            if not formats:
+                # We fallback to the stream_url in the original info, this
+                # cannot be always used, sometimes it can give an HTTP 404 error
+                formats.append({
+                    'format_id': u'fallback',
+                    'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID,
+                    'ext': ext,
+                    'vcodec': 'none',
+                })
+
+            def format_pref(f):
+                if f['format_id'].startswith('http'):
+                    return 2
+                if f['format_id'].startswith('rtmp'):
+                    return 1
+                return 0
+
+            formats.sort(key=format_pref)
+            result['formats'] = formats
+
          return result
  
      def _real_extract(self, url):
@@ -158,7 +202,6 @@ class SoundcloudSetIE(SoundcloudIE):
          resolv_url = self._resolv_url(url)
          info_json = self._download_webpage(resolv_url, full_title)
  
-        videos = []
          info = json.loads(info_json)
          if 'errors' in info:
              for err in info['errors']:
diff --git a/youtube_dl/extractor/southparkstudios.py b/youtube_dl/extractor/southparkstudios.py

index b1e96b679b63a0c728eb039dee76cb209e5e9976..fd90cc5dd18f966242d658df1e133456271c8ee3 100644 (file)
--- a/youtube_dl/extractor/southparkstudios.py
+++ b/youtube_dl/extractor/southparkstudios.py
@@ -1,38 +1,42 @@
  import re
  
-from .mtv import MTVIE, _media_xml_tag
+from .mtv import MTVServicesInfoExtractor
  
  
-class SouthParkStudiosIE(MTVIE):
+class SouthParkStudiosIE(MTVServicesInfoExtractor):
      IE_NAME = u'southparkstudios.com'
-    _VALID_URL = r'https?://www\.southparkstudios\.com/(clips|full-episodes)/(?P<id>.+?)(\?|#|$)'
+    _VALID_URL = r'(https?://)?(www\.)?(?P<url>southparkstudios\.com/(clips|full-episodes)/(?P<id>.+?)(\?|#|$))'
  
      _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss'
  
-    _TEST = {
+    _TESTS = [{
          u'url': u'http://www.southparkstudios.com/clips/104437/bat-daded#tab=featured',
          u'file': u'a7bff6c2-ed00-11e0-aca6-0026b9414f30.mp4',
          u'info_dict': {
              u'title': u'Bat Daded',
              u'description': u'Randy disqualifies South Park by getting into a fight with Bat Dad.',
          },
-    }
-
-    # Overwrite MTVIE properties we don't want
-    _TESTS = []
-
-    def _get_thumbnail_url(self, uri, itemdoc):
-        search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail'))
-        thumb_node = itemdoc.find(search_path)
-        if thumb_node is None:
-            return None
-        else:
-            return thumb_node.attrib['url']
+    }]
  
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
+        url = u'http://www.' + mobj.group(u'url')
          video_id = mobj.group('id')
          webpage = self._download_webpage(url, video_id)
          mgid = self._search_regex(r'swfobject.embedSWF\(".*?(mgid:.*?)"',
                                    webpage, u'mgid')
          return self._get_videos_info(mgid)
+
+class SouthparkDeIE(SouthParkStudiosIE):
+    IE_NAME = u'southpark.de'
+    _VALID_URL = r'(https?://)?(www\.)?(?P<url>southpark\.de/(clips|alle-episoden)/(?P<id>.+?)(\?|#|$))'
+    _FEED_URL = 'http://www.southpark.de/feeds/video-player/mrss/'
+
+    _TESTS = [{
+        u'url': u'http://www.southpark.de/clips/uygssh/the-government-wont-respect-my-privacy#tab=featured',
+        u'file': u'85487c96-b3b9-4e39-9127-ad88583d9bf2.mp4',
+        u'info_dict': {
+            u'title': u'The Government Won\'t Respect My Privacy',
+            u'description': u'Cartman explains the benefits of "Shitter" to Stan, Kyle and Craig.',
+        },
+    }]
diff --git a/youtube_dl/extractor/spankwire.py b/youtube_dl/extractor/spankwire.py

index 97f9c268a4531114912f9209c8a959d567add062..9e2ad0d9962c375ca27851b3f842de302be28e56 100644 (file)
--- a/youtube_dl/extractor/spankwire.py
+++ b/youtube_dl/extractor/spankwire.py
@@ -6,7 +6,6 @@ from ..utils import (
      compat_urllib_parse_urlparse,
      compat_urllib_request,
      compat_urllib_parse,
-    unescapeHTML,
  )
  from ..aes import (
      aes_decrypt_text
@@ -36,11 +35,12 @@ class SpankwireIE(InfoExtractor):
          webpage = self._download_webpage(req, video_id)
  
          video_title = self._html_search_regex(r'<h1>([^<]+)', webpage, u'title')
-        video_uploader = self._html_search_regex(r'by:\s*<a [^>]*>(.+?)</a>', webpage, u'uploader', fatal=False)
-        thumbnail = self._html_search_regex(r'flashvars\.image_url = "([^"]+)', webpage, u'thumbnail', fatal=False)
-        description = self._html_search_regex(r'>\s*Description:</div>\s*<[^>]*>([^<]+)', webpage, u'description', fatal=False)
-        if len(description) == 0:
-            description = None
+        video_uploader = self._html_search_regex(
+            r'by:\s*<a [^>]*>(.+?)</a>', webpage, u'uploader', fatal=False)
+        thumbnail = self._html_search_regex(
+            r'flashvars\.image_url = "([^"]+)', webpage, u'thumbnail', fatal=False)
+        description = self._html_search_regex(
+            r'<div\s+id="descriptionContent">([^<]+)<', webpage, u'description', fatal=False)
  
          video_urls = list(map(compat_urllib_parse.unquote , re.findall(r'flashvars\.quality_[0-9]{3}p = "([^"]+)', webpage)))
          if webpage.find('flashvars\.encrypted = "true"') != -1:
diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py

index 13c86401c0095bd5584e03ac35614d06edb76be1..6955205242dcbbba01cfac482d362b3ca292b6b6 100644 (file)
--- a/youtube_dl/extractor/spiegel.py
+++ b/youtube_dl/extractor/spiegel.py
@@ -1,19 +1,26 @@
  import re
-import xml.etree.ElementTree
  
  from .common import InfoExtractor
  
  
  class SpiegelIE(InfoExtractor):
      _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
-    _TEST = {
+    _TESTS = [{
          u'url': u'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html',
          u'file': u'1259285.mp4',
          u'md5': u'2c2754212136f35fb4b19767d242f66e',
          u'info_dict': {
              u"title": u"Vulkanausbruch in Ecuador: Der \"Feuerschlund\" ist wieder aktiv"
          }
-    }
+    },
+    {
+        u'url': u'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html',
+        u'file': u'1309159.mp4',
+        u'md5': u'f2cdf638d7aa47654e251e1aee360af1',
+        u'info_dict': {
+            u'title': u'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers'
+        }
+    }]
  
      def _real_extract(self, url):
          m = re.match(self._VALID_URL, url)
@@ -21,25 +28,36 @@ class SpiegelIE(InfoExtractor):
  
          webpage = self._download_webpage(url, video_id)
  
-        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
-            webpage, u'title')
+        video_title = self._html_search_regex(
+            r'<div class="module-title">(.*?)</div>', webpage, u'title')
  
          xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
-        xml_code = self._download_webpage(xml_url, video_id,
-                    note=u'Downloading XML', errnote=u'Failed to download XML')
-
-        idoc = xml.etree.ElementTree.fromstring(xml_code)
-        last_type = idoc[-1]
-        filename = last_type.findall('./filename')[0].text
-        duration = float(last_type.findall('./duration')[0].text)
+        idoc = self._download_xml(
+            xml_url, video_id,
+            note=u'Downloading XML', errnote=u'Failed to download XML')
+
+        formats = [
+            {
+                'format_id': n.tag.rpartition('type')[2],
+                'url': u'http://video2.spiegel.de/flash/' + n.find('./filename').text,
+                'width': int(n.find('./width').text),
+                'height': int(n.find('./height').text),
+                'abr': int(n.find('./audiobitrate').text),
+                'vbr': int(n.find('./videobitrate').text),
+                'vcodec': n.find('./codec').text,
+                'acodec': 'MP4A',
+            }
+            for n in list(idoc)
+            # Blacklist type 6, it's extremely LQ and not available on the same server
+            if n.tag.startswith('type') and n.tag != 'type6'
+        ]
+        formats.sort(key=lambda f: f['vbr'])
+        duration = float(idoc[0].findall('./duration')[0].text)
  
-        video_url = 'http://video2.spiegel.de/flash/' + filename
-        video_ext = filename.rpartition('.')[2]
          info = {
              'id': video_id,
-            'url': video_url,
-            'ext': video_ext,
              'title': video_title,
              'duration': duration,
+            'formats': formats,
          }
-        return [info]
+        return info
diff --git a/youtube_dl/extractor/streamcloud.py b/youtube_dl/extractor/streamcloud.py

new file mode 100644 (file)

index 0000000..9faf3a5
--- /dev/null
+++ b/youtube_dl/extractor/streamcloud.py
@@ -0,0 +1,66 @@
+# coding: utf-8
+import re
+import time
+
+from .common import InfoExtractor
+from ..utils import (
+    compat_urllib_parse,
+    compat_urllib_request,
+)
+
+
+class StreamcloudIE(InfoExtractor):
+    IE_NAME = u'streamcloud.eu'
+    _VALID_URL = r'https?://streamcloud\.eu/(?P<id>[a-zA-Z0-9_-]+)/(?P<fname>[^#?]*)\.html'
+
+    _TEST = {
+        u'url': u'http://streamcloud.eu/skp9j99s4bpz/youtube-dl_test_video_____________-BaW_jenozKc.mp4.html',
+        u'file': u'skp9j99s4bpz.mp4',
+        u'md5': u'6bea4c7fa5daaacc2a946b7146286686',
+        u'info_dict': {
+            u'title': u'youtube-dl test video  \'/\\ ä ↭',
+            u'duration': 9,
+        },
+        u'skip': u'Only available from the EU'
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+
+        orig_webpage = self._download_webpage(url, video_id)
+
+        fields = re.findall(r'''(?x)<input\s+
+            type="(?:hidden|submit)"\s+
+            name="([^"]+)"\s+
+            (?:id="[^"]+"\s+)?
+            value="([^"]*)"
+            ''', orig_webpage)
+        post = compat_urllib_parse.urlencode(fields)
+
+        self.to_screen('%s: Waiting for timeout' % video_id)
+        time.sleep(12)
+        headers = {
+            b'Content-Type': b'application/x-www-form-urlencoded',
+        }
+        req = compat_urllib_request.Request(url, post, headers)
+
+        webpage = self._download_webpage(
+            req, video_id, note=u'Downloading video page ...')
+        title = self._html_search_regex(
+            r'<h1[^>]*>([^<]+)<', webpage, u'title')
+        video_url = self._search_regex(
+            r'file:\s*"([^"]+)"', webpage, u'video URL')
+        duration_str = self._search_regex(
+            r'duration:\s*"?([0-9]+)"?', webpage, u'duration', fatal=False)
+        duration = None if duration_str is None else int(duration_str)
+        thumbnail = self._search_regex(
+            r'image:\s*"([^"]+)"', webpage, u'thumbnail URL', fatal=False)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': video_url,
+            'duration': duration,
+            'thumbnail': thumbnail,
+        }
diff --git a/youtube_dl/extractor/sztvhu.py b/youtube_dl/extractor/sztvhu.py

index 81fa35c4bd297f6b6a4b5fa44ed1b98998393ab6..c9359fafb5c5989923c6320e3e684673b80057d6 100644 (file)
--- a/youtube_dl/extractor/sztvhu.py
+++ b/youtube_dl/extractor/sztvhu.py
@@ -15,7 +15,8 @@ class SztvHuIE(InfoExtractor):
          u'info_dict': {
              u"title": u"Cserkészek népszerűsítették a környezettudatos életmódot a Savaria téren",
              u"description": u'A zöld nap játékos ismeretterjesztő programjait a Magyar Cserkész Szövetség szervezte, akik az ország nyolc városában adják át tudásukat az érdeklődőknek. A PET...',
-        }
+        },
+        u'skip': u'Service temporarily disabled as of 2013-11-20'
      }
  
      def _real_extract(self, url):
diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py

index bc48620f0b992366e866181b1dad22aeb2e5d0a6..2bf26d05682e8e2535d412c2718d78bec6077622 100644 (file)
--- a/youtube_dl/extractor/teamcoco.py
+++ b/youtube_dl/extractor/teamcoco.py
@@ -1,5 +1,4 @@
  import re
-import xml.etree.ElementTree
  
  from .common import InfoExtractor
  from ..utils import (
@@ -32,8 +31,7 @@ class TeamcocoIE(InfoExtractor):
          self.report_extraction(video_id)
  
          data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
-        data_xml = self._download_webpage(data_url, video_id, 'Downloading data webpage')
-        data = xml.etree.ElementTree.fromstring(data_xml.encode('utf-8'))
+        data = self._download_xml(data_url, video_id, 'Downloading data webpage')
  
  
          qualities = ['500k', '480p', '1000k', '720p', '1080p']
@@ -60,7 +58,7 @@ class TeamcocoIE(InfoExtractor):
                  return -1
          formats.sort(key=sort_key)
          if not formats:
-            raise RegexNotFoundError(u'Unable to extract video URL')
+            raise ExtractorError(u'Unable to extract video URL')
  
          return {
              'id':          video_id,
diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py

index 76cfdfb90c886a94e95cb60b144e93b3f0acbb1a..4bca62ba003e325ebedd0fcc74c953bd64120cd5 100644 (file)
--- a/youtube_dl/extractor/ted.py
+++ b/youtube_dl/extractor/ted.py
@@ -4,7 +4,6 @@ import re
  from .subtitles import SubtitlesInfoExtractor
  
  from ..utils import (
-    compat_str,
      RegexNotFoundError,
  )
  
@@ -43,26 +42,25 @@ class TEDIE(SubtitlesInfoExtractor):
              self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
              return [self._playlist_videos_info(url,name,playlist_id)]
  
-    def _playlist_videos_info(self,url,name,playlist_id=0):
+
+    def _playlist_videos_info(self, url, name, playlist_id):
          '''Returns the videos of the playlist'''
-        video_RE=r'''
-                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
-                     ([.\s]*?)data-playlist_item_id="(\d+)"
-                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
-                     '''
-        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
-        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
-        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
-        m_names=re.finditer(video_name_RE,webpage)
+
+        webpage = self._download_webpage(
+            url, playlist_id, u'Downloading playlist webpage')
+        matches = re.finditer(
+            r'<p\s+class="talk-title[^"]*"><a\s+href="(?P<talk_url>/talks/[^"]+\.html)">[^<]*</a></p>',
+            webpage)
  
          playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                   webpage, 'playlist title')
  
-        playlist_entries = []
-        for m_video, m_name in zip(m_videos,m_names):
-            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
-            playlist_entries.append(self.url_result(talk_url, 'TED'))
-        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
+        playlist_entries = [
+            self.url_result(u'http://www.ted.com' + m.group('talk_url'), 'TED')
+            for m in matches
+        ]
+        return self.playlist_result(
+            playlist_entries, playlist_id=playlist_id, playlist_title=playlist_title)
  
      def _talk_info(self, url, video_id=0):
          """Return the video for the talk in the url"""
@@ -85,7 +83,7 @@ class TEDIE(SubtitlesInfoExtractor):
              'ext': 'mp4',
              'url': stream['file'],
              'format': stream['id']
-            } for stream in info['htmlStreams']]
+        } for stream in info['htmlStreams']]
  
          video_id = info['id']
  
@@ -95,7 +93,7 @@ class TEDIE(SubtitlesInfoExtractor):
              self._list_available_subtitles(video_id, webpage)
              return
  
-        info = {
+        return {
              'id': video_id,
              'title': title,
              'thumbnail': thumbnail,
@@ -104,11 +102,6 @@ class TEDIE(SubtitlesInfoExtractor):
              'formats': formats,
          }
  
-        # TODO: Remove when #980 has been merged
-        info.update(info['formats'][-1])
-
-        return info
-
      def _get_available_subtitles(self, video_id, webpage):
          try:
              options = self._search_regex(r'(?:<select name="subtitles_language_select" id="subtitles_language_select">)(.*?)(?:</select>)', webpage, 'subtitles_language_select', flags=re.DOTALL)
@@ -119,6 +112,6 @@ class TEDIE(SubtitlesInfoExtractor):
                      url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
                      sub_lang_list[l] = url
                  return sub_lang_list
-        except RegexNotFoundError as err:
+        except RegexNotFoundError:
              self._downloader.report_warning(u'video doesn\'t have subtitles')
          return {}
diff --git a/youtube_dl/extractor/toutv.py b/youtube_dl/extractor/toutv.py

new file mode 100644 (file)

index 0000000..1e9598e
--- /dev/null
+++ b/youtube_dl/extractor/toutv.py
@@ -0,0 +1,71 @@
+# coding: utf-8
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    unified_strdate,
+)
+
+
+class TouTvIE(InfoExtractor):
+    IE_NAME = u'tou.tv'
+    _VALID_URL = r'https?://www\.tou\.tv/(?P<id>[a-zA-Z0-9_-]+(?:/(?P<episode>S[0-9]+E[0-9]+)))'
+
+    _TEST = {
+        u'url': u'http://www.tou.tv/30-vies/S04E41',
+        u'file': u'30-vies_S04E41.mp4',
+        u'info_dict': {
+            u'title': u'30 vies Saison 4 / Épisode 41',
+            u'description': u'md5:da363002db82ccbe4dafeb9cab039b09',
+            u'age_limit': 8,
+            u'uploader': u'Groupe des Nouveaux Médias',
+            u'duration': 1296,
+            u'upload_date': u'20131118',
+            u'thumbnail': u'http://static.tou.tv/medias/images/2013-11-18_19_00_00_30VIES_0341_01_L.jpeg',
+        },
+        u'params': {
+            u'skip_download': True,  # Requires rtmpdump
+        },
+        u'skip': 'Only available in Canada'
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        webpage = self._download_webpage(url, video_id)
+
+        mediaId = self._search_regex(
+            r'"idMedia":\s*"([^"]+)"', webpage, u'media ID')
+
+        streams_url = u'http://release.theplatform.com/content.select?pid=' + mediaId
+        streams_doc = self._download_xml(
+            streams_url, video_id, note=u'Downloading stream list')
+
+        video_url = next(n.text
+                         for n in streams_doc.findall('.//choice/url')
+                         if u'//ad.doubleclick' not in n.text)
+        if video_url.endswith('/Unavailable.flv'):
+            raise ExtractorError(
+                u'Access to this video is blocked from outside of Canada',
+                expected=True)
+
+        duration_str = self._html_search_meta(
+            'video:duration', webpage, u'duration')
+        duration = int(duration_str) if duration_str else None
+        upload_date_str = self._html_search_meta(
+            'video:release_date', webpage, u'upload date')
+        upload_date = unified_strdate(upload_date_str) if upload_date_str else None
+
+        return {
+            'id': video_id,
+            'title': self._og_search_title(webpage),
+            'url': video_url,
+            'description': self._og_search_description(webpage),
+            'uploader': self._dc_search_uploader(webpage),
+            'thumbnail': self._og_search_thumbnail(webpage),
+            'age_limit': self._media_rating_search(webpage),
+            'duration': duration,
+            'upload_date': upload_date,
+            'ext': 'mp4',
+        }
diff --git a/youtube_dl/extractor/trilulilu.py b/youtube_dl/extractor/trilulilu.py

index 0bf028f6195ba56be22e059bdc83d23cbabff59b..d64aaa41f690956b08211ed4fe07e1bc27267641 100644 (file)
--- a/youtube_dl/extractor/trilulilu.py
+++ b/youtube_dl/extractor/trilulilu.py
@@ -1,6 +1,5 @@
  import json
  import re
-import xml.etree.ElementTree
  
  from .common import InfoExtractor
  
@@ -36,12 +35,10 @@ class TriluliluIE(InfoExtractor):
  
          format_url = (u'http://fs%(server)s.trilulilu.ro/%(hash)s/'
                        u'video-formats2' % log)
-        format_str = self._download_webpage(
+        format_doc = self._download_xml(
              format_url, video_id,
              note=u'Downloading formats',
              errnote=u'Error while downloading formats')
-
-        format_doc = xml.etree.ElementTree.fromstring(format_str)
   
          video_url_template = (
              u'http://fs%(server)s.trilulilu.ro/stream.php?type=video'
@@ -58,7 +55,7 @@ class TriluliluIE(InfoExtractor):
              for fnode in format_doc.findall('./formats/format')
          ]
  
-        info = {
+        return {
              '_type': 'video',
              'id': video_id,
              'formats': formats,
@@ -67,7 +64,3 @@ class TriluliluIE(InfoExtractor):
              'thumbnail': thumbnail,
          }
  
-        # TODO: Remove when #980 has been merged
-        info.update(formats[-1])
-
-        return info
diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py

index d4b7603c7c96a5da148ff50869559b0ff0c11a0f..4d9d41db3af2382bead67efa4afe6edbf9a6846e 100644 (file)
--- a/youtube_dl/extractor/tube8.py
+++ b/youtube_dl/extractor/tube8.py
@@ -5,8 +5,6 @@ from .common import InfoExtractor
  from ..utils import (
      compat_urllib_parse_urlparse,
      compat_urllib_request,
-    compat_urllib_parse,
-    unescapeHTML,
  )
  from ..aes import (
      aes_decrypt_text
diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py

new file mode 100644 (file)

index 0000000..bfed9dd
--- /dev/null
+++ b/youtube_dl/extractor/tvp.py
@@ -0,0 +1,42 @@
+import json
+import re
+
+from .common import InfoExtractor
+
+
+class TvpIE(InfoExtractor):
+    IE_NAME = u'tvp.pl'
+    _VALID_URL = r'https?://www\.tvp\.pl/.*?wideo/(?P<date>\d+)/(?P<id>\d+)'
+
+    _TEST = {
+        u'url': u'http://www.tvp.pl/warszawa/magazyny/campusnews/wideo/31102013/12878238',
+        u'md5': u'148408967a6a468953c0a75cbdaf0d7a',
+        u'file': u'12878238.wmv',
+        u'info_dict': {
+            u'title': u'31.10.2013 - Odcinek 2',
+            u'description': u'31.10.2013 - Odcinek 2',
+        },
+        u'skip': u'Download has to use same server IP as extraction. Therefore, a good (load-balancing) DNS resolver will make the download fail.'
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        webpage = self._download_webpage(url, video_id)
+        json_url = 'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id
+        json_params = self._download_webpage(
+            json_url, video_id, u"Downloading video metadata")
+
+        params = json.loads(json_params)
+        self.report_extraction(video_id)
+        video_url = params['video_url']
+
+        title = self._og_search_title(webpage, fatal=True)
+        return {
+            'id': video_id,
+            'title': title,
+            'ext': 'wmv',
+            'url': video_url,
+            'description': self._og_search_description(webpage),
+            'thumbnail': self._og_search_thumbnail(webpage),
+        }
diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py

index 3f6020f74ec9eeefbddafc184d3f48cf5e436adb..4378b17800f1df78275d68a9525ca95585dc8b9d 100644 (file)
--- a/youtube_dl/extractor/vevo.py
+++ b/youtube_dl/extractor/vevo.py
@@ -78,12 +78,13 @@ class VevoIE(InfoExtractor):
                  continue
  
              format_url = self._SMIL_BASE_URL + m.group('path')
-            format_note = ('%(vcodec)s@%(vbr)4sk, %(acodec)s@%(abr)3sk' %
-                           m.groupdict())
              formats.append({
                  'url': format_url,
                  'format_id': u'SMIL_' + m.group('cbr'),
-                'format_note': format_note,
+                'vcodec': m.group('vcodec'),
+                'acodec': m.group('acodec'),
+                'vbr': int(m.group('vbr')),
+                'abr': int(m.group('abr')),
                  'ext': m.group('ext'),
                  'width': int(m.group('width')),
                  'height': int(m.group('height')),
diff --git a/youtube_dl/extractor/viddler.py b/youtube_dl/extractor/viddler.py

index 826804af37af54e308f90349e909d3e0e3aa5126..75335dfb8797e83c7413f0c8bee86603ed429847 100644 (file)
--- a/youtube_dl/extractor/viddler.py
+++ b/youtube_dl/extractor/viddler.py
@@ -47,7 +47,7 @@ class ViddlerIE(InfoExtractor):
              r"thumbnail\s*:\s*'([^']*)'",
              webpage, u'thumbnail', fatal=False)
  
-        info = {
+        return {
              '_type': 'video',
              'id': video_id,
              'title': title,
@@ -56,9 +56,3 @@ class ViddlerIE(InfoExtractor):
              'duration': duration,
              'formats': formats,
          }
-
-        # TODO: Remove when #980 has been merged
-        info['formats'][-1]['ext'] = determine_ext(info['formats'][-1]['url'])
-        info.update(info['formats'][-1])
-
-        return info
diff --git a/youtube_dl/extractor/videofyme.py b/youtube_dl/extractor/videofyme.py

index 94f64ffa5aaba21ec4ca4470d6034352acbf411e..912802d9aa22082f2f39148db7920a6287c74ec6 100644 (file)
--- a/youtube_dl/extractor/videofyme.py
+++ b/youtube_dl/extractor/videofyme.py
@@ -1,5 +1,4 @@
  import re
-import xml.etree.ElementTree
  
  from .common import InfoExtractor
  from ..utils import (
@@ -27,9 +26,8 @@ class VideofyMeIE(InfoExtractor):
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
          video_id = mobj.group('id')
-        config_xml = self._download_webpage('http://sunshine.videofy.me/?videoId=%s' % video_id,
+        config = self._download_xml('http://sunshine.videofy.me/?videoId=%s' % video_id,
                                              video_id)
-        config = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8'))
          video = config.find('video')
          sources = video.find('sources')
          url_node = next(node for node in [find_xpath_attr(sources, 'source', 'id', 'HQ %s' % key) 
diff --git a/youtube_dl/extractor/videopremium.py b/youtube_dl/extractor/videopremium.py

index 65f39b98259bc0050b512073b2f26e5bd0e49605..acae81448e38e3b362fcfdd93b4a6dcd9cc5f7d0 100644 (file)
--- a/youtube_dl/extractor/videopremium.py
+++ b/youtube_dl/extractor/videopremium.py
@@ -5,7 +5,7 @@ from .common import InfoExtractor
  
  
  class VideoPremiumIE(InfoExtractor):
-    _VALID_URL = r'(?:https?://)?(?:www\.)?videopremium\.tv/(?P<id>\w+)(?:/.*)?'
+    _VALID_URL = r'(?:https?://)?(?:www\.)?videopremium\.(?:tv|me)/(?P<id>\w+)(?:/.*)?'
      _TEST = {
          u'url': u'http://videopremium.tv/4w7oadjsf156',
          u'file': u'4w7oadjsf156.f4v',
@@ -24,12 +24,16 @@ class VideoPremiumIE(InfoExtractor):
          webpage_url = 'http://videopremium.tv/' + video_id
          webpage = self._download_webpage(webpage_url, video_id)
  
-        self.report_extraction(video_id)
+        if re.match(r"^<html><head><script[^>]*>window.location\s*=", webpage):
+            # Download again, we need a cookie
+            webpage = self._download_webpage(
+                webpage_url, video_id,
+                note=u'Downloading webpage again (with cookie)')
  
-        video_title = self._html_search_regex(r'<h2(?:.*?)>\s*(.+?)\s*<',
-            webpage, u'video title')
+        video_title = self._html_search_regex(
+            r'<h2(?:.*?)>\s*(.+?)\s*<', webpage, u'video title')
  
-        return [{
+        return {
              'id':          video_id,
              'url':         "rtmp://e%d.md.iplay.md/play" % random.randint(1, 16),
              'play_path':   "mp4:%s.f4v" % video_id,
@@ -37,4 +41,4 @@ class VideoPremiumIE(InfoExtractor):
              'player_url':  "http://videopremium.tv/uplayer/uppod.swf",
              'ext':         'f4v',
              'title':       video_title,
-        }]
+        }
diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py

new file mode 100644 (file)

index 0000000..2206a06
--- /dev/null
+++ b/youtube_dl/extractor/viki.py
@@ -0,0 +1,101 @@
+import re
+
+from ..utils import (
+    ExtractorError,
+    unescapeHTML,
+    unified_strdate,
+)
+from .subtitles import SubtitlesInfoExtractor
+
+
+class VikiIE(SubtitlesInfoExtractor):
+    IE_NAME = u'viki'
+
+    _VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P<id>[0-9]+v)'
+    _TEST = {
+        u'url': u'http://www.viki.com/videos/1023585v-heirs-episode-14',
+        u'file': u'1023585v.mp4',
+        u'md5': u'a21454021c2646f5433514177e2caa5f',
+        u'info_dict': {
+            u'title': u'Heirs Episode 14',
+            u'uploader': u'SBS',
+            u'description': u'md5:c4b17b9626dd4b143dcc4d855ba3474e',
+            u'upload_date': u'20131121',
+            u'age_limit': 13,
+        },
+        u'skip': u'Blocked in the US',
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group(1)
+
+        webpage = self._download_webpage(url, video_id)
+        title = self._og_search_title(webpage)
+        description = self._og_search_description(webpage)
+        thumbnail = self._og_search_thumbnail(webpage)
+
+        uploader_m = re.search(
+            r'<strong>Broadcast Network: </strong>\s*([^<]*)<', webpage)
+        if uploader_m is None:
+            uploader = None
+        else:
+            uploader = uploader_m.group(1).strip()
+
+        rating_str = self._html_search_regex(
+            r'<strong>Rating: </strong>\s*([^<]*)<', webpage,
+            u'rating information', default='').strip()
+        RATINGS = {
+            'G': 0,
+            'PG': 10,
+            'PG-13': 13,
+            'R': 16,
+            'NC': 18,
+        }
+        age_limit = RATINGS.get(rating_str)
+
+        info_url = 'http://www.viki.com/player5_fragment/%s?action=show&controller=videos' % video_id
+        info_webpage = self._download_webpage(
+            info_url, video_id, note=u'Downloading info page')
+        if re.match(r'\s*<div\s+class="video-error', info_webpage):
+            raise ExtractorError(
+                u'Video %s is blocked from your location.' % video_id,
+                expected=True)
+        video_url = self._html_search_regex(
+            r'<source[^>]+src="([^"]+)"', info_webpage, u'video URL')
+
+        upload_date_str = self._html_search_regex(
+            r'"created_at":"([^"]+)"', info_webpage, u'upload date')
+        upload_date = (
+            unified_strdate(upload_date_str)
+            if upload_date_str is not None
+            else None
+        )
+
+        # subtitles
+        video_subtitles = self.extract_subtitles(video_id, info_webpage)
+        if self._downloader.params.get('listsubtitles', False):
+            self._list_available_subtitles(video_id, info_webpage)
+            return
+
+        return {
+            'id': video_id,
+            'title': title,
+            'url': video_url,
+            'description': description,
+            'thumbnail': thumbnail,
+            'age_limit': age_limit,
+            'uploader': uploader,
+            'subtitles': video_subtitles,
+            'upload_date': upload_date,
+        }
+
+    def _get_available_subtitles(self, video_id, info_webpage):
+        res = {}
+        for sturl_html in re.findall(r'<track src="([^"]+)"/>', info_webpage):
+            sturl = unescapeHTML(sturl_html)
+            m = re.search(r'/(?P<lang>[a-z]+)\.vtt', sturl)
+            if not m:
+                continue
+            res[m.group('lang')] = sturl
+        return res
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py

index d465bf20b6d65b7b5cd3a0545af676c2c710a07d..f27763ae2ff110051613243ef2543e3349d6b019 100644 (file)
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -151,7 +151,7 @@ class VimeoIE(InfoExtractor):
                  config = json.loads(config_json)
              except RegexNotFoundError:
                  # For pro videos or player.vimeo.com urls
-                config = self._search_regex([r' = {config:({.+?}),assets:', r'c=({.+?);'],
+                config = self._search_regex([r' = {config:({.+?}),assets:', r'(?:c|b)=({.+?});'],
                      webpage, u'info section', flags=re.DOTALL)
                  config = json.loads(config)
          except Exception as e:
@@ -249,25 +249,46 @@ class VimeoChannelIE(InfoExtractor):
      IE_NAME = u'vimeo:channel'
      _VALID_URL = r'(?:https?://)?vimeo.\com/channels/(?P<id>[^/]+)'
      _MORE_PAGES_INDICATOR = r'<a.+?rel="next"'
+    _TITLE_RE = r'<link rel="alternate"[^>]+?title="(.*?)"'
  
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        channel_id =  mobj.group('id')
+    def _extract_videos(self, list_id, base_url):
          video_ids = []
-
          for pagenum in itertools.count(1):
-            webpage = self._download_webpage('http://vimeo.com/channels/%s/videos/page:%d' % (channel_id, pagenum),
-                                             channel_id, u'Downloading page %s' % pagenum)
+            webpage = self._download_webpage(
+                '%s/videos/page:%d/' % (base_url, pagenum),list_id,
+                u'Downloading page %s' % pagenum)
              video_ids.extend(re.findall(r'id="clip_(\d+?)"', webpage))
              if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:
                  break
  
          entries = [self.url_result('http://vimeo.com/%s' % video_id, 'Vimeo')
                     for video_id in video_ids]
-        channel_title = self._html_search_regex(r'<a href="/channels/%s">(.*?)</a>' % channel_id,
-                                                webpage, u'channel title')
+        list_title = self._html_search_regex(self._TITLE_RE, webpage,
+            u'list title')
          return {'_type': 'playlist',
-                'id': channel_id,
-                'title': channel_title,
+                'id': list_id,
+                'title': list_title,
                  'entries': entries,
                  }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        channel_id =  mobj.group('id')
+        return self._extract_videos(channel_id, 'http://vimeo.com/channels/%s' % channel_id)
+
+
+class VimeoUserIE(VimeoChannelIE):
+    IE_NAME = u'vimeo:user'
+    _VALID_URL = r'(?:https?://)?vimeo.\com/(?P<name>[^/]+)'
+    _TITLE_RE = r'<a[^>]+?class="user">([^<>]+?)</a>'
+
+    @classmethod
+    def suitable(cls, url):
+        if VimeoChannelIE.suitable(url) or VimeoIE.suitable(url):
+            return False
+        return super(VimeoUserIE, cls).suitable(url)
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        name = mobj.group('name')
+        return self._extract_videos(name, 'http://vimeo.com/%s' % name)
diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py

index c4ec1f06ffe3ccce17598aeb319047f0890f9a02..651ba317dcd8fffefb2ac938c9f09d5de8356865 100644 (file)
--- a/youtube_dl/extractor/vine.py
+++ b/youtube_dl/extractor/vine.py
@@ -27,7 +27,7 @@ class VineIE(InfoExtractor):
          video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
              webpage, u'video URL')
  
-        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
+        uploader = self._html_search_regex(r'<p class="username">(.*?)</p>',
              webpage, u'uploader', fatal=False, flags=re.DOTALL)
  
          return [{
diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py

index 7444d3393a25f8a49778a5bd589aa839591bd9d8..279f75e7a1f5b860e81d955c33bb58fcea092cbc 100644 (file)
--- a/youtube_dl/extractor/xhamster.py
+++ b/youtube_dl/extractor/xhamster.py
@@ -26,7 +26,7 @@ class XHamsterIE(InfoExtractor):
      {
          u'url': u'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd',
          u'file': u'2221348.flv',
-        u'md5': u'e767b9475de189320f691f49c679c4c7',
+        u'md5': u'970a94178ca4118c5aa3aaea21211b81',
          u'info_dict': {
              u"upload_date": u"20130914",
              u"uploader_id": u"jojo747400",
diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py

index 03ad88bededd60a7d8462bb85e5a8cb23db381b2..e3458d2bd4abaa196190f886afce2e9ac05df191 100644 (file)
--- a/youtube_dl/extractor/xtube.py
+++ b/youtube_dl/extractor/xtube.py
@@ -5,7 +5,6 @@ from .common import InfoExtractor
  from ..utils import (
      compat_urllib_parse_urlparse,
      compat_urllib_request,
-    compat_urllib_parse,
  )
  
  class XTubeIE(InfoExtractor):
diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py

index 34e6afb20fb6833ab21501785deb54cf5f0a0e24..e457c4707a8feda7c3d0709c18671282b6da3814 100644 (file)
--- a/youtube_dl/extractor/yahoo.py
+++ b/youtube_dl/extractor/yahoo.py
@@ -17,27 +17,21 @@ class YahooIE(InfoExtractor):
      _TESTS = [
          {
              u'url': u'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
-            u'file': u'214727115.flv',
+            u'file': u'214727115.mp4',
+            u'md5': u'4962b075c08be8690a922ee026d05e69',
              u'info_dict': {
                  u'title': u'Julian Smith & Travis Legg Watch Julian Smith',
                  u'description': u'Julian and Travis watch Julian Smith',
              },
-            u'params': {
-                # Requires rtmpdump
-                u'skip_download': True,
-            },
          },
          {
              u'url': u'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html',
-            u'file': u'103000935.flv',
+            u'file': u'103000935.mp4',
+            u'md5': u'd6e6fc6e1313c608f316ddad7b82b306',
              u'info_dict': {
                  u'title': u'Codefellas - The Cougar Lies with Spanish Moss',
                  u'description': u'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?',
              },
-            u'params': {
-                # Requires rtmpdump
-                u'skip_download': True,
-            },
          },
      ]
  
@@ -46,15 +40,19 @@ class YahooIE(InfoExtractor):
          video_id = mobj.group('id')
          webpage = self._download_webpage(url, video_id)
  
-        items_json = self._search_regex(r'YVIDEO_INIT_ITEMS = ({.*?});$',
+        items_json = self._search_regex(r'mediaItems: ({.*?})$',
              webpage, u'items', flags=re.MULTILINE)
          items = json.loads(items_json)
          info = items['mediaItems']['query']['results']['mediaObj'][0]
          # The 'meta' field is not always in the video webpage, we request it
          # from another page
          long_id = info['id']
+        return self._get_info(info['id'], video_id)
+
+    def _get_info(self, long_id, video_id):
          query = ('SELECT * FROM yahoo.media.video.streams WHERE id="%s"'
-                 ' AND plrs="86Gj0vCaSzV_Iuf6hNylf2"' % long_id)
+                 ' AND plrs="86Gj0vCaSzV_Iuf6hNylf2" AND region="US"'
+                 ' AND protocol="http"' % long_id)
          data = compat_urllib_parse.urlencode({
              'q': query,
              'env': 'prod',
@@ -91,17 +89,39 @@ class YahooIE(InfoExtractor):
              formats.append(format_info)
          formats = sorted(formats, key=lambda f:(f['height'], f['width']))
  
-        info = {
+        return {
              'id': video_id,
              'title': meta['title'],
              'formats': formats,
              'description': clean_html(meta['description']),
              'thumbnail': meta['thumbnail'],
          }
-        # TODO: Remove when #980 has been merged
-        info.update(formats[-1])
  
-        return info
+
+class YahooNewsIE(YahooIE):
+    IE_NAME = 'yahoo:news'
+    _VALID_URL = r'http://news\.yahoo\.com/video/.*?-(?P<id>\d*?)\.html'
+
+    _TEST = {
+        u'url': u'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html',
+        u'md5': u'67010fdf3a08d290e060a4dd96baa07b',
+        u'info_dict': {
+            u'id': u'104538833',
+            u'ext': u'mp4',
+            u'title': u'China Moses Is Crazy About the Blues',
+            u'description': u'md5:9900ab8cd5808175c7b3fe55b979bed0',
+        },
+    }
+
+    # Overwrite YahooIE properties we don't want
+    _TESTS = []
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        webpage = self._download_webpage(url, video_id)
+        long_id = self._search_regex(r'contentId: \'(.+?)\',', webpage, u'long id')
+        return self._get_info(long_id, video_id)
  
  
  class YahooSearchIE(SearchInfoExtractor):
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py

index c992cba978441081a50e5f134a4a694c83a6685c..7fff761bd0b5a7835c5b4a11c3a1d15ac67567d8 100644 (file)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -11,7 +11,6 @@ import socket
  import string
  import struct
  import traceback
-import xml.etree.ElementTree
  import zlib
  
  from .common import InfoExtractor, SearchInfoExtractor
@@ -29,6 +28,7 @@ from ..utils import (
      clean_html,
      get_cachedir,
      get_element_by_id,
+    get_element_by_attribute,
      ExtractorError,
      unescapeHTML,
      unified_strdate,
@@ -139,10 +139,10 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
  
  class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
      IE_DESC = u'YouTube.com'
-    _VALID_URL = r"""^
+    _VALID_URL = r"""(?x)^
                       (
-                         (?:https?://)?                                       # http(s):// (optional)
-                         (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
+                         (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional)
+                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                              tube\.majestyc\.net/|
                              youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                           (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
@@ -248,21 +248,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
          '248': 'webm',
      }
      _video_dimensions = {
-        '5': '240x400',
+        '5': '400x240',
          '6': '???',
          '13': '???',
-        '17': '144x176',
-        '18': '360x640',
-        '22': '720x1280',
-        '34': '360x640',
-        '35': '480x854',
-        '36': '240x320',
-        '37': '1080x1920',
-        '38': '3072x4096',
-        '43': '360x640',
-        '44': '480x854',
-        '45': '720x1280',
-        '46': '1080x1920',
+        '17': '176x144',
+        '18': '640x360',
+        '22': '1280x720',
+        '34': '640x360',
+        '35': '854x480',
+        '36': '320x240',
+        '37': '1920x1080',
+        '38': '4096x3072',
+        '43': '640x360',
+        '44': '854x480',
+        '45': '1280x720',
+        '46': '1920x1080',
          '82': '360p',
          '83': '480p',
          '84': '720p',
@@ -336,7 +336,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                  u"uploader": u"Philipp Hagemeister",
                  u"uploader_id": u"phihag",
                  u"upload_date": u"20121002",
-                u"description": u"test chars:  \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
+                u"description": u"test chars:  \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
              }
          },
          {
@@ -363,6 +363,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                  u"uploader_id": u"justintimberlakeVEVO"
              }
          },
+        {
+            u"url":  u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
+            u"file":  u"yZIXLfi8CZQ.mp4",
+            u"note": u"Embed-only video (#1746)",
+            u"info_dict": {
+                u"upload_date": u"20120608",
+                u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
+                u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
+                u"uploader": u"SET India",
+                u"uploader_id": u"setindia"
+            }
+        },
      ]
  
  
@@ -370,7 +382,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
      def suitable(cls, url):
          """Receives a URL and returns True if suitable for this IE."""
          if YoutubePlaylistIE.suitable(url): return False
-        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
+        return re.match(cls._VALID_URL, url) is not None
  
      def __init__(self, *args, **kwargs):
          super(YoutubeIE, self).__init__(*args, **kwargs)
@@ -1019,6 +1031,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
          """Turn the encrypted s field into a working signature"""
  
          if player_url is not None:
+            if player_url.startswith(u'//'):
+                player_url = u'https:' + player_url
              try:
                  player_id = (player_url, len(s))
                  if player_id not in self._player_cache:
@@ -1098,7 +1112,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
              params = compat_urllib_parse.urlencode({
                  'lang': lang,
                  'v': video_id,
-                'fmt': self._downloader.params.get('subtitlesformat'),
+                'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
                  'name': l[0].encode('utf-8'),
              })
              url = u'http://www.youtube.com/api/timedtext?' + params
@@ -1111,7 +1125,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
      def _get_available_automatic_caption(self, video_id, webpage):
          """We need the webpage for getting the captions url, pass it as an
             argument to speed up the process."""
-        sub_format = self._downloader.params.get('subtitlesformat')
+        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
          self.to_screen(u'%s: Looking for automatic captions' % video_id)
          mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
          err_msg = u'Couldn\'t find automatic captions for %s' % video_id
@@ -1130,8 +1144,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                  'asrs': 1,
              })
              list_url = caption_url + '&' + list_params
-            list_page = self._download_webpage(list_url, video_id)
-            caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
+            caption_list = self._download_xml(list_url, video_id)
              original_lang_node = caption_list.find('track')
              if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
                  self._downloader.report_warning(u'Video doesn\'t have automatic captions')
@@ -1270,7 +1283,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
              # We simulate the access to the video from www.youtube.com/v/{video_id}
              # this can be viewed without login into Youtube
              data = compat_urllib_parse.urlencode({'video_id': video_id,
-                                                  'el': 'embedded',
+                                                  'el': 'player_embedded',
                                                    'gl': 'US',
                                                    'hl': 'en',
                                                    'eurl': 'https://youtube.googleapis.com/v/' + video_id,
@@ -1299,6 +1312,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
              else:
                  raise ExtractorError(u'"token" parameter not in video info for unknown reason')
  
+        if 'view_count' in video_info:
+            view_count = int(video_info['view_count'][0])
+        else:
+            view_count = None
+
          # Check for "rental" videos
          if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
              raise ExtractorError(u'"rental" videos not supported')
@@ -1348,6 +1366,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
          # description
          video_description = get_element_by_id("eow-description", video_webpage)
          if video_description:
+            video_description = re.sub(r'''(?x)
+                <a\s+
+                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
+                    title="([^"]+)"\s+
+                    (?:[a-zA-Z-]+="[^"]+"\s+)*?
+                    class="yt-uix-redirect-link"\s*>
+                [^<]+
+                </a>
+            ''', r'\1', video_description)
              video_description = clean_html(video_description)
          else:
              fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
@@ -1487,10 +1514,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                  'age_limit':    18 if age_gate else 0,
                  'annotations':  video_annotations,
                  'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
+                'view_count': view_count,
              })
          return results
  
-class YoutubePlaylistIE(InfoExtractor):
+class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
      IE_DESC = u'YouTube.com playlists'
      _VALID_URL = r"""(?:
                          (?:https?://)?
@@ -1506,8 +1534,9 @@ class YoutubePlaylistIE(InfoExtractor):
                       |
                          ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
                       )"""
-    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
-    _MAX_RESULTS = 50
+    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
+    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
+    _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
      IE_NAME = u'youtube:playlist'
  
      @classmethod
@@ -1515,6 +1544,27 @@ class YoutubePlaylistIE(InfoExtractor):
          """Receives a URL and returns True if suitable for this IE."""
          return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
  
+    def _real_initialize(self):
+        self._login()
+
+    def _ids_to_results(self, ids):
+        return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
+                       for vid_id in ids]
+
+    def _extract_mix(self, playlist_id):
+        # The mixes are generated from a single video
+        # the id of the playlist is just 'RD' + video_id
+        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[2:], playlist_id)
+        webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
+        title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
+            get_element_by_attribute('class', 'title ', webpage))
+        title = clean_html(title_span)
+        video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s' % re.escape(playlist_id)
+        ids = orderedSet(re.findall(video_re, webpage))
+        url_results = self._ids_to_results(ids)
+
+        return self.playlist_result(url_results, playlist_id, title)
+
      def _real_extract(self, url):
          # Extract playlist id
          mobj = re.match(self._VALID_URL, url, re.VERBOSE)
@@ -1528,45 +1578,33 @@ class YoutubePlaylistIE(InfoExtractor):
              video_id = query_dict['v'][0]
              if self._downloader.params.get('noplaylist'):
                  self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
-                return self.url_result('https://www.youtube.com/watch?v=' + video_id, 'Youtube')
+                return self.url_result(video_id, 'Youtube', video_id=video_id)
              else:
                  self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))
  
-        # Download playlist videos from API
-        videos = []
+        if len(playlist_id) == 13:  # 'RD' + 11 characters for the video id
+            # Mixes require a custom extraction process
+            return self._extract_mix(playlist_id)
+
+        # Extract the video ids from the playlist pages
+        ids = []
  
          for page_num in itertools.count(1):
-            start_index = self._MAX_RESULTS * (page_num - 1) + 1
-            if start_index >= 1000:
-                self._downloader.report_warning(u'Max number of results reached')
-                break
-            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
+            url = self._TEMPLATE_URL % (playlist_id, page_num)
              page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
+            matches = re.finditer(self._VIDEO_RE, page)
+            # We remove the duplicates and the link with index 0
+            # (it's not the first video of the playlist)
+            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
+            ids.extend(new_ids)
  
-            try:
-                response = json.loads(page)
-            except ValueError as err:
-                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
-
-            if 'feed' not in response:
-                raise ExtractorError(u'Got a malformed response from YouTube API')
-            playlist_title = response['feed']['title']['$t']
-            if 'entry' not in response['feed']:
-                # Number of videos is a multiple of self._MAX_RESULTS
+            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                  break
  
-            for entry in response['feed']['entry']:
-                index = entry['yt$position']['$t']
-                if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
-                    videos.append((
-                        index,
-                        'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
-                    ))
+        playlist_title = self._og_search_title(page)
  
-        videos = [v[1] for v in sorted(videos)]
-
-        url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
-        return [self.playlist_result(url_results, playlist_id, playlist_title)]
+        url_results = self._ids_to_results(ids)
+        return self.playlist_result(url_results, playlist_id, playlist_title)
  
  
  class YoutubeChannelIE(InfoExtractor):
@@ -1592,26 +1630,37 @@ class YoutubeChannelIE(InfoExtractor):
          # Download channel page
          channel_id = mobj.group(1)
          video_ids = []
+        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
+        channel_page = self._download_webpage(url, channel_id)
+        if re.search(r'channel-header-autogenerated-label', channel_page) is not None:
+            autogenerated = True
+        else:
+            autogenerated = False
  
-        # Download all channel pages using the json-based channel_ajax query
-        for pagenum in itertools.count(1):
-            url = self._MORE_PAGES_URL % (pagenum, channel_id)
-            page = self._download_webpage(url, channel_id,
-                                          u'Downloading page #%s' % pagenum)
-
-            page = json.loads(page)
-
-            ids_in_page = self.extract_videos_from_page(page['content_html'])
-            video_ids.extend(ids_in_page)
-
-            if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
-                break
+        if autogenerated:
+            # The videos are contained in a single page
+            # the ajax pages can't be used, they are empty
+            video_ids = self.extract_videos_from_page(channel_page)
+        else:
+            # Download all channel pages using the json-based channel_ajax query
+            for pagenum in itertools.count(1):
+                url = self._MORE_PAGES_URL % (pagenum, channel_id)
+                page = self._download_webpage(url, channel_id,
+                                              u'Downloading page #%s' % pagenum)
+    
+                page = json.loads(page)
+    
+                ids_in_page = self.extract_videos_from_page(page['content_html'])
+                video_ids.extend(ids_in_page)
+    
+                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
+                    break
  
          self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
  
-        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
-        url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
-        return [self.playlist_result(url_entries, channel_id)]
+        url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
+                       for video_id in video_ids]
+        return self.playlist_result(url_entries, channel_id)
  
  
  class YoutubeUserIE(InfoExtractor):
@@ -1675,9 +1724,11 @@ class YoutubeUserIE(InfoExtractor):
              if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                  break
  
-        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
-        url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
-        return [self.playlist_result(url_results, playlist_title = username)]
+        url_results = [
+            self.url_result(video_id, 'Youtube', video_id=video_id)
+            for video_id in video_ids]
+        return self.playlist_result(url_results, playlist_title=username)
+
  
  class YoutubeSearchIE(SearchInfoExtractor):
      IE_DESC = u'YouTube.com searches'
@@ -1718,10 +1769,12 @@ class YoutubeSearchIE(SearchInfoExtractor):
  
          if len(video_ids) > n:
              video_ids = video_ids[:n]
-        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
+        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
+                  for video_id in video_ids]
          return self.playlist_result(videos, query)
  
  class YoutubeSearchDateIE(YoutubeSearchIE):
+    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
      _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
      _SEARCH_KEY = 'ytsearchdate'
      IE_DESC = u'YouTube.com searches, newest videos first'
@@ -1748,7 +1801,6 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
      Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
      """
      _LOGIN_REQUIRED = True
-    _PAGING_STEP = 30
      # use action_load_personal_feed instead of action_load_system_feed
      _PERSONAL_FEED = False
  
@@ -1768,9 +1820,8 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
  
      def _real_extract(self, url):
          feed_entries = []
-        # The step argument is available only in 2.7 or higher
-        for i in itertools.count(0):
-            paging = i*self._PAGING_STEP
+        paging = 0
+        for i in itertools.count(1):
              info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                            u'%s feed' % self._FEED_NAME,
                                            u'Downloading page %s' % i)
@@ -1778,9 +1829,12 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
              feed_html = info['feed_html']
              m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
              ids = orderedSet(m.group(1) for m in m_ids)
-            feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
+            feed_entries.extend(
+                self.url_result(video_id, 'Youtube', video_id=video_id)
+                for video_id in ids)
              if info['paging'] is None:
                  break
+            paging = info['paging']
          return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
  
  class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
@@ -1800,9 +1854,18 @@ class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
      _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
      _FEED_NAME = 'watch_later'
      _PLAYLIST_TITLE = u'Youtube Watch Later'
-    _PAGING_STEP = 100
      _PERSONAL_FEED = True
  
+class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
+    # Feed extractor for the watch history; matches the feed URL or the
+    # ":ythistory" keyword.
+    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
+    # Raw string so that "\." reaches the regex engine as written
+    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
+    _FEED_NAME = 'history'
+    _PERSONAL_FEED = True
+    _PLAYLIST_TITLE = u'Youtube Watch History'
+
  class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
      IE_NAME = u'youtube:favorites'
      IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py

index faed7ff7f0511c666795a20f48eefd4dc96c7009..689f19735456e2a05defe8f2eb49c5b2f4848580 100644 (file)
--- a/youtube_dl/extractor/zdf.py
+++ b/youtube_dl/extractor/zdf.py
@@ -1,75 +1,138 @@
+# coding: utf-8
+
+import operator
  import re
  
  from .common import InfoExtractor
  from ..utils import (
-    determine_ext,
-    ExtractorError,
+    unified_strdate,
  )
  
  
  class ZDFIE(InfoExtractor):
+    """Information extractor for videos in the ZDF Mediathek (www.zdf.de)."""
+
-    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek(?P<hash>#)?\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
-    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
+    _VALID_URL = r'^https?://www\.zdf\.de/ZDFmediathek(?P<hash>#)?/(.*beitrag/(?:video/)?)(?P<video_id>[0-9]+)(?:/[^/?]+)?(?:\?.*)?'
+
+    _TEST = {
+        u"url": u"http://www.zdf.de/ZDFmediathek/beitrag/video/2037704/ZDFspezial---Ende-des-Machtpokers--?bc=sts;stt",
+        u"file": u"2037704.webm",
+        u"info_dict": {
+            u"upload_date": u"20131127",
+            u"description": u"Union und SPD haben sich auf einen Koalitionsvertrag geeinigt. Aber was bedeutet das für die Bürger? Sehen Sie hierzu das ZDFspezial \"Ende des Machtpokers - Große Koalition für Deutschland\".",
+            u"uploader": u"spezial",
+            u"title": u"ZDFspezial - Ende des Machtpokers"
+        },
+        u"skip": u"Videos on ZDF.de are depublicised in short order",
+    }
  
      def _real_extract(self, url):
+        """Return the info dict for the ZDF video addressed by url.
+
+        Metadata and formats are read from the beitragsDetails XML service.
+        'formats' is sorted ascending by '_pref', most preferred format last.
+        """
          mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
          video_id = mobj.group('video_id')
  
-        if mobj.group('hash'):
-            url = url.replace(u'#', u'', 1)
+        xml_url = u'http://www.zdf.de/ZDFmediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id
+        doc = self._download_xml(
+            xml_url, video_id,
+            note=u'Downloading video info',
+            errnote=u'Failed to download video info')
+
+        title = doc.find('.//information/title').text
+        description = doc.find('.//information/detail').text
+        uploader_node = doc.find('.//details/originChannelTitle')
+        uploader = None if uploader_node is None else uploader_node.text
+        duration_str = doc.find('.//details/length').text
+        # HH:MM:SS, optionally followed by a fractional part; only the whole
+        # seconds are used for the duration
+        duration_m = re.match(r'''(?x)^
+            (?P<hours>[0-9]{2})
+            :(?P<minutes>[0-9]{2})
+            :(?P<seconds>[0-9]{2})
+            (?:\.(?P<ms>[0-9]+))?
+            ''', duration_str)
+        duration = (
+            (
+                (int(duration_m.group('hours')) * 60 * 60) +
+                (int(duration_m.group('minutes')) * 60) +
+                int(duration_m.group('seconds'))
+            )
+            if duration_m
+            else None
+        )
+        upload_date = unified_strdate(doc.find('.//details/airtime').text)
+
+        def xml_to_format(fnode):
+            # Map one <formitaet> node to a format dict; '_pref' is the sort
+            # key (higher is better) and '_available' the filter flag
+            video_url = fnode.find('url').text
+            is_available = u'http://www.metafilegenerator' not in video_url
  
-        html = self._download_webpage(url, video_id)
-        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
-        if streams is None:
-            raise ExtractorError(u'No media url found.')
+            format_id = fnode.attrib['basetype']
+            format_m = re.match(r'''(?x)
+                (?P<vcodec>[^_]+)_(?P<acodec>[^_]+)_(?P<container>[^_]+)_
+                (?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+)
+            ''', format_id)
  
-        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
-        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
-        # choose first/default media type and highest quality for now
-        def stream_pref(s):
-            TYPE_ORDER = ['ostreaming', 'hstreaming', 'wstreaming']
+            ext = format_m.group('container')
+            is_supported = ext != 'f4f'
+
+            PROTO_ORDER = ['http', 'rtmp', 'rtsp']
              try:
-                type_pref = TYPE_ORDER.index(s['media_type'])
+                proto_pref = -PROTO_ORDER.index(format_m.group('proto'))
              except ValueError:
-                type_pref = 999
+                # Unknown protocols rank below every known one
+                proto_pref = -999
  
-            QUALITY_ORDER = ['veryhigh', '300']
+            quality = fnode.find('./quality').text
+            QUALITY_ORDER = ['veryhigh', '300', 'high', 'med', 'low']
              try:
-                quality_pref = QUALITY_ORDER.index(s['quality'])
+                quality_pref = -QUALITY_ORDER.index(quality)
              except ValueError:
-                quality_pref = 999
+                # Unknown qualities rank below every known one
+                quality_pref = -999
  
-            return (type_pref, quality_pref)
-
-        sorted_streams = sorted(streams, key=stream_pref)
-        if not sorted_streams:
-            raise ExtractorError(u'No stream found.')
-        stream = sorted_streams[0]
-
-        media_link = self._download_webpage(
-            stream['video_url'],
-            video_id,
-            u'Get stream URL')
+            abr = int(fnode.find('./audioBitrate').text) // 1000
+            vbr = int(fnode.find('./videoBitrate').text) // 1000
+            pref = (is_available, is_supported,
+                    proto_pref, quality_pref, vbr, abr)
  
-        MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
-        RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'
+            format_note = u''
+            if not is_supported:
+                format_note += u'(unsupported)'
+            if not format_note:
+                format_note = None
  
-        mobj = re.search(self._MEDIA_STREAM, media_link)
-        if mobj is None:
-            mobj = re.search(RTSP_STREAM, media_link)
-            if mobj is None:
-                raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
-        video_url = mobj.group('video_url')
+            return {
+                'format_id': format_id + u'-' + quality,
+                'url': video_url,
+                'ext': ext,
+                'acodec': format_m.group('acodec'),
+                'vcodec': format_m.group('vcodec'),
+                'abr': abr,
+                'vbr': vbr,
+                'width': int(fnode.find('./width').text),
+                'height': int(fnode.find('./height').text),
+                'filesize': int(fnode.find('./filesize').text),
+                'format_note': format_note,
+                '_pref': pref,
+                '_available': is_available,
+            }
  
-        title = self._html_search_regex(
-            r'<h1(?: class="beitragHeadline")?>(.*?)</h1>',
-            html, u'title')
+        format_nodes = doc.findall('.//formitaeten/formitaet')
+        formats = sorted(filter(lambda f: f['_available'],
+                                map(xml_to_format, format_nodes)),
+                         key=operator.itemgetter('_pref'))
  
          return {
              'id': video_id,
-            'url': video_url,
              'title': title,
-            'ext': determine_ext(video_url)
+            'formats': formats,
+            'description': description,
+            'uploader': uploader,
+            'duration': duration,
+            'upload_date': upload_date,
          }
diff --git a/youtube_dl/update.py b/youtube_dl/update.py

index 0689a4891200bf2a03024b96ec2ecda5d857efb1..cd9670166e582ae9f3074c2371026f0f06c252a1 100644 (file)
--- a/youtube_dl/update.py
+++ b/youtube_dl/update.py
@@ -2,11 +2,15 @@ import io
  import json
  import traceback
  import hashlib
+import os
  import subprocess
  import sys
  from zipimport import zipimporter
  
-from .utils import *
+from .utils import (
+    compat_str,
+    compat_urllib_request,
+)
  from .version import __version__
  
  def rsa_verify(message, signature, key):
@@ -37,6 +41,7 @@ def rsa_verify(message, signature, key):
      if signature != sha256(message).digest(): return False
      return True
  
+
  def update_self(to_screen, verbose):
      """Update the program file with the latest version from the repository"""
  
@@ -78,6 +83,13 @@ def update_self(to_screen, verbose):
          return
  
      version_id = versions_info['latest']
+
+    def version_tuple(version_str):
+        return tuple(map(int, version_str.split('.')))
+    if version_tuple(__version__) >= version_tuple(version_id):
+        to_screen(u'youtube-dl is up to date (%s)' % __version__)
+        return
+
      to_screen(u'Updating to version ' + version_id + '...')
      version = versions_info['versions'][version_id]
  
@@ -105,7 +117,7 @@ def update_self(to_screen, verbose):
              urlh = compat_urllib_request.urlopen(version['exe'][0])
              newcontent = urlh.read()
              urlh.close()
-        except (IOError, OSError) as err:
+        except (IOError, OSError):
              if verbose: to_screen(compat_str(traceback.format_exc()))
              to_screen(u'ERROR: unable to download latest version')
              return
@@ -118,7 +130,7 @@ def update_self(to_screen, verbose):
          try:
              with open(exe + '.new', 'wb') as outf:
                  outf.write(newcontent)
-        except (IOError, OSError) as err:
+        except (IOError, OSError):
              if verbose: to_screen(compat_str(traceback.format_exc()))
              to_screen(u'ERROR: unable to write the new version')
              return
@@ -137,7 +149,7 @@ start /b "" cmd /c del "%%~f0"&exit /b"
  
              subprocess.Popen([bat])  # Continues to run in the background
              return  # Do not show premature success messages
-        except (IOError, OSError) as err:
+        except (IOError, OSError):
              if verbose: to_screen(compat_str(traceback.format_exc()))
              to_screen(u'ERROR: unable to overwrite current version')
              return
@@ -148,7 +160,7 @@ start /b "" cmd /c del "%%~f0"&exit /b"
              urlh = compat_urllib_request.urlopen(version['bin'][0])
              newcontent = urlh.read()
              urlh.close()
-        except (IOError, OSError) as err:
+        except (IOError, OSError):
              if verbose: to_screen(compat_str(traceback.format_exc()))
              to_screen(u'ERROR: unable to download latest version')
              return
@@ -161,7 +173,7 @@ start /b "" cmd /c del "%%~f0"&exit /b"
          try:
              with open(filename, 'wb') as outf:
                  outf.write(newcontent)
-        except (IOError, OSError) as err:
+        except (IOError, OSError):
              if verbose: to_screen(compat_str(traceback.format_exc()))
              to_screen(u'ERROR: unable to overwrite current version')
              return
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py

index 1d9785341ec685071ea8fcc4846029a3e889bc72..c486ef8ecfef9772aaabdb3863a2814349a296b7 100644 (file)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -8,13 +8,16 @@ import gzip
  import io
  import json
  import locale
+import math
  import os
  import pipes
  import platform
  import re
+import ssl
  import socket
  import sys
  import traceback
+import xml.etree.ElementTree
  import zlib
  
  try:
@@ -535,17 +538,43 @@ def formatSeconds(secs):
      else:
          return '%d' % secs
  
-def make_HTTPS_handler(opts):
-    if sys.version_info < (3,2):
-        # Python's 2.x handler is very simplistic
-        return compat_urllib_request.HTTPSHandler()
+def make_HTTPS_handler(opts_no_check_certificate):
+    """Return an HTTPS handler built on compat_urllib_request.HTTPSHandler.
+
+    opts_no_check_certificate is only read on Python >= 3.2, where a true
+    value selects ssl.CERT_NONE instead of ssl.CERT_REQUIRED.
+    """
+    if sys.version_info < (3, 2):
+        import httplib
+
+        # Connection that tries SSLv3 first and falls back to SSLv23
+        class HTTPSConnectionV3(httplib.HTTPSConnection):
+            def __init__(self, *args, **kwargs):
+                httplib.HTTPSConnection.__init__(self, *args, **kwargs)
+
+            def connect(self):
+                sock = socket.create_connection((self.host, self.port), self.timeout)
+                # With a tunnel host set, run the tunnel setup on the plain
+                # socket before it is wrapped
+                if self._tunnel_host:
+                    self.sock = sock
+                    self._tunnel()
+                try:
+                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
+                except ssl.SSLError:
+                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
+
+        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
+            def https_open(self, req):
+                return self.do_open(HTTPSConnectionV3, req)
+        return HTTPSHandlerV3()
      else:
-        import ssl
-        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
+        # SSLv3-only context; verification follows the argument
+        context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
          context.set_default_verify_paths()
          
          context.verify_mode = (ssl.CERT_NONE
-                               if opts.no_check_certificate
+                               if opts_no_check_certificate
                                 else ssl.CERT_REQUIRED)
          return compat_urllib_request.HTTPSHandler(context=context)
  
@@ -734,6 +754,8 @@ def unified_strdate(date_str):
          '%Y/%m/%d %H:%M:%S',
          '%d.%m.%Y %H:%M',
          '%Y-%m-%dT%H:%M:%SZ',
+        '%Y-%m-%dT%H:%M:%S.%fZ',
+        '%Y-%m-%dT%H:%M:%S.%f0Z',
          '%Y-%m-%dT%H:%M:%S',
      ]
      for expression in format_expressions:
@@ -949,7 +971,20 @@ class locked_file(object):
  
  
  def shell_quote(args):
-    return ' '.join(map(pipes.quote, args))
+    """Return args joined into one shell-quoted command line string.
+
+    Byte strings are decoded with the filesystem encoding (falling back to
+    UTF-8 when it is unknown) before being quoted.
+    """
+    fs_encoding = sys.getfilesystemencoding()
+    if fs_encoding is None:
+        fs_encoding = 'utf-8'
+
+    def _to_text(arg):
+        # We may get a filename encoded with 'encodeFilename'
+        return arg.decode(fs_encoding) if isinstance(arg, bytes) else arg
+
+    return u' '.join(pipes.quote(_to_text(arg)) for arg in args)
  
  
  def takewhile_inclusive(pred, seq):
@@ -976,3 +1007,25 @@ def unsmuggle_url(smug_url):
      jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
      data = json.loads(jsond)
      return url, data
+
+
+def format_bytes(bytes):
+    """Return a human-readable size string such as u'1.50KiB'.
+
+    bytes may be None (returns u'N/A'), a number, or a numeric string.
+    The value is scaled by powers of 1024 and shown with two decimals.
+    """
+    if bytes is None:
+        return u'N/A'
+    if isinstance(bytes, str):
+        bytes = float(bytes)
+    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
+    if bytes == 0.0:
+        exponent = 0
+    else:
+        exponent = int(math.log(bytes, 1024.0))
+        # Keep the index inside the suffix list: values below one byte stay
+        # in u'B' and values of 1024 YiB or more stay in u'YiB'
+        exponent = max(0, min(exponent, len(suffixes) - 1))
+    converted = float(bytes) / float(1024 ** exponent)
+    return u'%.2f%s' % (converted, suffixes[exponent])
diff --git a/youtube_dl/version.py b/youtube_dl/version.py

index 338e7ba1ff83fc5375b91c54f2119588a72df9a6..68b30bfd4a4ec455f3dad230e1fc30c353d807ca 100644 (file)
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,2 +1,2 @@
  
-__version__ = '2013.11.11'
+__version__ = '2013.12.04'
author	Rogério Brito <rbrito@ime.usp.br>
	Thu, 5 Dec 2013 03:41:08 +0000 (01:41 -0200)
committer	Rogério Brito <rbrito@ime.usp.br>
	Thu, 5 Dec 2013 03:41:08 +0000 (01:41 -0200)
README.md		patch \| blob \| history
README.txt		patch \| blob \| history
devscripts/bash-completion.in		patch \| blob \| history
setup.py		patch \| blob \| history
test/helper.py		patch \| blob \| history
test/parameters.json		patch \| blob \| history
test/test_age_restriction.py		patch \| blob \| history
test/test_all_urls.py		patch \| blob \| history
test/test_download.py		patch \| blob \| history
test/test_playlists.py		patch \| blob \| history
test/test_subtitles.py		patch \| blob \| history
test/test_utils.py		patch \| blob \| history
test/test_write_annotations.py		patch \| blob \| history
test/test_write_info_json.py		patch \| blob \| history
test/test_youtube_lists.py		patch \| blob \| history
test/test_youtube_signature.py		patch \| blob \| history
youtube-dl		patch \| blob \| history
youtube-dl.1		patch \| blob \| history
youtube-dl.bash-completion		patch \| blob \| history
youtube_dl/FileDownloader.py		patch \| blob \| history
youtube_dl/PostProcessor.py		patch \| blob \| history
youtube_dl/YoutubeDL.py		patch \| blob \| history
youtube_dl/__init__.py		patch \| blob \| history
youtube_dl/extractor/__init__.py		patch \| blob \| history
youtube_dl/extractor/anitube.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/appletrailers.py		patch \| blob \| history
youtube_dl/extractor/archiveorg.py		patch \| blob \| history
youtube_dl/extractor/arte.py		patch \| blob \| history
youtube_dl/extractor/auengine.py		patch \| blob \| history
youtube_dl/extractor/bambuser.py		patch \| blob \| history
youtube_dl/extractor/bandcamp.py		patch \| blob \| history
youtube_dl/extractor/brightcove.py		patch \| blob \| history
youtube_dl/extractor/canalplus.py		patch \| blob \| history
youtube_dl/extractor/clipfish.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/clipsyndicate.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/cnn.py		patch \| blob \| history
youtube_dl/extractor/collegehumor.py		patch \| blob \| history
youtube_dl/extractor/comedycentral.py		patch \| blob \| history
youtube_dl/extractor/common.py		patch \| blob \| history
youtube_dl/extractor/d8.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/dailymotion.py		patch \| blob \| history
youtube_dl/extractor/daum.py		patch \| blob \| history
youtube_dl/extractor/dreisat.py		patch \| blob \| history
youtube_dl/extractor/ebaumsworld.py		patch \| blob \| history
youtube_dl/extractor/eighttracks.py		patch \| blob \| history
youtube_dl/extractor/escapist.py		patch \| blob \| history
youtube_dl/extractor/facebook.py		patch \| blob \| history
youtube_dl/extractor/faz.py		patch \| blob \| history
youtube_dl/extractor/fktv.py		patch \| blob \| history
youtube_dl/extractor/francetv.py		patch \| blob \| history
youtube_dl/extractor/gamekings.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/gamespot.py		patch \| blob \| history
youtube_dl/extractor/gametrailers.py		patch \| blob \| history
youtube_dl/extractor/generic.py		patch \| blob \| history
youtube_dl/extractor/howcast.py		patch \| blob \| history
youtube_dl/extractor/imdb.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/internetvideoarchive.py		patch \| blob \| history
youtube_dl/extractor/jeuxvideo.py		patch \| blob \| history
youtube_dl/extractor/justintv.py		patch \| blob \| history
youtube_dl/extractor/livestream.py		patch \| blob \| history
youtube_dl/extractor/metacritic.py		patch \| blob \| history
youtube_dl/extractor/mixcloud.py		patch \| blob \| history
youtube_dl/extractor/mtv.py		patch \| blob \| history
youtube_dl/extractor/myspass.py		patch \| blob \| history
youtube_dl/extractor/naver.py		patch \| blob \| history
youtube_dl/extractor/nbc.py		patch \| blob \| history
youtube_dl/extractor/nhl.py		patch \| blob \| history
youtube_dl/extractor/niconico.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/podomatic.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/pornhub.py		patch \| blob \| history
youtube_dl/extractor/redtube.py		patch \| blob \| history
youtube_dl/extractor/rtlnow.py		patch \| blob \| history
youtube_dl/extractor/sina.py		patch \| blob \| history
youtube_dl/extractor/smotri.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/soundcloud.py		patch \| blob \| history
youtube_dl/extractor/southparkstudios.py		patch \| blob \| history
youtube_dl/extractor/spankwire.py		patch \| blob \| history
youtube_dl/extractor/spiegel.py		patch \| blob \| history
youtube_dl/extractor/streamcloud.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/sztvhu.py		patch \| blob \| history
youtube_dl/extractor/teamcoco.py		patch \| blob \| history
youtube_dl/extractor/ted.py		patch \| blob \| history
youtube_dl/extractor/toutv.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/trilulilu.py		patch \| blob \| history
youtube_dl/extractor/tube8.py		patch \| blob \| history
youtube_dl/extractor/tvp.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/vevo.py		patch \| blob \| history
youtube_dl/extractor/viddler.py		patch \| blob \| history
youtube_dl/extractor/videofyme.py		patch \| blob \| history
youtube_dl/extractor/videopremium.py		patch \| blob \| history
youtube_dl/extractor/viki.py	[new file with mode: 0644]	patch \| blob
youtube_dl/extractor/vimeo.py		patch \| blob \| history
youtube_dl/extractor/vine.py		patch \| blob \| history
youtube_dl/extractor/xhamster.py		patch \| blob \| history
youtube_dl/extractor/xtube.py		patch \| blob \| history
youtube_dl/extractor/yahoo.py		patch \| blob \| history
youtube_dl/extractor/youtube.py		patch \| blob \| history
youtube_dl/extractor/zdf.py		patch \| blob \| history
youtube_dl/update.py		patch \| blob \| history
youtube_dl/utils.py		patch \| blob \| history
youtube_dl/version.py		patch \| blob \| history